Update brancheneinstufung.py

This commit is contained in:
2025-03-30 11:15:10 +00:00
parent dc72a8104e
commit 4504cc6d6f

View File

@@ -1,4 +1,4 @@
# Neue Version mit Wikipedia-Validierung, GPT-Schutz und Antwortlogging
# Neue Version mit Wikipedia-Validierung, GPT-Schutz und Antwortlogging inkl. Retry
import os
import time
@@ -12,6 +12,7 @@ from bs4 import BeautifulSoup
import requests
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime
from openai.error import OpenAIError
# === KONFIGURATION ===
EXCEL = "Bestandsfirmen.xlsx"
@@ -98,7 +99,6 @@ system_prompt = {
)
}
# === WIKIPEDIA FUNKTION ===
WHITELIST_KATEGORIEN = ["Unternehmen", "Hersteller", "Produktion", "Industrie", "Maschinenbau", "Technik", "Dienstleistungsunternehmen"]
def get_wikipedia_data(name, website_hint=""):
@@ -156,27 +156,38 @@ def get_wikipedia_data(name, website_hint=""):
continue
return "", "k.A.", "k.A."
# === GPT BEWERTUNG ===
def classify_company(row, wikipedia_url=""):
    """Classify one company row with the chat model and log the raw answer.

    Builds a semicolon-separated user prompt from ``row[0]``, ``row[1]``,
    ``row[2]``, ``row[4]`` and ``row[5]`` plus the Wikipedia link, sends it
    together with the module-level ``system_prompt`` and extracts the first
    semicolon-separated line of the answer that is not the
    ``wikipedia-branche`` header line.

    The API call is attempted up to three times; between failed attempts the
    function waits 10 seconds. If every attempt fails, or the answer does not
    contain exactly eight fields, all eight fields are set to ``"k.A."``.

    Every call appends one row to ``LOG_CSV``: timestamp, ``row[0]``, the
    eight parsed fields and the full raw answer text.

    Args:
        row: Indexable record; indices 0, 1, 2, 4 and 5 are read.
        wikipedia_url: Link added to the prompt (may be empty).

    Returns:
        A list of exactly eight strings.
    """
    user_prompt = {
        "role": "user",
        "content": f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]}\nWikipedia-Link: {wikipedia_url}"
    }

    max_attempts = 3
    retry_delay_seconds = 10

    # NOTE(review): `OpenAIError` comes from the file-level import
    # `from openai.error import OpenAIError`, while the call below uses the
    # `openai.chat.completions` client style. Confirm that both are provided
    # by the installed openai version.
    for attempt in range(1, max_attempts + 1):
        try:
            response = openai.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[system_prompt, user_prompt],
                temperature=0
            )
            # The message content may be missing; treat that as an empty
            # answer so it falls through to the 'k.A.' defaults below.
            full_text = (response.choices[0].message.content or "").strip()
            break
        except OpenAIError as e:
            print(f"⚠️ GPT-Fehler bei Versuch {attempt}: {e}")
            # Waiting only makes sense if another attempt follows.
            if attempt < max_attempts:
                time.sleep(retry_delay_seconds)
    else:
        # Loop finished without `break`: every attempt raised.
        print("❌ GPT 3x fehlgeschlagen setze alles auf 'k.A.'")
        full_text = "k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A."

    # First line that looks like data (contains ';') and is not the header.
    csv_line = next(
        (
            line
            for line in full_text.splitlines()
            if ";" in line and not line.lower().startswith("wikipedia-branche")
        ),
        "",
    )
    parts = [v.strip().strip('"') for v in csv_line.split(";")] if csv_line else ["k.A."] * 8

    if len(parts) != 8:
        print("⚠️ Antwort unvollständig → Setze alles auf 'k.A.'")
        parts = ["k.A."] * 8

    with open(LOG_CSV, "a", newline="", encoding="utf-8") as log:
        writer = csv.writer(log, delimiter=";")
        writer.writerow([datetime.now().strftime("%Y-%m-%d %H:%M:%S"), row[0], *parts, full_text])

    return parts
# === VERARBEITUNG ===