Update brancheneinstufung.py
This commit is contained in:
@@ -1,4 +1,4 @@
|
|||||||
# Neue Version mit Wikipedia-Validierung, GPT-Schutz und Antwortlogging
|
# Neue Version mit Wikipedia-Validierung, GPT-Schutz und Antwortlogging inkl. Retry
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
@@ -12,6 +12,7 @@ from bs4 import BeautifulSoup
|
|||||||
import requests
|
import requests
|
||||||
from oauth2client.service_account import ServiceAccountCredentials
|
from oauth2client.service_account import ServiceAccountCredentials
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from openai.error import OpenAIError
|
||||||
|
|
||||||
# === KONFIGURATION ===
|
# === KONFIGURATION ===
|
||||||
EXCEL = "Bestandsfirmen.xlsx"
|
EXCEL = "Bestandsfirmen.xlsx"
|
||||||
@@ -98,7 +99,6 @@ system_prompt = {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
# === WIKIPEDIA FUNKTION ===
|
|
||||||
WHITELIST_KATEGORIEN = ["Unternehmen", "Hersteller", "Produktion", "Industrie", "Maschinenbau", "Technik", "Dienstleistungsunternehmen"]
|
WHITELIST_KATEGORIEN = ["Unternehmen", "Hersteller", "Produktion", "Industrie", "Maschinenbau", "Technik", "Dienstleistungsunternehmen"]
|
||||||
|
|
||||||
def get_wikipedia_data(name, website_hint=""):
|
def get_wikipedia_data(name, website_hint=""):
|
||||||
@@ -156,27 +156,38 @@ def get_wikipedia_data(name, website_hint=""):
|
|||||||
continue
|
continue
|
||||||
return "", "k.A.", "k.A."
|
return "", "k.A.", "k.A."
|
||||||
|
|
||||||
# === GPT BEWERTUNG ===
|
|
||||||
def classify_company(row, wikipedia_url=""):
    """Classify one company record via the OpenAI chat API and log the answer.

    A user message is built from columns 0, 1, 2, 4 and 5 of ``row`` plus the
    Wikipedia link and sent together with the module-level ``system_prompt``.
    The first reply line that contains a semicolon (and does not start with
    the header text "wikipedia-branche") is split into fields.

    Args:
        row: Indexable record; indices 0, 1, 2, 4 and 5 are read, and
            ``row[0]`` is also written to the log.
        wikipedia_url: Wikipedia link appended to the prompt.

    Returns:
        A list of exactly eight strings. All entries are ``"k.A."`` when
        every API attempt failed or the reply did not yield eight fields.

    Side effects:
        Appends one row (timestamp, ``row[0]``, the eight fields, raw reply
        text) to the ``LOG_CSV`` file and prints retry diagnostics.
    """
    expected_fields = 8
    max_attempts = 3
    retry_delay_seconds = 10
    fallback = ["k.A."] * expected_fields

    user_prompt = {
        "role": "user",
        "content": f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]}\nWikipedia-Link: {wikipedia_url}"
    }

    for attempt in range(1, max_attempts + 1):
        try:
            response = openai.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[system_prompt, user_prompt],
                temperature=0
            )
        # Use the package-level export: the `openai.error` submodule is not
        # available in the client generation that offers
        # `openai.chat.completions`, while `openai.OpenAIError` is.
        except openai.OpenAIError as e:
            print(f"⚠️ GPT-Fehler bei Versuch {attempt}: {e}")
            # Waiting only makes sense if another attempt follows.
            if attempt < max_attempts:
                time.sleep(retry_delay_seconds)
            continue
        # `content` may be None (e.g. an empty completion); treat it as an
        # empty reply so the fallback below applies instead of crashing.
        full_text = (response.choices[0].message.content or "").strip()
        break
    else:
        # Loop finished without `break`: every attempt raised.
        print("❌ GPT 3x fehlgeschlagen – setze alles auf 'k.A.'")
        full_text = ";".join(fallback)

    # Pick the first data line, skipping a possible header line.
    csv_line = next(
        (
            line for line in full_text.splitlines()
            if ";" in line and not line.lower().startswith("wikipedia-branche")
        ),
        "",
    )
    parts = [v.strip().strip('"') for v in csv_line.split(";")]

    # An empty or malformed answer never has exactly eight fields.
    if len(parts) != expected_fields:
        parts = list(fallback)

    with open(LOG_CSV, "a", newline="", encoding="utf-8") as log:
        writer = csv.writer(log, delimiter=";")
        writer.writerow([datetime.now().strftime("%Y-%m-%d %H:%M:%S"), row[0], *parts, full_text])

    return parts
|
||||||
|
|
||||||
# === VERARBEITUNG ===
|
# === VERARBEITUNG ===
|
||||||
|
|||||||
Reference in New Issue
Block a user