Deepseek V2
This commit is contained in:
@@ -40,37 +40,31 @@ WHITELIST_KATEGORIEN = [
|
||||
"logistik", "automobil", "handel", "textil", "klima"
|
||||
]
|
||||
|
||||
# === SYSTEM PROMPT ===

branches = [...]  # TODO: insert the list of industries here

# Static system prompt sent with every GPT request. It pins the answer to a
# semicolon-separated record of exactly eight fields so the response can be
# split positionally downstream (see gpt_response.split(";")).
system_prompt = {
    "role": "system",
    "content": (
        "Du bist ein Experte für Brancheneinstufung. Beantworte ausschließlich "
        "basierend auf den gegebenen Unternehmensdaten. Format: "
        "Wikipedia-Branche;LinkedIn-Branche;Umsatz (Mio €);Empfohlene Neueinstufung;"
        "Begründung;FSM-Relevanz;Techniker-Einschätzung;Techniker-Begründung"
    )
}
# === HELFERFUNKTIONEN ===
|
||||
def extract_domain(url):
    """Extract the domain key (first meaningful hostname label) from a URL.

    Args:
        url: A URL or bare hostname string; may be missing the scheme.

    Returns:
        The first hostname label with a leading "www." stripped
        (e.g. "https://www.example.com/x" -> "example"), or "" for
        empty / non-string input.
    """
    if not url or not isinstance(url, str):
        return ""
    # Normalize bare hostnames so the "//" split below works uniformly.
    if not url.startswith("http"):
        url = f"https://{url}"
    host = url.split("//")[-1].split("/")[0]
    # Without this, "www.example.com" would yield the useless key "www",
    # which can never match company names in validate_wikipedia_content.
    if host.startswith("www."):
        host = host[4:]
    return host.split(".")[0]
def validate_wikipedia_content(content, name, domain):
    """Check whether a Wikipedia article plausibly belongs to the company.

    Args:
        content: Article text to inspect.
        name: Company name; its first two whitespace-separated words are
            matched case-insensitively against the content.
        domain: Domain key of the company website (may be ""), also matched
            case-insensitively.

    Returns:
        True if either the domain or one of the leading name fragments
        occurs in the content; False for empty content or name.
    """
    if not content or not name:
        return False
    # Only the first two words of the name are used: suffixes such as
    # "GmbH" / "AG" rarely appear verbatim in article prose.
    name_fragments = name.lower().split()[:2]
    domain_check = domain and domain.lower() in content.lower()
    name_check = any(frag in content.lower() for frag in name_fragments)
    return domain_check or name_check
|
||||
def parse_infobox(soup):
|
||||
"""Extrahiert Branche und Umsatz aus der Infobox"""
|
||||
branche = umsatz = ""
|
||||
|
||||
if not soup:
|
||||
return branche, umsatz
|
||||
|
||||
for row in soup.find_all("tr"):
|
||||
th = row.find("th")
|
||||
td = row.find("td")
|
||||
@@ -95,6 +89,9 @@ def parse_infobox(soup):
|
||||
|
||||
def get_wikipedia_data(name, website):
|
||||
"""Holt validierte Wikipedia-Daten"""
|
||||
if not name:
|
||||
return "", "k.A.", "k.A."
|
||||
|
||||
domain = extract_domain(website) if website else ""
|
||||
|
||||
for attempt in range(MAX_RETRIES):
|
||||
@@ -118,7 +115,7 @@ def get_wikipedia_data(name, website):
|
||||
|
||||
return page.url, branche or "k.A.", umsatz or "k.A."
|
||||
|
||||
except wikipedia.exceptions.PageError:
|
||||
except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError):
|
||||
continue
|
||||
except Exception as e:
|
||||
print(f"⚠️ Wikipedia-Fehler ({name}): {str(e)[:100]}")
|
||||
@@ -128,8 +125,21 @@ def get_wikipedia_data(name, website):
|
||||
|
||||
def query_gpt(row, wiki_url):
|
||||
"""Verarbeitet die GPT-Abfrage mit verbessertem Error-Handling"""
|
||||
if not row or len(row) < 6:
|
||||
return "k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A."
|
||||
|
||||
user_content = f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]}\nWikipedia: {wiki_url}"
|
||||
|
||||
system_prompt = {
|
||||
"role": "system",
|
||||
"content": (
|
||||
"Du bist ein Experte für Brancheneinstufung. Beantworte ausschließlich "
|
||||
"basierend auf den gegebenen Unternehmensdaten. Format: "
|
||||
"Wikipedia-Branche;LinkedIn-Branche;Umsatz (Mio €);Empfohlene Neueinstufung;"
|
||||
"Begründung;FSM-Relevanz;Techniker-Einschätzung;Techniker-Begründung"
|
||||
)
|
||||
}
|
||||
|
||||
for attempt in range(MAX_RETRIES):
|
||||
try:
|
||||
response = openai.chat.completions.create(
|
||||
@@ -147,18 +157,25 @@ def query_gpt(row, wiki_url):
|
||||
return "k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A."
|
||||
|
||||
# === HAUPTLOGIK ===
# Resume point: first data row (skipping the header) whose column N
# (index 13) is empty or absent. Rows shorter than 14 cells count as
# "column N empty" instead of raising IndexError. next() already takes a
# default, so no StopIteration handling is needed; default to row 1 when
# every row is filled.
start_index = next(
    (
        i
        for i, row in enumerate(sheet_values[1:], start=1)
        if len(row) <= 13 or not row[13].strip()
    ),
    1,
)
for i in range(start_index, min(start_index + DURCHLÄUFE, len(sheet_values))):
|
||||
if i >= len(sheet_values):
|
||||
break
|
||||
|
||||
row = sheet_values[i]
|
||||
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {row[0]}")
|
||||
|
||||
# Wikipedia-Daten holen
|
||||
wiki_url, wiki_branche, wiki_umsatz = get_wikipedia_data(row[0], row[1])
|
||||
wiki_url, wiki_branche, wiki_umsatz = get_wikipedia_data(row[0], row[1] if len(row) > 1 else "")
|
||||
|
||||
# GPT-Abfrage
|
||||
gpt_response = query_gpt(row, wiki_url)
|
||||
gpt_data = [x.strip('"') for x in gpt_response.split(";")][:8]
|
||||
gpt_data = [x.strip('"').strip() for x in gpt_response.split(";")]
|
||||
gpt_data += ["k.A."] * (8 - len(gpt_data)) # Sicherstellen dass wir 8 Werte haben
|
||||
|
||||
# Finale Werte
|
||||
final_branche = wiki_branche if wiki_url else "k.A."
|
||||
@@ -178,11 +195,15 @@ for i in range(start_index, min(start_index + DURCHLÄUFE, len(sheet_values))):
|
||||
gpt_data[7] # P: Techniker-Begründung
|
||||
]
|
||||
|
||||
sheet.update(
|
||||
range_name=f"G{i+1}:P{i+1}",
|
||||
values=[update_values]
|
||||
)
|
||||
print(f"✅ Aktualisiert: {update_values[:3]}...")
|
||||
try:
|
||||
sheet.update(
|
||||
range_name=f"G{i+1}:P{i+1}",
|
||||
values=[update_values]
|
||||
)
|
||||
print(f"✅ Aktualisiert: {update_values[:3]}...")
|
||||
except Exception as e:
|
||||
print(f"⚠️ Google Sheets Update fehlgeschlagen: {str(e)[:100]}")
|
||||
|
||||
time.sleep(RETRY_DELAY)
|
||||
|
||||
print("\n✅ Prozess erfolgreich abgeschlossen")
|
||||
Reference in New Issue
Block a user