Deepseek V2
This commit is contained in:
@@ -40,37 +40,31 @@ WHITELIST_KATEGORIEN = [
|
|||||||
"logistik", "automobil", "handel", "textil", "klima"
|
"logistik", "automobil", "handel", "textil", "klima"
|
||||||
]
|
]
|
||||||
|
|
||||||
# === SYSTEM PROMPT ===
|
|
||||||
branches = [...] # Branchenliste hier einfügen
|
|
||||||
|
|
||||||
system_prompt = {
|
|
||||||
"role": "system",
|
|
||||||
"content": (
|
|
||||||
"Du bist ein Experte für Brancheneinstufung. Beantworte ausschließlich "
|
|
||||||
"basierend auf den gegebenen Unternehmensdaten. Format: "
|
|
||||||
"Wikipedia-Branche;LinkedIn-Branche;Umsatz (Mio €);Empfohlene Neueinstufung;"
|
|
||||||
"Begründung;FSM-Relevanz;Techniker-Einschätzung;Techniker-Begründung"
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
# === HELFERFUNKTIONEN ===
|
# === HELFERFUNKTIONEN ===
|
||||||
def extract_domain(url):
    """Extract the domain key (first host label) from a URL.

    The key is the left-most label of the host name after an optional
    scheme and an optional leading ``www.`` have been removed, e.g.
    ``"https://www.example.com/about"`` -> ``"example"``.

    Args:
        url: URL or bare host name. A missing scheme is tolerated.

    Returns:
        The domain key, or ``""`` when ``url`` is empty or not a string.
    """
    if not url or not isinstance(url, str):
        return ""

    candidate = url.strip()
    # Drop the scheme. Splitting only once keeps a later "//" inside the
    # path from being mistaken for the scheme separator.
    if "//" in candidate:
        candidate = candidate.split("//", 1)[1]

    host = candidate.split("/", 1)[0]
    # "www" is not a meaningful key: without this, every "www."-prefixed
    # site would yield the key "www".
    if host.lower().startswith("www."):
        host = host[4:]

    return host.split(".")[0]
|
|
||||||
def validate_wikipedia_content(content, name, domain):
    """Check whether an article's text appears to belong to the company.

    The article is accepted when its text contains (case-insensitively)
    either the domain key or at least one of the first two
    whitespace-separated words of the company name.

    Args:
        content: Article text to inspect.
        name: Company name.
        domain: Domain key of the company website; may be empty or None.

    Returns:
        True if the domain key or a name fragment occurs in ``content``,
        otherwise False. Always False when ``content`` or ``name`` is empty.
    """
    if not content or not name:
        return False

    # Lowercase once instead of once per fragment.
    haystack = content.lower()
    name_fragments = name.lower().split()[:2]

    domain_check = bool(domain) and domain.lower() in haystack
    name_check = any(frag in haystack for frag in name_fragments)
    return bool(domain_check or name_check)
|
|
||||||
def parse_infobox(soup):
|
def parse_infobox(soup):
|
||||||
"""Extrahiert Branche und Umsatz aus der Infobox"""
|
"""Extrahiert Branche und Umsatz aus der Infobox"""
|
||||||
branche = umsatz = ""
|
branche = umsatz = ""
|
||||||
|
|
||||||
|
if not soup:
|
||||||
|
return branche, umsatz
|
||||||
|
|
||||||
for row in soup.find_all("tr"):
|
for row in soup.find_all("tr"):
|
||||||
th = row.find("th")
|
th = row.find("th")
|
||||||
td = row.find("td")
|
td = row.find("td")
|
||||||
@@ -95,6 +89,9 @@ def parse_infobox(soup):
|
|||||||
|
|
||||||
def get_wikipedia_data(name, website):
|
def get_wikipedia_data(name, website):
|
||||||
"""Holt validierte Wikipedia-Daten"""
|
"""Holt validierte Wikipedia-Daten"""
|
||||||
|
if not name:
|
||||||
|
return "", "k.A.", "k.A."
|
||||||
|
|
||||||
domain = extract_domain(website) if website else ""
|
domain = extract_domain(website) if website else ""
|
||||||
|
|
||||||
for attempt in range(MAX_RETRIES):
|
for attempt in range(MAX_RETRIES):
|
||||||
@@ -118,7 +115,7 @@ def get_wikipedia_data(name, website):
|
|||||||
|
|
||||||
return page.url, branche or "k.A.", umsatz or "k.A."
|
return page.url, branche or "k.A.", umsatz or "k.A."
|
||||||
|
|
||||||
except wikipedia.exceptions.PageError:
|
except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError):
|
||||||
continue
|
continue
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"⚠️ Wikipedia-Fehler ({name}): {str(e)[:100]}")
|
print(f"⚠️ Wikipedia-Fehler ({name}): {str(e)[:100]}")
|
||||||
@@ -128,8 +125,21 @@ def get_wikipedia_data(name, website):
|
|||||||
|
|
||||||
def query_gpt(row, wiki_url):
|
def query_gpt(row, wiki_url):
|
||||||
"""Verarbeitet die GPT-Abfrage mit verbessertem Error-Handling"""
|
"""Verarbeitet die GPT-Abfrage mit verbessertem Error-Handling"""
|
||||||
|
if not row or len(row) < 6:
|
||||||
|
return "k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A."
|
||||||
|
|
||||||
user_content = f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]}\nWikipedia: {wiki_url}"
|
user_content = f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]}\nWikipedia: {wiki_url}"
|
||||||
|
|
||||||
|
system_prompt = {
|
||||||
|
"role": "system",
|
||||||
|
"content": (
|
||||||
|
"Du bist ein Experte für Brancheneinstufung. Beantworte ausschließlich "
|
||||||
|
"basierend auf den gegebenen Unternehmensdaten. Format: "
|
||||||
|
"Wikipedia-Branche;LinkedIn-Branche;Umsatz (Mio €);Empfohlene Neueinstufung;"
|
||||||
|
"Begründung;FSM-Relevanz;Techniker-Einschätzung;Techniker-Begründung"
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
for attempt in range(MAX_RETRIES):
|
for attempt in range(MAX_RETRIES):
|
||||||
try:
|
try:
|
||||||
response = openai.chat.completions.create(
|
response = openai.chat.completions.create(
|
||||||
@@ -147,18 +157,25 @@ def query_gpt(row, wiki_url):
|
|||||||
return "k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A."
|
return "k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A."
|
||||||
|
|
||||||
# === HAUPTLOGIK ===
|
# === HAUPTLOGIK ===
|
||||||
# Resume at the first data row (the header row at index 0 is skipped) whose
# cell at index 13 is missing or blank. A row shorter than 14 cells has no
# value there, so it counts as unprocessed instead of raising IndexError.
# NOTE(review): presumably index 13 is sheet column N and the rows can be
# shorter than the sheet width -- confirm against how sheet_values is loaded.
# The default of 1 makes next() fall back to the first data row when every
# row is already filled, so no StopIteration handling is needed.
start_index = next(
    (
        i
        for i, row in enumerate(sheet_values[1:], start=1)
        if len(row) <= 13 or not row[13].strip()
    ),
    1,
)
|
|
||||||
for i in range(start_index, min(start_index + DURCHLÄUFE, len(sheet_values))):
|
for i in range(start_index, min(start_index + DURCHLÄUFE, len(sheet_values))):
|
||||||
|
if i >= len(sheet_values):
|
||||||
|
break
|
||||||
|
|
||||||
row = sheet_values[i]
|
row = sheet_values[i]
|
||||||
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {row[0]}")
|
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {row[0]}")
|
||||||
|
|
||||||
# Wikipedia-Daten holen
|
# Wikipedia-Daten holen
|
||||||
wiki_url, wiki_branche, wiki_umsatz = get_wikipedia_data(row[0], row[1])
|
wiki_url, wiki_branche, wiki_umsatz = get_wikipedia_data(row[0], row[1] if len(row) > 1 else "")
|
||||||
|
|
||||||
# GPT-Abfrage
|
# GPT-Abfrage
|
||||||
gpt_response = query_gpt(row, wiki_url)
|
gpt_response = query_gpt(row, wiki_url)
|
||||||
gpt_data = [x.strip('"') for x in gpt_response.split(";")][:8]
|
gpt_data = [x.strip('"').strip() for x in gpt_response.split(";")]
|
||||||
|
gpt_data += ["k.A."] * (8 - len(gpt_data)) # Sicherstellen dass wir 8 Werte haben
|
||||||
|
|
||||||
# Finale Werte
|
# Finale Werte
|
||||||
final_branche = wiki_branche if wiki_url else "k.A."
|
final_branche = wiki_branche if wiki_url else "k.A."
|
||||||
@@ -178,11 +195,15 @@ for i in range(start_index, min(start_index + DURCHLÄUFE, len(sheet_values))):
|
|||||||
gpt_data[7] # P: Techniker-Begründung
|
gpt_data[7] # P: Techniker-Begründung
|
||||||
]
|
]
|
||||||
|
|
||||||
|
try:
|
||||||
sheet.update(
|
sheet.update(
|
||||||
range_name=f"G{i+1}:P{i+1}",
|
range_name=f"G{i+1}:P{i+1}",
|
||||||
values=[update_values]
|
values=[update_values]
|
||||||
)
|
)
|
||||||
print(f"✅ Aktualisiert: {update_values[:3]}...")
|
print(f"✅ Aktualisiert: {update_values[:3]}...")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"⚠️ Google Sheets Update fehlgeschlagen: {str(e)[:100]}")
|
||||||
|
|
||||||
time.sleep(RETRY_DELAY)
|
time.sleep(RETRY_DELAY)
|
||||||
|
|
||||||
print("\n✅ Prozess erfolgreich abgeschlossen")
|
print("\n✅ Prozess erfolgreich abgeschlossen")
|
||||||
Reference in New Issue
Block a user