komplett neue Version mit optimiertem Prompt

2025-03-29 21:08:31 +00:00
parent ef59697442
commit aefa89ca15
1 changed files with 164 additions and 212 deletions
--- a/brancheneinstufung.py
+++ b/brancheneinstufung.py
@@ -1,3 +1,5 @@
+# Neue Version mit Token-Optimierung, festem Prompt und Begrenzung der Durchläufe
+
 import os
 import time
 import pandas as pd
@@ -9,49 +11,23 @@ import requests
 from oauth2client.service_account import ServiceAccountCredentials
 from datetime import datetime

-# === CONFIG ===
-EXCEL = "Bestandsfirmen.xlsx"
+# === CONFIGURATION ===
+EXCEL = "Bestandsfirmen.xlsx"  # optional, kept in case the results are exported to Excel later
 SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
 CREDENTIALS = "service_account.json"
-CHUNK = 10
 LANG = "de"
+DURCHLÄUFE = int(input("Wieviele Zeilen sollen überprüft werden? "))  # upper bound on sheet rows handled in this run; non-numeric input raises ValueError

-# === AUTHENTICATION ===
+# === GOOGLE SHEET CONNECTION ===
 scope = ["https://www.googleapis.com/auth/spreadsheets"]
 creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, scope)
 sheet = gspread.authorize(creds).open_by_url(SHEET_URL).sheet1
-
-# OpenAI API-Key aus externer Datei laden
-with open("api_key.txt", "r") as f:
-    openai.api_key = f.read().strip()
-
-# === LOAD DATA ===
-df = pd.read_excel(EXCEL)
-for col in ["Wikipedia-URL", "Wikipedia-Branche", "LinkedIn-Branche", "Umsatz (Mio €)",
-            "Empfohlene Neueinstufung", "Begründung Neueinstufung", "FSM-Relevanz", "Letzte Prüfung",
-            "Techniker-Einschätzung (Auto)", "Techniker-Einschätzung (Begründung)", "Techniker-Einschätzung (Manuell)"]:
-    if col not in df.columns:
-        df[col] = ""
-
-# === STARTE BEI ERSTER LEERER ZEILE IN SPALTE 'Letzte Prüfung' (Spalte N) ===
 sheet_values = sheet.get_all_values()
-filled_n = [row[13] if len(row) > 13 else '' for row in sheet_values[1:]]
-start = next((i + 1 for i, v in enumerate(filled_n, start=1) if not str(v).strip() or str(v).lower() == 'nan'), len(filled_n) + 1)
-print(f"Starte bei Zeile {start+1} (erste leere Zeile in Spalte N)")

-mapping_dict = {}
+# === WIKIPEDIA KONFIG ===
 wikipedia.set_lang(LANG)

-# === ÜBERSETZUNGSTABELLE VORBEREITEN ===
-sheet_trans_title = "Branchen-Mapping"
-try:
-    sheet_trans = sheet.spreadsheet.worksheet(sheet_trans_title)
-except gspread.exceptions.WorksheetNotFound:
-    sheet_trans = sheet.spreadsheet.add_worksheet(title=sheet_trans_title, rows="100", cols="3")
-sheet_trans.clear()
-sheet_trans.update(range_name="A1:B1", values=[["Wikipedia-Branche", "Ziel-Branchenschema"]])
-
-# === BRANCHENSCHEMA ===
+# === SYSTEM PROMPT ===
 branches = [
    "Hersteller / Produzenten > Maschinenbau",
    "Hersteller / Produzenten > Automobil",
@@ -98,33 +74,40 @@ branches = [
 system_prompt = {
     "role": "system",
     "content": (
-        "Du bist ein Experte für Brancheneinstufung und FSM-Potenzialbewertung. Nutze das folgende Ziel‑Branchenschema als Referenz:\n\n"
-        + "\n".join(branches)
+        "Du bist ein Experte für Brancheneinstufung und FSM-Potenzialbewertung. "  # fixed system prompt, sent unchanged with every request
+        "FSM steht für Field Service Management – Software zur Planung und Unterstützung mobiler Techniker. "
+        "Ziel ist es, Unternehmen zu identifizieren, die mehr als 50 Techniker im Außeneinsatz beschäftigen (z. B. Servicetechniker, Instandhalter, Medizintechniker etc.).\n\n"
+        "Dir liegt pro Unternehmen eine strukturierte Eingabezeile vor, bestehend aus:\nFirmenname; Website; Ort; Aktuelle Einstufung; Beschreibung der Branche Extern\n\n"
+        "Bitte führe für jede Firma eine fundierte Bewertung durch:\n"
+        "- Nimm eine Brancheneinstufung anhand des untenstehenden Ziel-Branchenschemas vor.\n"
+        "- Berücksichtige dabei alle vorliegenden Informationen (auch externe Beschreibung, Wikipedia, LinkedIn, Website) sowie die bisherige Einstufung.\n"
+        "- Wenn die bisherige Einstufung korrekt ist, bestätige sie – wenn nicht, schlage eine neue Einstufung vor und begründe diese.\n"
+        "- Gib zusätzlich an, ob das Unternehmen FSM-relevant ist (Ja / Nein / k.A. mit Begründung).\n"
+        "- Schätze die Anzahl mobiler Techniker anhand öffentlich verfügbarer Infos und gib eine Stufe an: <50 / >50 / >100 / >500, mit Begründung.\n\n"
+        "Antworte ausschließlich mit genau einer Zeile aus 8 durch Semikolon getrennten Feldern (keine Semikolons innerhalb der Felder):\nWikipedia-Branche; LinkedIn-Branche; Umsatz (Mio €); Empfohlene Neueinstufung; Begründung; FSM-Relevanz; Techniker-Einschätzung; Techniker-Begründung\n\n"  # output contract: classify_company splits the reply on ';' into these 8 positional fields
+        "Ziel-Branchenschema:\n" + "\n".join(branches)
     )
 }

-# === WIKIPEDIA LOOKUP ===
-def get_wikipedia_data(firmenname):
-    suchbegriffe = [firmenname.strip(), " ".join(firmenname.split()[:2])]
-    for suchbegriff in suchbegriffe:
+# === HILFSFUNKTIONEN ===
+def get_wikipedia_data(name):
+    for suchbegriff in [name.strip(), " ".join(name.split()[:2])]:
        try:
            page = wikipedia.page(suchbegriff, auto_suggest=False)
            url = page.url
            html = requests.get(url).text
            soup = BeautifulSoup(html, 'html.parser')
            infobox = soup.find("table", {"class": "infobox"})
-            branche = ""
-            umsatz = ""
+            branche = umsatz = ""
            if infobox:
                for row in infobox.find_all("tr"):
-                    header = row.find("th")
-                    data = row.find("td")
-                    if not header or not data:
+                    th, td = row.find("th"), row.find("td")
+                    if not th or not td:
                        continue
-                    if "Branche" in header.text:
-                        branche = data.text.strip()
-                    if "Umsatz" in header.text:
-                        umsatz = data.text.strip()
+                    if "Branche" in th.text:
+                        branche = td.text.strip()
+                    if "Umsatz" in th.text:
+                        umsatz = td.text.strip()
            if not branche:
                cats = page.categories
                branche = cats[0] if cats else ""
@@ -133,80 +116,49 @@ def get_wikipedia_data(firmenname):
            continue
    return "", "", ""

-# === KLASSIFIZIERUNG ===
 def classify_company(row):
     user_prompt = {
         "role": "user",
-        "content": (
-            "Bitte prüfe die vorliegenden Informationen zum Unternehmen. Gib die Antwort im CSV-Format zurück:\n"
-            "Wikipedia-Branche; LinkedIn-Branche; Umsatz (Mio €); Empfohlene Neueinstufung; Begründung; FSM-Relevanz (Ja/Nein/k.A. mit Begründung); Techniker-Einschätzung (<50/>50/>100/>500); Techniker-Begründung\n\n"
-            f"Beschreibung: {row['Beschreibung des Unternehmens'] or ''}\n"
-            f"Aktuelle Einstufung: {row['Aktuelle Einstufung'] or ''}\n"
-            f"Externe Branchenbeschreibung: {row['Beschreibung der Branche Extern'] or ''}\n"
-            f"Website: {row['Website'] or ''}"
-        )
+        "content": f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]}"  # compact ';'-joined input built from row positions 0, 1, 2, 4, 5 (position 3 is skipped); presumably the five fields listed in system_prompt - TODO confirm against the sheet layout
     }
-    resp = openai.chat.completions.create(
-        model="gpt-4",
+    response = openai.chat.completions.create(  # one chat request per row; all instructions come from the module-level system_prompt
+        model="gpt-3.5-turbo",
         messages=[system_prompt, user_prompt],
         temperature=0
     )
-    result = resp.choices[0].message.content.strip()
-    parts = [v.strip().strip('"') for v in result.split(";", 7)]
+    parts = [v.strip().strip('"') for v in response.choices[0].message.content.strip().split(";", 7)]  # at most 8 fields, each trimmed of whitespace and double quotes; shorter replies are padded with "k.A." below
     while len(parts) < 8:
         parts.append("k.A.")
     return parts

-# === LOOP ===
-for df_idx in range(start - 1, len(df)):
-    row = df.iloc[df_idx]
-    if str(row.get("Letzte Prüfung", "")).strip():
-        continue
+# === FIND START INDEX (index into sheet_values; index 0 is the header, so sheet row number = index + 1) ===
+filled = [row[11] if len(row) > 11 else '' for row in sheet_values[1:]]  # column L is the "already processed" marker; NOTE(review): the timestamp is written to column N (index 13) - confirm L is the intended marker
+start = next((i for i, v in enumerate(filled, start=1) if not v.strip()), len(sheet_values))  # first row with an empty marker; len(sheet_values) makes the loop below a no-op when every row is filled
+print(f"Starte bei Zeile {start+1}")

-    print(f"[{time.strftime('%H:%M:%S')}] Verarbeite Zeile {df_idx+1}: {row['Firmenname']}")
+# === PROCESSING: at most DURCHLÄUFE rows, beginning with the first unprocessed row ===
+for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
+    row = sheet_values[i]
+    print(f"[{time.strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {row[0]}")

-    url, wiki_branche, umsatz = get_wikipedia_data(row['Firmenname'])
-    df.at[df_idx, "Wikipedia-URL"] = url
-    df.at[df_idx, "Wikipedia-Branche"] = wiki_branche.strip('"')
-    if not df.at[df_idx, "Umsatz (Mio €)"]:
-        df.at[df_idx, "Umsatz (Mio €)"] = umsatz
+    url, wiki_branche, umsatz = get_wikipedia_data(row[0])
+    wiki, linkedin, umsatz_chat, new_cat, reason, fsm, techniker, techniker_reason = classify_company(row)

-    wiki, linkedin, umsatz_chat, new_cat, reason, fsm_relevant, techniker, techniker_reason = classify_company(row)
-    df.at[df_idx, "Wikipedia-Branche"] = wiki or wiki_branche
-    df.at[df_idx, "LinkedIn-Branche"] = linkedin
-    if not df.at[df_idx, "Umsatz (Mio €)"]:
-        df.at[df_idx, "Umsatz (Mio €)"] = umsatz_chat
-    df.at[df_idx, "Empfohlene Neueinstufung"] = new_cat
-
-    current_cat = str(row.get("Aktuelle Einstufung") or "").strip().strip('"')
-    if new_cat != current_cat:
-        df.at[df_idx, "Begründung Neueinstufung"] = reason
-    else:
-        df.at[df_idx, "Begründung Neueinstufung"] = ""
-
-    df.at[df_idx, "FSM-Relevanz"] = fsm_relevant
-    df.at[df_idx, "Techniker-Einschätzung (Auto)"] = techniker
-    df.at[df_idx, "Techniker-Einschätzung (Begründung)"] = techniker_reason
-
-    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    df.at[df_idx, "Letzte Prüfung"] = now
-
-    key = df.at[df_idx, "Wikipedia-Branche"]
-    val = df.at[df_idx, "Empfohlene Neueinstufung"]
-    if key and val and key not in mapping_dict:
-        mapping_dict[key] = val
-        sheet_trans.update(range_name=f"A{len(mapping_dict)+1}:B{len(mapping_dict)+1}", values=[[key, val]])
-
-    sheet.update(
-        values=[df.loc[df_idx, [
-            "Wikipedia-Branche", "LinkedIn-Branche", "Umsatz (Mio €)",
-            "Empfohlene Neueinstufung", "Begründung Neueinstufung",
-            "FSM-Relevanz", "Wikipedia-URL", "Letzte Prüfung",
-            "Techniker-Einschätzung (Auto)", "Techniker-Einschätzung (Begründung)"
-        ]].tolist()],
-        range_name=f"G{df_idx+2}:Q{df_idx+2}"
-    )
+    values = [
+        wiki or wiki_branche,
+        linkedin,
+        umsatz_chat or umsatz,
+        new_cat,
+        reason,
+        fsm,
+        url,
+        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+        techniker,
+        techniker_reason
+    ]

+    # Write the 10 values to columns G..P of the row that was read: sheet_values[i] is sheet row i+1
+    sheet.update(range_name=f"G{i+1}:P{i+1}", values=[values])
     time.sleep(5)

-print("✅ Fertig!")
+print("✅ Durchläufe abgeschlossen")