# Brancheneinstufung2/brancheneinstufung.py

# Neue Version mit Wikipedia-Validierung, GPT-Schutz und Antwortlogging

import os
import time
import csv
import pandas as pd
import gspread
import openai
import wikipedia
from bs4 import BeautifulSoup
import requests
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime

# === CONFIGURATION ===
# Name of a local Excel workbook (not referenced in the visible part of this script).
EXCEL = "Bestandsfirmen.xlsx"
# Google Sheet that this script reads from and writes its results back to.
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
# Service-account key file used to authorize the gspread client.
CREDENTIALS = "service_account.json"
# Language code handed to wikipedia.set_lang().
LANG = "de"
# CSV file to which every GPT answer (parsed fields plus raw text) is appended.
LOG_CSV = "gpt_antworten_log.csv"
# Upper bound on the number of sheet rows handled in this run. Asked
# interactively at start-up; non-numeric input raises ValueError.
DURCHLÄUFE = int(input("Wieviele Zeilen sollen überprüft werden? "))

# === LOAD OpenAI API KEY ===
# The key lives in a local text file; surrounding whitespace and the trailing
# newline are removed before it is handed to the OpenAI client.
with open("api_key.txt", "r") as key_file:
    api_key_text = key_file.read()
openai.api_key = api_key_text.strip()

# === GOOGLE SHEET CONNECTION ===
# Authorize with the service account, open the spreadsheet by URL and take a
# one-time snapshot of every cell of its first worksheet.
scope = ["https://www.googleapis.com/auth/spreadsheets"]
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, scope)
gs_client = gspread.authorize(creds)
spreadsheet = gs_client.open_by_url(SHEET_URL)
sheet = spreadsheet.sheet1
sheet_values = sheet.get_all_values()

# === FIND START INDEX (column N = index 13) ===
# Column N holds the processing timestamp, so the first data row whose column N
# is blank is the first row that still has to be processed.
# `start` is a 0-based index into `sheet_values` (index 0 is the header row);
# the matching 1-based sheet row number is `start + 1`.
filled_n = [row[13] if len(row) > 13 else '' for row in sheet_values[1:]]
# enumerate(..., start=1) already yields the `sheet_values` index of each data
# row (the header was sliced off), so it is used unchanged; adding 1 on top
# would skip the first unprocessed row. If every row is filled, `start` points
# one past the last data row and the processing loop has nothing to do.
start = next(
    (idx for idx, cell in enumerate(filled_n, start=1) if not str(cell).strip()),
    len(filled_n) + 1,
)
print(f"Starte bei Zeile {start+1}")

wikipedia.set_lang(LANG)

# === SYSTEM PROMPT ===
# Target industry schema; each entry becomes one line at the end of the prompt.
branches = [
    # (abridged for readability – insert the full list as before)
]

# The prompt text is assembled from consecutive segments; they are joined
# without any separator, so every segment carries its own trailing newlines.
_prompt_segments = [
    "Du bist ein Experte für Brancheneinstufung und FSM-Potenzialbewertung. ",
    "Bitte beziehe dich ausschließlich auf das konkret genannte Unternehmen. Ähnlich klingende Firmennamen dürfen nicht verwendet werden.\n",
    "FSM steht für Field Service Management – Software zur Planung und Unterstützung mobiler Techniker.\n",
    "Ziel ist es, Unternehmen mit >50 Technikern im Außeneinsatz zu identifizieren.\n\n",
    "Struktur: Firmenname; Website; Ort; Aktuelle Einstufung; Beschreibung der Branche Extern\n\n",
    "Gib deine Antwort im CSV-Format (1 Zeile, 8 Spalten, durch Semikolon getrennt):\n",
    "Wikipedia-Branche;LinkedIn-Branche;Umsatz (Mio €);Empfohlene Neueinstufung;Begründung;FSM-Relevanz (Ja/Nein/k.A. mit Begründung);Techniker-Einschätzung;Techniker-Begründung\n\n",
    "Ziel-Branchenschema:\n",
    "\n".join(branches),
]

# Chat message with the "system" role that precedes every user request.
system_prompt = dict(role="system", content="".join(_prompt_segments))

# === LOAD WIKIPEDIA DATA ===
def get_wikipedia_data(name, website_hint=""):
    """Look up a company on Wikipedia and scrape its industry and revenue.

    Search terms are tried in this order: the full name, its first two words
    and, if ``website_hint`` contains a dot, the second dot-separated part of
    the hint (``"www.example.com"`` -> ``"example"``). A page is only accepted
    when the first word of ``name`` occurs in the page title, which guards
    against similarly named articles.

    Args:
        name: Company name to search for.
        website_hint: Optional website of the company; used to derive one
            additional search term.

    Returns:
        A tuple ``(url, branche, umsatz)``:
        ``url`` is the URL of the accepted page, ``branche`` is the text of the
        infobox row whose header contains "Branche" (falling back to the first
        page category), and ``umsatz`` is the text of the infobox row whose
        header contains "Umsatz". Unknown values are ``"k.A."``. When no
        acceptable page is found the result is ``("", "k.A.", "k.A.")``.

    Any exception raised while looking up or scraping a search term counts as
    a miss for that term; the function itself does not raise for lookup
    failures.
    """
    not_found = ("", "k.A.", "k.A.")

    words = name.split()
    if not words:
        # Without a first word no page title can ever be validated, so skip
        # the network round-trips entirely.
        return not_found
    first_word = words[0].lower()

    candidates = [name.strip(), " ".join(words[:2])]
    # Only derive a term from the website when it really has a second
    # dot-separated part; a hint without a dot used to raise IndexError.
    hint_parts = website_hint.split(".") if website_hint else []
    if len(hint_parts) > 1:
        candidates.append(hint_parts[1])
    # Drop blanks and duplicates (short names yield the same term twice) while
    # keeping the original priority order.
    search_terms = list(dict.fromkeys(term for term in candidates if term))

    for suchbegriff in search_terms:
        try:
            page = wikipedia.page(suchbegriff, auto_suggest=False)
            if first_word not in page.title.lower():
                continue
            url = page.url
            # A timeout keeps one unresponsive request from stalling the run.
            html = requests.get(url, timeout=10).text
            soup = BeautifulSoup(html, 'html.parser')
            infobox = soup.find("table", {"class": "infobox"})
            branche = umsatz = ""
            if infobox:
                for row in infobox.find_all("tr"):
                    th, td = row.find("th"), row.find("td")
                    if not th or not td:
                        continue
                    if "Branche" in th.text:
                        branche = td.text.strip()
                    if "Umsatz" in th.text:
                        umsatz = td.text.strip()
            if not branche:
                # No industry in the infobox: fall back to the first category.
                cats = page.categories
                branche = cats[0] if cats else "k.A."
            return url, branche or "k.A.", umsatz or "k.A."
        except Exception:
            # Best-effort lookup: try the next search term. Catching Exception
            # (not a bare except) lets Ctrl+C and SystemExit propagate.
            continue
    return not_found

# === GPT EVALUATION ===
def _parse_gpt_answer(full_text):
    """Return the 8 answer fields found in a GPT reply, or [] if there are none.

    Every line that contains a semicolon and is not the repeated column header
    is tried in order. A line is parsed as semicolon-delimited CSV first, so a
    quoted field may itself contain semicolons; if that does not yield exactly
    8 fields, a plain split on ";" is tried as well.
    """
    for raw_line in full_text.splitlines():
        candidate = raw_line.strip()
        if ";" not in candidate or candidate.lower().startswith("wikipedia-branche"):
            continue
        try:
            fields = next(csv.reader([candidate], delimiter=";"), [])
        except csv.Error:
            fields = []
        if len(fields) != 8:
            fields = candidate.split(";")
        if len(fields) == 8:
            return [field.strip().strip('"') for field in fields]
    return []


def classify_company(row):
    """Ask GPT to classify one company and return the 8 answer fields.

    Args:
        row: One sheet row. Columns 0, 1, 2, 4 and 5 are sent to the model as
            a semicolon-separated line, in the order announced under
            "Struktur" in the system prompt.

    Returns:
        A list of 8 strings in the column order requested by the system
        prompt. If the reply contains no line with exactly 8 fields, every
        entry is ``"k.A."``.

    Side effects:
        Appends the company name, the 8 fields and the raw reply to the CSV
        file named by ``LOG_CSV``.
    """
    user_prompt = {
        "role": "user",
        "content": f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]}"
    }
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[system_prompt, user_prompt],
        temperature=0
    )
    # The message content is optional in the API response; treat a missing
    # content like an empty reply instead of failing on None.strip().
    full_text = (response.choices[0].message.content or "").strip()
    parts = _parse_gpt_answer(full_text)
    if len(parts) != 8:
        print("⚠️  Antwort unvollständig → Setze alles auf 'k.A.'")
        parts = ["k.A."] * 8
    # Log the raw reply as well so that rejected answers can be inspected later.
    with open(LOG_CSV, "a", newline="", encoding="utf-8") as log:
        writer = csv.writer(log, delimiter=";")
        writer.writerow([row[0], *parts, full_text])
    return parts

# === PROCESSING ===
def _first_known(primary, fallback):
    """Return ``primary`` unless it is blank or the "k.A." placeholder, else ``fallback``."""
    text = str(primary).strip()
    return primary if text and text != "k.A." else fallback


# Process at most DURCHLÄUFE rows, beginning at `start` (an index into
# `sheet_values`) and never running past the last row of the snapshot.
for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
    row = sheet_values[i]
    # `sheet_values` is 0-based while sheet rows are 1-based, so the data read
    # from index i lives in sheet row i + 1. The results must be written to
    # that same row, not one row further down.
    sheet_row = i + 1
    print(f"[{time.strftime('%H:%M:%S')}] Verarbeite Zeile {sheet_row}: {row[0]}")

    url, wiki_branche, umsatz = get_wikipedia_data(row[0], row[1])
    wiki, linkedin, umsatz_chat, new_cat, reason, fsm, techniker, techniker_reason = classify_company(row)

    # Ten values for columns G..P. The GPT answer wins, but "k.A." counts as
    # "unknown" so that the scraped Wikipedia value can still fill the gap
    # (a plain `or` never falls back because "k.A." is a truthy string).
    values = [
        _first_known(wiki, wiki_branche),
        linkedin,
        _first_known(umsatz_chat, umsatz),
        new_cat,
        reason,
        fsm,
        url,
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),  # lands in column N
        techniker,
        techniker_reason
    ]

    sheet.update(range_name=f"G{sheet_row}:P{sheet_row}", values=[values])
    # Pause between rows to go easy on the remote APIs.
    time.sleep(5)

print("✅ Durchläufe abgeschlossen")