# Brancheneinstufung2/brancheneinstufung.py

import os
import time
import csv
import re
import gspread
import openai
import wikipedia
from bs4 import BeautifulSoup
import requests
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime

# === CONFIGURATION ===
# Local Excel workbook name; not referenced anywhere in the visible code.
EXCEL = "Bestandsfirmen.xlsx"
# Google Sheet that is read and written back to.
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
# Service-account key file used to authorise gspread.
CREDENTIALS = "service_account.json"
# Language code handed to wikipedia.set_lang().
LANG = "de"
# CSV log file name; not referenced anywhere in the visible code.
LOG_CSV = "gpt_antworten_log.csv"
# Number of sheet rows to process in this run, asked interactively as soon as
# the module is executed. A non-numeric answer makes int() raise ValueError.
DURCHLÄUFE = int(input("Wieviele Zeilen sollen überprüft werden? "))
# Number of attempts for a Wikipedia lookup or a GPT request.
MAX_RETRIES = 3
# Seconds to sleep after a failed attempt and after each processed row.
RETRY_DELAY = 5

# === OpenAI INIT ===
# Read the API key from a local text file (surrounding whitespace is stripped)
# and install it as the module-wide key of the openai package.
with open("api_key.txt", "r") as f:
    openai.api_key = f.read().strip()

# === GOOGLE SHEETS ===
# Authorise with the service-account credentials, open the first worksheet of
# the configured spreadsheet and load all cell values once up front.
# sheet_values[i] corresponds to sheet row i + 1; index 0 is skipped by the
# main loop (presumably the header row - confirm against the sheet).
scope = ["https://www.googleapis.com/auth/spreadsheets"]
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, scope)
client = gspread.authorize(creds)
sheet = client.open_by_url(SHEET_URL).sheet1
sheet_values = sheet.get_all_values()

# === WIKIPEDIA CONFIG ===
# All wikipedia.* calls below use the language edition selected here.
wikipedia.set_lang(LANG)
# Lower-case keywords; a Wikipedia category containing one of them is accepted
# as a fallback industry when the infobox provides none.
WHITELIST_KATEGORIEN = [
    "unternehmen", "hersteller", "produktion", "industrie",
    "maschinenbau", "technik", "dienstleistung", "chemie",
    "pharma", "elektro", "medizin", "bau", "energie",
    "logistik", "automobil", "handel", "textil", "klima"
]

# === SYSTEM PROMPT ===
# Placeholder: the list of industries is meant to be inserted here. As written
# it is a list holding the Ellipsis object and is not referenced by the prompt
# below or by any other visible code.
branches = [...]  # insert the list of industries here

# System message sent with every GPT request. It asks for a single
# semicolon-separated answer with eight fields; the main loop splits the
# answer on ";" and addresses the fields by position.
system_prompt = {
    "role": "system",
    "content": (
        "Du bist ein Experte für Brancheneinstufung. Beantworte ausschließlich "
        "basierend auf den gegebenen Unternehmensdaten. Format: "
        "Wikipedia-Branche;LinkedIn-Branche;Umsatz (Mio €);Empfohlene Neueinstufung;"
        "Begründung;FSM-Relevanz;Techniker-Einschätzung;Techniker-Begründung"
    )
}

# === HELPER FUNCTIONS ===
def extract_domain(url):
    """Return the first meaningful host label of a URL as a lookup key.

    A missing scheme is tolerated and surrounding whitespace is ignored.
    A leading ``www`` label is skipped, so ``www.example.com`` yields
    ``example`` rather than ``www`` (which would match almost any text).

    Args:
        url: Website address, with or without ``http(s)://``.

    Returns:
        The first host label after an optional ``www`` prefix, in its
        original case; an empty string if the URL has no host part.
    """
    url = url.strip()
    if not url.startswith("http"):
        url = f"https://{url}"

    host = url.split("//")[-1].split("/")[0]
    labels = host.split(".")

    # "www" carries no information about the company; use the next label.
    if len(labels) > 1 and labels[0].lower() == "www":
        labels = labels[1:]
    return labels[0]

def validate_wikipedia_content(content, name, domain):
    """Check whether a Wikipedia article plausibly belongs to the company.

    The article is accepted when its text contains one of the first two
    whitespace-separated words of the company name, or the domain key.
    All comparisons are case-insensitive.

    Args:
        content: Plain text of the Wikipedia article.
        name: Company name.
        domain: Domain key of the company website; may be empty.

    Returns:
        True if the article matches, otherwise False.
    """
    haystack = content.lower()
    name_fragments = name.lower().split()[:2]

    if any(frag in haystack for frag in name_fragments):
        return True
    # An empty domain must not count as a match ("" is in every string).
    return bool(domain) and domain.lower() in haystack

# A number directly followed by the unit "Mio", e.g. "1.234,5 Mio. EUR".
_UMSATZ_MIO_RE = re.compile(r"(\d[\d.,]*\d|\d)\s*mio", re.IGNORECASE)
# Any number; fallback when the unit precedes the figure ("Mio. EUR 86,8").
_ANY_NUMBER_RE = re.compile(r"\d[\d.,]*\d|\d")
# Integer with dot-separated thousands groups such as "1.234" or "12.345.678".
_DOT_GROUPED_RE = re.compile(r"\d{1,3}(?:\.\d{3})+")


def _normalize_german_number(raw):
    """Convert a German-formatted number ("1.234,5") to plain form ("1234.5")."""
    if "," in raw:
        # The last comma is the decimal separator; everything before it is
        # the integer part, possibly with thousands separators.
        integer, _, fraction = raw.rpartition(",")
        integer = integer.replace(".", "").replace(",", "")
        return f"{integer}.{fraction}"
    if _DOT_GROUPED_RE.fullmatch(raw):
        # Dots followed by exactly three digits are thousands separators.
        return raw.replace(".", "")
    return raw


def _extract_umsatz_mio(value):
    """Return the revenue figure of an infobox cell given in "Mio", or ""."""
    if "mio" not in value.lower():
        return ""
    # Prefer the figure attached to the unit so that a leading year or
    # footnote number is not mistaken for the revenue.
    match = _UMSATZ_MIO_RE.search(value)
    raw = match.group(1) if match else None
    if raw is None:
        fallback = _ANY_NUMBER_RE.search(value)
        raw = fallback.group(0) if fallback else None
    return _normalize_german_number(raw) if raw else ""


def parse_infobox(soup):
    """Extract industry and revenue from the table rows of a parsed page.

    Every ``<tr>`` that has both a ``<th>`` and a ``<td>`` is inspected.
    The first row whose header names an industry and the first revenue row
    with a parsable "Mio" figure win; later tables on the page cannot
    overwrite them.

    Args:
        soup: Parsed HTML document offering ``find_all("tr")`` whose rows
            offer ``find("th")`` / ``find("td")`` (a BeautifulSoup object).

    Returns:
        Tuple ``(branche, umsatz)``. ``umsatz`` is the figure in millions
        with "." as decimal separator and no thousands separators. Either
        element is an empty string when nothing was found.
    """
    branche = umsatz = ""

    for row in soup.find_all("tr"):
        th = row.find("th")
        td = row.find("td")
        if not th or not td:
            continue

        header = th.get_text(strip=True).lower()
        value = td.get_text(separator=" ", strip=True)

        # Detect the industry row.
        if not branche and any(
            s in header for s in ("branche", "industrie", "tätigkeitsfeld")
        ):
            branche = value

        # Detect the revenue row (only values given in millions).
        if not umsatz and "umsatz" in header:
            umsatz = _extract_umsatz_mio(value)

    return branche, umsatz

def get_wikipedia_data(name, website):
    """Fetch validated Wikipedia data for a company.

    Searches Wikipedia for ``name`` and takes the first of up to three hits
    whose text passes ``validate_wikipedia_content``. Industry and revenue
    come from the table rows of the article's HTML; if no industry is found
    there, the first article category containing a whitelisted keyword is
    used instead.

    Args:
        name: Company name used as the search term.
        website: Company website; may be empty.

    Returns:
        Tuple ``(url, branche, umsatz)``. Missing values are ``"k.A."``;
        if no article could be validated the result is
        ``("", "k.A.", "k.A.")``.
    """
    domain = extract_domain(website) if website else ""

    for attempt in range(MAX_RETRIES):
        try:
            for title in wikipedia.search(name, results=3):
                try:
                    page = wikipedia.page(title, auto_suggest=False)
                    if not validate_wikipedia_content(page.content, name, domain):
                        continue

                    # Bounded wait so a stalled connection cannot hang the run.
                    html = requests.get(page.url, timeout=15).text
                    branche, umsatz = parse_infobox(BeautifulSoup(html, "html.parser"))

                    # Fall back to the article categories.
                    if not branche:
                        branche = next(
                            (
                                cat
                                for cat in page.categories
                                if any(kw in cat.lower() for kw in WHITELIST_KATEGORIEN)
                            ),
                            "",
                        )

                    return page.url, branche or "k.A.", umsatz or "k.A."

                except (
                    wikipedia.exceptions.PageError,
                    wikipedia.exceptions.DisambiguationError,
                ):
                    # Missing or ambiguous title: try the next search hit
                    # instead of aborting the whole lookup.
                    continue

            # The search ran cleanly but no hit was usable; repeating the
            # same search would only return the same candidates.
            break
        except Exception as e:
            print(f"⚠️ Wikipedia-Fehler ({name}): {str(e)[:100]}")
            # No point in waiting when there is no further attempt.
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_DELAY)

    return "", "k.A.", "k.A."

def query_gpt(row, wiki_url):
    """Ask GPT for the classification of one sheet row, with retries.

    The user message consists of the row cells at indices 0, 1, 2, 4 and 5
    joined by ";" followed by the Wikipedia URL on a second line.

    Args:
        row: Cell values of the sheet row; needs at least six entries.
        wiki_url: URL of the validated Wikipedia article, or "".

    Returns:
        The stripped model answer. After ``MAX_RETRIES`` failed attempts a
        placeholder with eight ``"k.A."`` fields separated by ";" is
        returned, matching the eight fields requested in the system prompt.
    """
    user_content = f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]}\nWikipedia: {wiki_url}"

    for attempt in range(MAX_RETRIES):
        try:
            response = openai.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[system_prompt, {"role": "user", "content": user_content}],
                temperature=0,
                timeout=15
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            print(f"⚠️ GPT-Fehler (Versuch {attempt+1}): {str(e)[:100]}")
            # Only wait when another attempt follows.
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_DELAY)

    print("❌ GPT-Abfrage fehlgeschlagen")
    return ";".join(["k.A."] * 8)

# === MAIN LOGIC ===
# Index of the timestamp cell (column N) within a row; the update below
# writes the timestamp as the 8th value of the range G..P.
TIMESTAMP_COL = 13
# Number of ";"-separated fields requested from GPT in the system prompt.
GPT_FIELD_COUNT = 8

# Resume at the first data row without a timestamp. Rows shorter than the
# timestamp column count as unprocessed instead of raising IndexError.
# If every row already has a timestamp, processing starts again at row
# index 1 (original behaviour, kept as is).
start_index = next(
    (
        i
        for i, row in enumerate(sheet_values[1:], start=1)
        if len(row) <= TIMESTAMP_COL or not row[TIMESTAMP_COL].strip()
    ),
    1,
)

for i in range(start_index, min(start_index + DURCHLÄUFE, len(sheet_values))):
    row = sheet_values[i]
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {row[0]}")

    # Fetch Wikipedia data.
    wiki_url, wiki_branche, wiki_umsatz = get_wikipedia_data(row[0], row[1])

    # Query GPT and split the answer into its positional fields.
    gpt_response = query_gpt(row, wiki_url)
    gpt_data = [x.strip('"') for x in gpt_response.split(";")][:GPT_FIELD_COUNT]
    # A short answer must not crash the run; fill missing fields with "k.A.".
    gpt_data += ["k.A."] * (GPT_FIELD_COUNT - len(gpt_data))

    # Final values: Wikipedia figures only count when an article was found.
    final_branche = wiki_branche if wiki_url else "k.A."
    final_umsatz = wiki_umsatz if wiki_url else "k.A."

    # Update the Google Sheet (columns G..P of the current row).
    update_values = [
        final_branche,        # G: Wikipedia industry
        gpt_data[1],          # H: LinkedIn industry
        final_umsatz,         # I: revenue
        gpt_data[3],          # J: recommended reclassification
        gpt_data[4],          # K: justification
        gpt_data[5],          # L: FSM relevance
        wiki_url,             # M: Wikipedia URL
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),  # N: timestamp
        gpt_data[6],          # O: technician estimate
        gpt_data[7]           # P: technician justification
    ]

    sheet.update(
        range_name=f"G{i+1}:P{i+1}",
        values=[update_values]
    )
    print(f"✅ Aktualisiert: {update_values[:3]}...")
    # Pause between rows.
    time.sleep(RETRY_DELAY)

print("\n✅ Prozess erfolgreich abgeschlossen")