diff --git a/brancheneinstufung.py b/brancheneinstufung.py
index 4deee19c..20352e54 100644
--- a/brancheneinstufung.py
+++ b/brancheneinstufung.py
@@ -1,209 +1,192 @@
+# Schritt 1: Wikipedia-Daten extrahieren und ins Google Sheet schreiben, Schritt 2: GPT-Bewertung
+
 import os
 import time
-import csv
 import re
 import gspread
-import openai
 import wikipedia
-from bs4 import BeautifulSoup
 import requests
+import openai
+import csv
+from bs4 import BeautifulSoup
 from oauth2client.service_account import ServiceAccountCredentials
 from datetime import datetime

 # === KONFIGURATION ===
-EXCEL = "Bestandsfirmen.xlsx"
-SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
-CREDENTIALS = "service_account.json"
 LANG = "de"
-LOG_CSV = "gpt_antworten_log.csv"
+CREDENTIALS = "service_account.json"
+SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
 DURCHLÄUFE = int(input("Wieviele Zeilen sollen überprüft werden? "))
 MAX_RETRIES = 3
 RETRY_DELAY = 5
+LOG_CSV = "gpt_antworten_log.csv"

-# === OpenAI INIT ===
+# === OpenAI API-KEY LADEN ===
 with open("api_key.txt", "r") as f:
     openai.api_key = f.read().strip()

-# === GOOGLE SHEETS ===
+# === GOOGLE SHEET VERBINDUNG ===
 scope = ["https://www.googleapis.com/auth/spreadsheets"]
 creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, scope)
-client = gspread.authorize(creds)
-sheet = client.open_by_url(SHEET_URL).sheet1
+sheet = gspread.authorize(creds).open_by_url(SHEET_URL).sheet1
 sheet_values = sheet.get_all_values()

-# === WIKIPEDIA KONFIG ===
+# === STARTINDEX SUCHEN (Spalte N = Index 13) ===
+filled_n = [row[13] if len(row) > 13 else '' for row in sheet_values[1:]]
+# enumerate(..., start=1) liefert bereits den Index in sheet_values (Kopfzeile = Index 0);
+# ein zusätzliches +1 würde die erste unbearbeitete Zeile überspringen
+start = next((i for i, v in enumerate(filled_n, start=1) if not str(v).strip()), len(filled_n) + 1)
+print(f"Starte bei Zeile {start+1}")
+
 wikipedia.set_lang(LANG)
+
+# === DOMAIN SCHLÜSSEL ===
+def extract_domain_key(url):
+    if not url:
+        return ""
+    clean_url = url.replace("https://", "").replace("http://", "").split("/")[0]
+    # "www."-Präfix entfernen, sonst wäre der Schlüssel für die meisten Websites "www"
+    if clean_url.startswith("www."):
+        clean_url = clean_url[4:]
+    parts = clean_url.split(".")
+    return parts[0] if len(parts) > 1 else ""
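+# Beispiel (hypothetische URL):
+#   extract_domain_key("https://www.musterfirma.de/kontakt") -> "musterfirma"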
+
+# === INFOBOX-PARSING ===
+def parse_infobox(soup):
+    infobox = soup.find("table", class_=["infobox", "infobox vcard"])
+    branche = umsatz = ""
+    if infobox:
+        for row in infobox.find_all("tr"):
+            th, td = row.find("th"), row.find("td")
+            if not th or not td:
+                continue
+            if "branche" in th.text.lower():
+                branche = td.get_text(separator=" ", strip=True)
+            if "umsatz" in th.text.lower():
+                umsatz_text = td.get_text(strip=True)
+                if "Mio" in umsatz_text:
+                    match = re.search(r"(\d+[\d.,]*)\s*Mio", umsatz_text)
+                    if match:
+                        umsatz = match.group(1).replace(",", ".")
+    return branche, umsatz
+
+# === WIKIPEDIA DATEN ===
 WHITELIST_KATEGORIEN = [
     "unternehmen", "hersteller", "produktion", "industrie", "maschinenbau", "technik",
     "dienstleistung", "chemie", "pharma", "elektro", "medizin", "bau", "energie",
-    "logistik", "automobil", "handel", "textil", "klima"
+    "logistik", "automobil"
 ]

-# === HELFERFUNKTIONEN ===
-def extract_domain(url):
-    """Extrahiert den Domain-Schlüssel aus der URL"""
-    if not url or not isinstance(url, str):
-        return ""
-    if not url.startswith("http"):
-        url = f"https://{url}"
-    return url.split("//")[-1].split("/")[0].split(".")[0]
-
-def validate_wikipedia_content(content, name, domain):
-    """Prüft ob der Artikel zum Unternehmen gehört"""
-    if not content or not name:
-        return False
-    name_fragments = name.lower().split()[:2]
-    domain_check = domain and domain.lower() in content.lower()
-    name_check = any(frag in content.lower() for frag in name_fragments)
-    return domain_check or name_check
-
-def parse_infobox(soup):
-    """Extrahiert Branche und Umsatz aus der Infobox"""
-    branche = umsatz = ""
-
-    if not soup:
-        return branche, umsatz
-
-    for row in soup.find_all("tr"):
-        th = row.find("th")
-        td = row.find("td")
-        if not th or not td:
+def get_wikipedia_data(name, website_hint=""):
+    domain_key = extract_domain_key(website_hint)
+    search_terms = [name, domain_key] if domain_key else [name]
+    for term in search_terms:
+        if not term:
             continue
-
-        header = th.get_text(strip=True).lower()
-        value = td.get_text(separator=" ", strip=True)
-
-        # Branche erkennen
-        if any(s in header for s in ["branche", "industrie", "tätigkeitsfeld"]):
-            branche = value
-
-        # Umsatz erkennen
-        if "umsatz" in header:
-            if "mio" in value.lower():
-                match = re.search(r"(\d{1,3}(?:[.,]\d{3})*(?:[.,]\d+)?)", value)
-                if match:
-                    umsatz = match.group(1).replace(",", ".")
-
-    return branche, umsatz
-
-def get_wikipedia_data(name, website):
-    """Holt validierte Wikipedia-Daten"""
-    if not name:
-        return "", "k.A.", "k.A."
-
-    domain = extract_domain(website) if website else ""
-
-    for attempt in range(MAX_RETRIES):
-        try:
-            results = wikipedia.search(name, results=3)
-            for title in results:
-                try:
-                    page = wikipedia.page(title, auto_suggest=False)
-                    if not validate_wikipedia_content(page.content, name, domain):
+        for attempt in range(MAX_RETRIES):
+            try:
+                results = wikipedia.search(term, results=3)
+                for title in results:
+                    try:
+                        page = wikipedia.page(title, auto_suggest=False)
+                        html = requests.get(page.url, timeout=10).text
+                        if name.split()[0].lower() in page.content.lower() or (domain_key and domain_key.lower() in html.lower()):
+                            soup = BeautifulSoup(html, "html.parser")
+                            branche, umsatz = parse_infobox(soup)
+                            if not branche:
+                                for category in page.categories:
+                                    if any(kw in category.lower() for kw in WHITELIST_KATEGORIEN):
+                                        branche = category
+                                        break
+                            return page.url, branche or "k.A.", umsatz or "k.A."
+                    except Exception:
                         continue
-
-                    soup = BeautifulSoup(requests.get(page.url).text, "html.parser")
-                    branche, umsatz = parse_infobox(soup)
-
-                    # Fallback auf Kategorien
-                    if not branche:
-                        for cat in page.categories:
-                            if any(kw in cat.lower() for kw in WHITELIST_KATEGORIEN):
-                                branche = cat
-                                break
-
-                    return page.url, branche or "k.A.", umsatz or "k.A."
-
-                except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError):
-                    continue
-        except Exception as e:
-            print(f"⚠️ Wikipedia-Fehler ({name}): {str(e)[:100]}")
-            time.sleep(RETRY_DELAY)
-
+            except Exception as e:
+                print(f"⚠️ Wikipedia-Fehler ({term}, Versuch {attempt+1}): {str(e)[:100]}")
+                time.sleep(RETRY_DELAY)
     return "", "k.A.", "k.A."

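+# Beispiel-Rückgabe (hypothetische Firma):
+#   get_wikipedia_data("Musterfirma GmbH", "https://www.musterfirma.de")
+#   -> ("https://de.wikipedia.org/wiki/Musterfirma", "Maschinenbau", "120.5")
+# Ohne verwertbaren Treffer: ("", "k.A.", "k.A.")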
Format: " - "Wikipedia-Branche;LinkedIn-Branche;Umsatz (Mio €);Empfohlene Neueinstufung;" - "Begründung;FSM-Relevanz;Techniker-Einschätzung;Techniker-Begründung" - ) +# === SCHRITT 1: WIKIPEDIA VERARBEITUNG === +for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))): + row = sheet_values[i] + print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {row[0]}") + url, wiki_branche, umsatz = get_wikipedia_data(row[0], row[1]) + wiki_final = wiki_branche if url else "k.A." + umsatz_final = umsatz if url else "k.A." + values = [ + wiki_final, + "k.A.", # LinkedIn-Branche leer + umsatz_final, + "k.A.", "k.A.", "k.A.", + url, + datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "k.A.", "k.A." + ] + sheet.update(range_name=f"G{i+1}:P{i+1}", values=[values]) + print(f"✅ Aktualisiert: {values[:3]}...") + time.sleep(RETRY_DELAY) + +# === SCHRITT 2: GPT-BEWERTUNG === +def classify_company(row, wikipedia_url=""): + user_prompt = { + "role": "user", + "content": f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]}\nWikipedia-Link: {wikipedia_url}" } - for attempt in range(MAX_RETRIES): try: response = openai.chat.completions.create( model="gpt-3.5-turbo", - messages=[system_prompt, {"role": "user", "content": user_content}], + messages=[ + { + "role": "system", + "content": ( + "Du bist ein Experte für Brancheneinstufung und FSM-Potenzialbewertung.\n" + "Bitte beziehe dich ausschließlich auf das konkret genannte Unternehmen.\n" + "FSM steht für Field Service Management. Ziel ist es, Unternehmen mit >50 Technikern im Außendienst zu identifizieren.\n\n" + "Struktur: Firmenname; Website; Ort; Aktuelle Einstufung; Beschreibung der Branche Extern\n\n" + "Gib deine Antwort im CSV-Format zurück (1 Zeile, 8 Spalten):\n" + "Wikipedia-Branche;LinkedIn-Branche;Umsatz (Mio €);Empfohlene Neueinstufung;Begründung;FSM-Relevanz;Techniker-Einschätzung;Techniker-Begründung" + ) + }, + user_prompt + ], temperature=0, timeout=15 ) - return response.choices[0].message.content.strip() + full_text = response.choices[0].message.content.strip() + break except Exception as e: print(f"⚠️ GPT-Fehler (Versuch {attempt+1}): {str(e)[:100]}") time.sleep(RETRY_DELAY) - - print("❌ GPT-Abfrage fehlgeschlagen") - return "k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A." + else: + print("❌ GPT 3x fehlgeschlagen – Standardwerte") + full_text = "k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A." 
+# === SCHRITT 2 DURCHFÜHREN ===
+for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
     row = sheet_values[i]
-    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {row[0]}")
-
-    # Wikipedia-Daten holen
-    wiki_url, wiki_branche, wiki_umsatz = get_wikipedia_data(row[0], row[1] if len(row) > 1 else "")
-
-    # GPT-Abfrage
-    gpt_response = query_gpt(row, wiki_url)
-    gpt_data = [x.strip('"').strip() for x in gpt_response.split(";")]
-    gpt_data += ["k.A."] * (8 - len(gpt_data))  # Sicherstellen dass wir 8 Werte haben
-
-    # Finale Werte
-    final_branche = wiki_branche if wiki_url else "k.A."
-    final_umsatz = wiki_umsatz if wiki_url else "k.A."
-
-    # Google Sheet aktualisieren
-    update_values = [
-        final_branche,   # G: Wikipedia-Branche
-        gpt_data[1],     # H: LinkedIn-Branche
-        final_umsatz,    # I: Umsatz
-        gpt_data[3],     # J: Neueinstufung
-        gpt_data[4],     # K: Begründung
-        gpt_data[5],     # L: FSM-Relevanz
-        wiki_url,        # M: Wikipedia-URL
+    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] GPT-Bewertung für Zeile {i+1}: {row[0]}")
+    wiki_url = row[12] if len(row) > 12 else ""
+    wiki, linkedin, umsatz_chat, new_cat, reason, fsm, techniker, techniker_reason = classify_company(row, wikipedia_url=wiki_url)
+    values = [
+        wiki,
+        linkedin,
+        umsatz_chat,
+        new_cat,
+        reason,
+        fsm,
+        wiki_url,
         datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-        gpt_data[6],     # O: Techniker-Einschätzung
-        gpt_data[7]      # P: Techniker-Begründung
+        techniker,
+        techniker_reason
     ]
-
-    try:
-        sheet.update(
-            range_name=f"G{i+1}:P{i+1}",
-            values=[update_values]
-        )
-        print(f"✅ Aktualisiert: {update_values[:3]}...")
-    except Exception as e:
-        print(f"⚠️ Google Sheets Update fehlgeschlagen: {str(e)[:100]}")
-
+    sheet.update(range_name=f"G{i+1}:P{i+1}", values=[values])
    time.sleep(RETRY_DELAY)

-print("\n✅ Prozess erfolgreich abgeschlossen")
\ No newline at end of file
+print("\n✅ GPT-Bewertung abgeschlossen")
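+
+# Spaltenbelegung G–P (Schritt 1 und Schritt 2 schreiben denselben Bereich):
+# G Wikipedia-Branche | H LinkedIn-Branche | I Umsatz (Mio €) | J Neueinstufung | K Begründung
+# L FSM-Relevanz | M Wikipedia-URL | N Zeitstempel | O Techniker-Einschätzung | P Techniker-Begründung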