"""Enrich company rows in a Google Sheet with Wikipedia and GPT-derived data.

For each not-yet-processed row the script:
  1. searches German Wikipedia for the company and validates the hit,
  2. scrapes Branche/Umsatz from the article's infobox,
  3. asks GPT for a semicolon-separated classification,
  4. writes the combined result back into columns G:P of the sheet.
"""

import os
import time
import csv
import re
import gspread
import openai
import wikipedia
from bs4 import BeautifulSoup
import requests
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime

# === CONFIGURATION ===
EXCEL = "Bestandsfirmen.xlsx"          # NOTE(review): currently unused — kept for compatibility
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
CREDENTIALS = "service_account.json"
LANG = "de"
LOG_CSV = "gpt_antworten_log.csv"      # NOTE(review): currently unused — kept for compatibility
MAX_RETRIES = 3
RETRY_DELAY = 5       # seconds to wait between retries / processed rows
HTTP_TIMEOUT = 15     # seconds; hard cap for outbound HTTP requests

# German category keywords that mark a Wikipedia category as company-related.
WHITELIST_KATEGORIEN = [
    "unternehmen", "hersteller", "produktion", "industrie", "maschinenbau",
    "technik", "dienstleistung", "chemie", "pharma", "elektro", "medizin",
    "bau", "energie", "logistik", "automobil", "handel", "textil", "klima"
]


# === HELPERS ===
def extract_domain(url):
    """Return the first registrable host label of *url* (e.g. "acme" for
    "https://www.acme.de/contact"); empty string for unusable input."""
    if not url or not isinstance(url, str):
        return ""
    if not url.startswith("http"):
        url = f"https://{url}"
    host = url.split("//")[-1].split("/")[0]
    # BUGFIX: a leading "www." used to be returned as the "domain key",
    # which made the content validation below useless for most websites.
    if host.startswith("www."):
        host = host[4:]
    return host.split(".")[0]


def validate_wikipedia_content(content, name, domain):
    """Heuristic check that a Wikipedia article text belongs to the company.

    Accepts the article if the website's domain key or one of the first two
    words of the company name appears in the article text.
    """
    if not content or not name:
        return False
    text = content.lower()
    name_fragments = name.lower().split()[:2]
    domain_check = bool(domain) and domain.lower() in text
    name_check = any(frag in text for frag in name_fragments)
    return domain_check or name_check


def parse_infobox(soup):
    """Extract (branche, umsatz) from a Wikipedia infobox.

    Returns empty strings for fields that cannot be found. Revenue is only
    extracted when the value mentions "Mio" (millions).
    """
    branche = umsatz = ""
    if not soup:
        return branche, umsatz
    for row in soup.find_all("tr"):
        th = row.find("th")
        td = row.find("td")
        if not th or not td:
            continue
        header = th.get_text(strip=True).lower()
        value = td.get_text(separator=" ", strip=True)
        # Detect industry
        if any(s in header for s in ["branche", "industrie", "tätigkeitsfeld"]):
            branche = value
        # Detect revenue
        if "umsatz" in header and "mio" in value.lower():
            match = re.search(r"(\d{1,3}(?:[.,]\d{3})*(?:[.,]\d+)?)", value)
            if match:
                number = match.group(1)
                # BUGFIX: German notation groups thousands with "." and uses
                # "," as decimal mark. The old code only swapped "," -> "."
                # and turned "1.234,5" into the garbage "1.234.5".
                if "," in number:
                    number = number.replace(".", "").replace(",", ".")
                umsatz = number
    return branche, umsatz


def get_wikipedia_data(name, website):
    """Return (url, branche, umsatz) for the best validated Wikipedia hit.

    Falls back to whitelist categories when the infobox has no industry,
    and to ("", "k.A.", "k.A.") when nothing usable is found.
    """
    if not name:
        return "", "k.A.", "k.A."
    domain = extract_domain(website) if website else ""
    for attempt in range(MAX_RETRIES):
        try:
            results = wikipedia.search(name, results=3)
            for title in results:
                try:
                    page = wikipedia.page(title, auto_suggest=False)
                    if not validate_wikipedia_content(page.content, name, domain):
                        continue
                    # BUGFIX: the page fetch had no timeout and could hang forever.
                    html = requests.get(page.url, timeout=HTTP_TIMEOUT).text
                    soup = BeautifulSoup(html, "html.parser")
                    branche, umsatz = parse_infobox(soup)
                    # Fall back to whitelisted categories
                    if not branche:
                        for cat in page.categories:
                            if any(kw in cat.lower() for kw in WHITELIST_KATEGORIEN):
                                branche = cat
                                break
                    return page.url, branche or "k.A.", umsatz or "k.A."
                except (wikipedia.exceptions.PageError,
                        wikipedia.exceptions.DisambiguationError):
                    continue
            # Search succeeded but no candidate validated — retrying the exact
            # same search would only return the same misses, so stop here.
            return "", "k.A.", "k.A."
        except Exception as e:
            print(f"⚠️ Wikipedia-Fehler ({name}): {str(e)[:100]}")
            time.sleep(RETRY_DELAY)
    return "", "k.A.", "k.A."


def query_gpt(row, wiki_url):
    """Ask GPT for the eight semicolon-separated classification fields.

    Returns an all-"k.A." placeholder string on malformed input or when all
    retries fail, so downstream splitting always works.
    """
    if not row or len(row) < 6:
        return "k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A."
    user_content = f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]}\nWikipedia: {wiki_url}"
    system_prompt = {
        "role": "system",
        "content": (
            "Du bist ein Experte für Brancheneinstufung. Beantworte ausschließlich "
            "basierend auf den gegebenen Unternehmensdaten. Format: "
            "Wikipedia-Branche;LinkedIn-Branche;Umsatz (Mio €);Empfohlene Neueinstufung;"
            "Begründung;FSM-Relevanz;Techniker-Einschätzung;Techniker-Begründung"
        )
    }
    for attempt in range(MAX_RETRIES):
        try:
            response = openai.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[system_prompt, {"role": "user", "content": user_content}],
                temperature=0,
                timeout=15
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            print(f"⚠️ GPT-Fehler (Versuch {attempt+1}): {str(e)[:100]}")
            time.sleep(RETRY_DELAY)
    print("❌ GPT-Abfrage fehlgeschlagen")
    return "k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A."


# === MAIN ===
def main():
    """Run the enrichment loop over the configured Google Sheet."""
    durchläufe = int(input("Wieviele Zeilen sollen überprüft werden? "))

    # --- OpenAI init ---
    with open("api_key.txt", "r") as f:
        openai.api_key = f.read().strip()

    # --- Google Sheets ---
    scope = ["https://www.googleapis.com/auth/spreadsheets"]
    creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, scope)
    client = gspread.authorize(creds)
    sheet = client.open_by_url(SHEET_URL).sheet1
    sheet_values = sheet.get_all_values()

    # --- Wikipedia ---
    wikipedia.set_lang(LANG)

    # First data row (header skipped) whose column N (index 13) is still empty.
    # BUGFIX: next() with a default never raises StopIteration, so the old
    # try/except around this expression was dead code.
    start_index = next(
        (i for i, row in enumerate(sheet_values[1:], start=1)
         if len(row) > 13 and not row[13].strip()),
        1
    )

    # The range is already bounded by len(sheet_values); no extra break needed.
    for i in range(start_index, min(start_index + durchläufe, len(sheet_values))):
        row = sheet_values[i]

        # BUGFIX: rows without a company name used to crash on row[0].
        if not row or not row[0].strip():
            print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Überspringe leere Zeile {i+1}")
            continue

        print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {row[0]}")

        # Fetch Wikipedia data
        wiki_url, wiki_branche, wiki_umsatz = get_wikipedia_data(
            row[0], row[1] if len(row) > 1 else ""
        )

        # GPT query
        gpt_response = query_gpt(row, wiki_url)
        gpt_data = [x.strip('"').strip() for x in gpt_response.split(";")]
        gpt_data += ["k.A."] * (8 - len(gpt_data))  # pad so indexing below is safe

        # Wikipedia values only count when a validated article was found
        final_branche = wiki_branche if wiki_url else "k.A."
        final_umsatz = wiki_umsatz if wiki_url else "k.A."

        # Update Google Sheet
        update_values = [
            final_branche,   # G: Wikipedia-Branche
            gpt_data[1],     # H: LinkedIn-Branche
            final_umsatz,    # I: Umsatz
            gpt_data[3],     # J: Neueinstufung
            gpt_data[4],     # K: Begründung
            gpt_data[5],     # L: FSM-Relevanz
            wiki_url,        # M: Wikipedia-URL
            datetime.now().strftime("%Y-%m-%d %H:%M:%S"),  # N: Zeitstempel
            gpt_data[6],     # O: Techniker-Einschätzung
            gpt_data[7]      # P: Techniker-Begründung
        ]

        try:
            sheet.update(
                range_name=f"G{i+1}:P{i+1}",
                values=[update_values]
            )
            print(f"✅ Aktualisiert: {update_values[:3]}...")
        except Exception as e:
            print(f"⚠️ Google Sheets Update fehlgeschlagen: {str(e)[:100]}")

        time.sleep(RETRY_DELAY)  # crude rate limiting between rows

    print("\n✅ Prozess erfolgreich abgeschlossen")


if __name__ == "__main__":
    main()