# Step 1: extract Wikipedia data for each company and write it to the Google Sheet.
# Step 2: let GPT classify each company and write the assessment back to the sheet.
import csv
import os
import re
import time
from datetime import datetime
from difflib import SequenceMatcher

import gspread
import openai
import requests
import wikipedia
from bs4 import BeautifulSoup
from oauth2client.service_account import ServiceAccountCredentials

# === CONFIGURATION ===
VERSION = "1.0.8"
LANG = "de"
CREDENTIALS = "service_account.json"
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
DURCHLÄUFE = int(input("Wieviele Zeilen sollen überprüft werden? "))
MAX_RETRIES = 3
RETRY_DELAY = 5
LOG_CSV = "gpt_antworten_log.csv"
# Timeout (seconds) for fetching the Wikipedia article HTML, so one hanging
# request cannot stall the whole batch.
HTTP_TIMEOUT = 15
# Number of semicolon-separated fields the GPT answer is expected to contain.
GPT_FIELD_COUNT = 8
# Placeholder written wherever no value is available ("keine Angabe").
UNKNOWN = "k.A."

SYSTEM_PROMPT = (
    "Du bist ein Experte für Brancheneinstufung und FSM-Potenzialbewertung.\n"
    "Bitte beziehe dich ausschließlich auf das konkret genannte Unternehmen.\n"
    "FSM steht für Field Service Management. Ziel ist es, Unternehmen mit >50 Technikern im Außendienst zu identifizieren.\n\n"
    "Struktur: Firmenname; Website; Ort; Aktuelle Einstufung; Beschreibung der Branche Extern\n\n"
    "Gib deine Antwort im CSV-Format zurück (1 Zeile, 8 Spalten):\n"
    "Wikipedia-Branche;LinkedIn-Branche;Umsatz (Mio €);Empfohlene Neueinstufung;Begründung;FSM-Relevanz;Techniker-Einschätzung;Techniker-Begründung"
)

# === LOAD OPENAI API KEY ===
with open("api_key.txt", "r", encoding="utf-8") as f:
    openai.api_key = f.read().strip()

# === GOOGLE SHEET CONNECTION ===
scope = ["https://www.googleapis.com/auth/spreadsheets"]
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, scope)
sheet = gspread.authorize(creds).open_by_url(SHEET_URL).sheet1
sheet_values = sheet.get_all_values()

# === FIND START INDEX (column N = index 13) ===
# `start` is a 0-based index into `sheet_values` (index 0 is the header row).
# `filled_n` skips the header, so enumerating from 1 already yields the
# `sheet_values` index of each data row. The first row whose column N is empty
# is where processing begins; if every row is filled, `start` points past the
# end and both loops below are no-ops.
filled_n = [row[13] if len(row) > 13 else '' for row in sheet_values[1:]]
start = next(
    (i for i, v in enumerate(filled_n, start=1) if not str(v).strip()),
    len(filled_n) + 1,
)
print(f"Starte bei Zeile {start+1}")

wikipedia.set_lang(LANG)


# === LOAD WIKIPEDIA DATA ===
def get_wikipedia_data(name, website_hint=""):
    """Look up a company on Wikipedia and extract industry and revenue.

    Search terms are tried in order: the full name, the first two words of
    the name and, if a website is given, the first label of its host name
    (e.g. "heimbach" from "www.heimbach.com"). A search hit is only accepted
    when the first word of ``name`` occurs in the page title.

    Args:
        name: Company name as it appears in the sheet.
        website_hint: Optional company website used to derive an extra
            search term.

    Returns:
        A tuple ``(url, branche, umsatz)``. ``branche`` comes from the
        infobox row containing "Branche" (falling back to the first page
        category), ``umsatz`` from the row containing "Umsatz". Missing
        values are "k.A."; if no page matches, ``("", "k.A.", "k.A.")``.
    """
    name_words = name.lower().split()
    if not name_words:
        # Nothing to search for, and no first word to match titles against.
        return "", UNKNOWN, UNKNOWN
    first_word = name_words[0]

    begriffe = [name.strip(), " ".join(name.split()[:2])]
    if website_hint:
        host = website_hint.replace("https://", "").replace("http://", "")
        # Drop a leading "www." so the company label, not "www", is used.
        if host.lower().startswith("www."):
            host = host[4:]
        parts = host.split(".")
        if len(parts) > 1 and parts[0]:
            begriffe.append(parts[0])  # e.g. "heimbach" from "www.heimbach.com"
    # De-duplicate while keeping order: for one- or two-word names the first
    # two terms are identical and would otherwise be searched twice.
    begriffe = [b for b in dict.fromkeys(begriffe) if b]

    for suchbegriff in begriffe:
        try:
            results = wikipedia.search(suchbegriff, results=3)
        except Exception as exc:
            # Best effort: a failed search must not abort the whole batch.
            print(f"⚠️ Wikipedia-Suche fehlgeschlagen ({suchbegriff}): {str(exc)[:100]}")
            continue

        for title in results:
            try:
                page = wikipedia.page(title)
                if first_word not in page.title.lower():
                    continue

                url = page.url
                html = requests.get(url, timeout=HTTP_TIMEOUT).text
                soup = BeautifulSoup(html, 'html.parser')
                infobox = soup.find("table", {"class": "infobox"})

                branche = umsatz = ""
                if infobox:
                    for tr in infobox.find_all("tr"):
                        th, td = tr.find("th"), tr.find("td")
                        if not th or not td:
                            continue
                        if "Branche" in th.text:
                            branche = td.text.strip()
                        if "Umsatz" in th.text:
                            umsatz = td.text.strip()

                if not branche:
                    # No industry in the infobox: fall back to the first category.
                    cats = page.categories
                    branche = cats[0] if cats else UNKNOWN

                return url, branche or UNKNOWN, umsatz or UNKNOWN
            except Exception as exc:
                # Disambiguation pages, missing pages, network errors etc. are
                # skipped; unlike a bare `except`, Ctrl+C still interrupts.
                print(f"⚠️ Wikipedia-Fehler ({title}): {str(exc)[:100]}")
                continue

    return "", UNKNOWN, UNKNOWN


# === STEP 1: PROCESSING ===
# URLs found in step 1, keyed by `sheet_values` index. Step 2 reads them from
# here because `sheet_values` is a snapshot taken before step 1 wrote anything.
wiki_urls = {}

for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
    row = sheet_values[i]
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {row[0]}")

    url, branche, umsatz = get_wikipedia_data(row[0], row[1])
    wiki_urls[i] = url

    branche_final = branche if url else UNKNOWN
    umsatz_final = umsatz if url else UNKNOWN

    # Columns G..Q: industry, -, revenue, -, -, -, URL, timestamp, -, -, version.
    values = [
        branche_final,
        UNKNOWN,
        umsatz_final,
        UNKNOWN,
        UNKNOWN,
        UNKNOWN,
        url,
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        UNKNOWN,
        UNKNOWN,
        VERSION,
    ]
    sheet.update(range_name=f"G{i+1}:Q{i+1}", values=[values])
    print(f"✅ Aktualisiert: {values[:3]}...")
    time.sleep(RETRY_DELAY)

print("\n✅ Wikipedia-Auswertung abgeschlossen")


# === STEP 2: GPT ASSESSMENT ===
def classify_company(row, wikipedia_url=""):
    """Ask GPT to classify one company and return the parsed answer.

    The request is retried up to ``MAX_RETRIES`` times with ``RETRY_DELAY``
    seconds between attempts; if every attempt fails, default values are
    used. Each call appends a line (timestamp, company name, parsed fields,
    raw answer) to ``LOG_CSV``.

    Args:
        row: Sheet row; columns 0, 1, 2, 4 and 5 are sent to the model.
        wikipedia_url: Wikipedia link passed along in the prompt.

    Returns:
        A list of exactly 8 strings, taken from the first answer line that
        contains a semicolon. Missing fields are filled with "k.A." and
        surplus fields are dropped, so callers can unpack it safely.
    """
    user_prompt = {
        "role": "user",
        "content": f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]}\nWikipedia-Link: {wikipedia_url}"
    }

    for attempt in range(MAX_RETRIES):
        try:
            response = openai.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    user_prompt,
                ],
                temperature=0,
                timeout=15,
            )
            full_text = response.choices[0].message.content.strip()
            break
        except Exception as e:
            print(f"⚠️ GPT-Fehler (Versuch {attempt+1}): {str(e)[:100]}")
            time.sleep(RETRY_DELAY)
    else:
        # The loop finished without `break`, i.e. every attempt failed.
        print("❌ GPT 3x fehlgeschlagen – Standardwerte")
        full_text = "k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A."

    lines = full_text.splitlines()
    csv_line = next((line for line in lines if ";" in line), "")
    parts = [v.strip() for v in csv_line.split(";")] if csv_line else []
    # The model does not always return exactly 8 fields; normalise the length
    # so the caller's 8-way unpacking cannot fail. The raw answer is still
    # logged in full below.
    parts = (parts + [UNKNOWN] * GPT_FIELD_COUNT)[:GPT_FIELD_COUNT]

    with open(LOG_CSV, "a", newline="", encoding="utf-8") as log:
        writer = csv.writer(log, delimiter=";")
        writer.writerow([datetime.now().strftime("%Y-%m-%d %H:%M:%S"), row[0], *parts, full_text])

    return parts


# === RUN STEP 2 ===
for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
    row = sheet_values[i]
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] GPT-Bewertung für Zeile {i+1}: {row[0]}")

    # Prefer the URL found in step 1; only fall back to the (pre-step-1)
    # sheet snapshot if this row was not processed there.
    wiki_url = wiki_urls.get(i, row[12] if len(row) > 12 else "")

    wiki, linkedin, umsatz_chat, new_cat, reason, fsm, techniker, techniker_reason = classify_company(
        row, wikipedia_url=wiki_url
    )

    # Columns G..P: GPT fields, with the URL (M) and a fresh timestamp (N).
    values = [
        wiki,
        linkedin,
        umsatz_chat,
        new_cat,
        reason,
        fsm,
        wiki_url,
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        techniker,
        techniker_reason,
    ]
    sheet.update(range_name=f"G{i+1}:P{i+1}", values=[values])
    time.sleep(RETRY_DELAY)

print("\n✅ GPT-Bewertung abgeschlossen")