"""Enrich company rows of a Google Sheet with Wikipedia data and a GPT rating.

Step 1 looks every pending company up on Wikipedia and writes the industry,
revenue and article URL to columns G..Q of its row.  Step 2 sends the same
rows to the OpenAI chat API and writes the returned classification to
columns G..P.

The script is interactive: it asks on stdin how many rows to process.
"""
# Step 1: extract Wikipedia data only and write it to the Google Sheet
import csv
import os
import re
import time
from datetime import datetime
from difflib import SequenceMatcher
from typing import Dict, List, Sequence, Tuple

import gspread
import openai
import requests
import wikipedia
from bs4 import BeautifulSoup
from lxml import html as lh
from oauth2client.service_account import ServiceAccountCredentials

# === CONFIGURATION ===
VERSION = "1.0.9"
LANG = "de"
CREDENTIALS = "service_account.json"
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
DURCHLÄUFE = int(input("Wieviele Zeilen sollen überprüft werden? "))
MAX_RETRIES = 3
RETRY_DELAY = 5
LOG_CSV = "gpt_antworten_log.csv"

# Placeholder written to the sheet whenever a value could not be determined.
NO_DATA = "k.A."
# Number of CSV columns the GPT answer is expected to contain.
GPT_FIELD_COUNT = 8
# Seconds to wait for an HTTP / OpenAI response before giving up.
REQUEST_TIMEOUT = 15

# Infobox lookups: the table header cell is matched by its label and the first
# data cell next to it is read.  ``//text()`` also collects text nested in
# links, which is how infobox values are frequently marked up.
BRANCHE_XPATH = (
    "//th[contains(text(),'Branche') or contains(text(),'Tätigkeitsfeld')]"
    "/following-sibling::td[1]//text()"
)
UMSATZ_XPATH = "//th[contains(text(),'Umsatz')]/following-sibling::td[1]//text()"


def _cell(row: Sequence[str], index: int) -> str:
    """Return ``row[index]``, or an empty string when the row is too short."""
    return row[index] if len(row) > index else ""


def _first_text(nodes: Sequence[str]) -> str:
    """Return the first non-blank text node (stripped), or ``NO_DATA``."""
    for node in nodes:
        text = str(node).strip()
        if text:
            return text
    return NO_DATA


def _search_terms(name: str, website_hint: str) -> List[str]:
    """Build the ordered, de-duplicated list of Wikipedia search terms."""
    terms = [name.strip(), " ".join(name.split()[:2])]
    if website_hint:
        host = website_hint.replace("https://", "").replace("http://", "")
        labels = host.split(".")
        # A leading "www" label carries no information about the company.
        if labels and labels[0].lower() == "www":
            labels = labels[1:]
        if len(labels) > 1:
            terms.append(labels[0])
    # dict.fromkeys keeps the first occurrence of each term in order.
    return [term for term in dict.fromkeys(terms) if term]


def get_wikipedia_data(name: str, website_hint: str = "") -> Tuple[str, str, str]:
    """Look a company up on Wikipedia and read industry and revenue.

    Search terms are the full name, its first two words and, if given, the
    first host label of ``website_hint``.  A search hit is accepted only when
    the first word of ``name`` occurs in the article title.

    Args:
        name: Company name as stored in the sheet.
        website_hint: Optional company website URL.

    Returns:
        ``(url, industry, revenue)``.  ``url`` is ``""`` when no article was
        accepted; industry and revenue are ``"k.A."`` when not found.
    """
    words = name.lower().split()
    if not words:
        # Nothing to search for, and no title could be validated against it.
        return "", NO_DATA, NO_DATA
    first_word = words[0]

    for term in _search_terms(name, website_hint):
        try:
            titles = wikipedia.search(term, results=3)
        except Exception as exc:  # best effort: a failed search must not abort the run
            print(f"⚠️ Wikipedia-Suche fehlgeschlagen ({term}): {str(exc)[:100]}")
            continue

        for title in titles:
            try:
                page = wikipedia.page(title)
                if first_word not in page.title.lower():
                    continue
                url = page.url
                html_raw = requests.get(url, timeout=REQUEST_TIMEOUT).text
                dom = lh.fromstring(html_raw)
            except Exception as exc:  # ambiguous/missing page or network error
                print(f"⚠️ Wikipedia-Seite übersprungen ({title}): {str(exc)[:100]}")
                continue

            try:
                branche_clean = _first_text(dom.xpath(BRANCHE_XPATH))
                umsatz_clean = _first_text(dom.xpath(UMSATZ_XPATH))
            except Exception:
                # The article was found; only the infobox could not be read.
                branche_clean, umsatz_clean = NO_DATA, NO_DATA
            return url, branche_clean, umsatz_clean

    return "", NO_DATA, NO_DATA


def classify_company(row: Sequence[str], wikipedia_url: str = "") -> List[str]:
    """Ask the OpenAI chat API to classify one company row.

    The request is retried up to ``MAX_RETRIES`` times; if every attempt
    fails, placeholder values are used.  Each answer is appended to
    ``LOG_CSV``.

    Args:
        row: Sheet row; columns 0, 1, 2, 4 and 5 are sent to the model.
        wikipedia_url: Wikipedia article URL passed along as context.

    Returns:
        A list of exactly ``GPT_FIELD_COUNT`` (8) strings, padded with
        ``"k.A."`` or truncated so callers can always unpack it.
    """
    user_prompt = {
        "role": "user",
        "content": (
            f"{_cell(row, 0)};{_cell(row, 1)};{_cell(row, 2)};"
            f"{_cell(row, 4)};{_cell(row, 5)}\nWikipedia-Link: {wikipedia_url}"
        ),
    }
    system_prompt = {
        "role": "system",
        "content": (
            "Du bist ein Experte für Brancheneinstufung und FSM-Potenzialbewertung.\n"
            "Bitte beziehe dich ausschließlich auf das konkret genannte Unternehmen.\n"
            "FSM steht für Field Service Management. Ziel ist es, Unternehmen mit >50 Technikern im Außendienst zu identifizieren.\n\n"
            "Struktur: Firmenname; Website; Ort; Aktuelle Einstufung; Beschreibung der Branche Extern\n\n"
            "Gib deine Antwort im CSV-Format zurück (1 Zeile, 8 Spalten):\n"
            "Wikipedia-Branche;LinkedIn-Branche;Umsatz (Mio €);Empfohlene Neueinstufung;Begründung;FSM-Relevanz;Techniker-Einschätzung;Techniker-Begründung"
        ),
    }

    for attempt in range(MAX_RETRIES):
        try:
            response = openai.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[system_prompt, user_prompt],
                temperature=0,
                timeout=REQUEST_TIMEOUT,
            )
            full_text = response.choices[0].message.content.strip()
            break
        except Exception as e:
            print(f"⚠️ GPT-Fehler (Versuch {attempt+1}): {str(e)[:100]}")
            time.sleep(RETRY_DELAY)
    else:
        # Reached only when no attempt ended with ``break``.
        print("❌ GPT 3x fehlgeschlagen – Standardwerte")
        full_text = "k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A."

    # The first line containing a semicolon is taken as the CSV answer.
    csv_line = next((line for line in full_text.splitlines() if ";" in line), "")
    parts = [value.strip() for value in csv_line.split(";")] if csv_line else []
    # Normalise to a fixed width: the model does not reliably return 8 fields.
    parts = (parts + [NO_DATA] * GPT_FIELD_COUNT)[:GPT_FIELD_COUNT]

    with open(LOG_CSV, "a", newline="", encoding="utf-8") as log:
        writer = csv.writer(log, delimiter=";")
        writer.writerow(
            [datetime.now().strftime("%Y-%m-%d %H:%M:%S"), _cell(row, 0), *parts, full_text]
        )

    return parts


# === LOAD OpenAI API KEY ===
with open("api_key.txt", "r", encoding="utf-8") as f:
    openai.api_key = f.read().strip()

# === GOOGLE SHEET CONNECTION ===
scope = ["https://www.googleapis.com/auth/spreadsheets"]
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, scope)
sheet = gspread.authorize(creds).open_by_url(SHEET_URL).sheet1
sheet_values = sheet.get_all_values()

# === FIND START INDEX (column N = index 13) ===
# ``start`` is a 0-based index into ``sheet_values``; index 0 is the header
# row, so enumeration begins at 1.  The sheet row number is ``start + 1``.
filled_n = [_cell(row, 13) for row in sheet_values[1:]]
start = next(
    (idx for idx, value in enumerate(filled_n, start=1) if not str(value).strip()),
    len(filled_n) + 1,
)
end = min(start + DURCHLÄUFE, len(sheet_values))
print(f"Starte bei Zeile {start+1}")

wikipedia.set_lang(LANG)

# === PROCESSING (step 1: Wikipedia) ===
# URLs found here are kept in memory because ``sheet_values`` is a snapshot
# taken before these writes and would still hold the old column M content.
wiki_urls: Dict[int, str] = {}
for i in range(start, end):
    row = sheet_values[i]
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {_cell(row, 0)}")
    url, branche, umsatz = get_wikipedia_data(_cell(row, 0), _cell(row, 1))
    wiki_urls[i] = url

    branche_final = branche if url else NO_DATA
    umsatz_final = umsatz if url else NO_DATA

    # Columns G..Q; the timestamp lands in column N, which marks the row done.
    values = [
        branche_final,
        NO_DATA,
        umsatz_final,
        NO_DATA,
        NO_DATA,
        NO_DATA,
        url,
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        NO_DATA,
        NO_DATA,
        VERSION,
    ]
    sheet.update(range_name=f"G{i+1}:Q{i+1}", values=[values])
    print(f"✅ Aktualisiert: {values[:3]}...")
    time.sleep(RETRY_DELAY)

print("\n✅ Wikipedia-Auswertung abgeschlossen")

# === STEP 2: RUN GPT RATING ===
for i in range(start, end):
    row = sheet_values[i]
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] GPT-Bewertung für Zeile {i+1}: {_cell(row, 0)}")
    # Prefer the URL found in step 1 over the stale snapshot value.
    wiki_url = wiki_urls.get(i, _cell(row, 12))
    (
        wiki,
        linkedin,
        umsatz_chat,
        new_cat,
        reason,
        fsm,
        techniker,
        techniker_reason,
    ) = classify_company(row, wikipedia_url=wiki_url)

    # Columns G..P; column Q (version) written in step 1 is left untouched.
    values = [
        wiki,
        linkedin,
        umsatz_chat,
        new_cat,
        reason,
        fsm,
        wiki_url,
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        techniker,
        techniker_reason,
    ]
    sheet.update(range_name=f"G{i+1}:P{i+1}", values=[values])
    time.sleep(RETRY_DELAY)

print("\n✅ GPT-Bewertung abgeschlossen")