import csv
import functools
import os
import re
import time
import unicodedata
from datetime import datetime
from difflib import SequenceMatcher

import gspread
import openai
import requests
import wikipedia
from bs4 import BeautifulSoup
from oauth2client.service_account import ServiceAccountCredentials

# Optional: tiktoken for token counting (mode 8)
try:
    import tiktoken
except ImportError:
    tiktoken = None


# ==================== CONFIGURATION ====================
class Config:
    """Static settings shared by every part of the script."""

    # v1.3.18: new mode 8 (batch token counting) & mode 51 (verification only)
    VERSION = "v1.3.18"
    LANG = "de"
    CREDENTIALS_FILE = "service_account.json"
    SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
    MAX_RETRIES = 3
    RETRY_DELAY = 5
    LOG_CSV = "gpt_antworten_log.csv"
    SIMILARITY_THRESHOLD = 0.65
    DEBUG = True
    WIKIPEDIA_SEARCH_RESULTS = 5
    HTML_PARSER = "html.parser"
    BATCH_SIZE = 10
    TOKEN_MODEL = "gpt-3.5-turbo"


# ==================== RETRY DECORATOR ====================
def retry_on_failure(func):
    """Retry ``func`` up to ``Config.MAX_RETRIES`` times on any exception.

    Each failure is reported on stdout. Between attempts the wrapper sleeps
    for ``Config.RETRY_DELAY`` seconds. When every attempt fails the wrapper
    returns ``None`` instead of raising.
    """
    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        for attempt in range(Config.MAX_RETRIES):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f"⚠️ Fehler bei {func.__name__} (Versuch {attempt+1}): {str(e)[:100]}")
                # No point in waiting once the last attempt has failed.
                if attempt + 1 < Config.MAX_RETRIES:
                    time.sleep(Config.RETRY_DELAY)
        return None
    return wrapper


# ==================== LOGGING & HELPER FUNCTIONS ====================
# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs("Log", exist_ok=True)
LOG_FILE = os.path.join(
    "Log",
    f"{datetime.now().strftime('%d-%m-%Y_%H-%M')}_{Config.VERSION.replace('.', '')}.txt",
)


def debug_print(message):
    """Print ``message`` and append it to ``LOG_FILE`` when ``Config.DEBUG`` is set.

    A failure to write the log file is reported on stdout and never raised.
    """
    if not Config.DEBUG:
        return
    print(f"[DEBUG] {message}")
    try:
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"[DEBUG] {message}\n")
    except Exception as e:
        print(f"[DEBUG] Log-Schreibfehler: {e}")


def clean_text(text):
    """Return ``text`` NFKC-normalised, without ``[n]`` markers and with collapsed whitespace.

    Falsy input, or input that is empty after cleaning, yields ``"k.A."``.
    """
    if not text:
        return "k.A."
    text = unicodedata.normalize("NFKC", str(text))
    text = re.sub(r'\[\d+\]', '', text)  # drop footnote markers such as "[3]"
    text = re.sub(r'\s+', ' ', text).strip()
    return text if text else "k.A."
# Regex fragments for legal forms that are stripped from company names.
_LEGAL_FORMS = [
    r'gmbh', r'g\.m\.b\.h\.', r'ug', r'u\.g\.', r'ug \(haftungsbeschränkt\)',
    r'u\.g\. \(haftungsbeschränkt\)', r'ag', r'a\.g\.', r'ohg', r'o\.h\.g\.',
    r'kg', r'k\.g\.', r'gmbh & co\.?\s*kg', r'g\.m\.b\.h\. & co\.?\s*k\.g\.',
    r'ag & co\.?\s*kg', r'a\.g\. & co\.?\s*k\.g\.', r'e\.k\.', r'e\.kfm\.',
    r'e\.kfr\.', r'ltd\.', r'ltd & co\.?\s*kg', r's\.a r\.l\.', r'stiftung',
    r'genossenschaft', r'ggmbh', r'gug', r'partg', r'partgmbb', r'kgaa', r'se',
    r'og', r'o\.g\.', r'e\.u\.', r'ges\.n\.b\.r\.', r'genmbh', r'verein',
    r'kollektivgesellschaft', r'kommanditgesellschaft', r'einzelfirma',
    r'sàrl', r'sa', r'sagl', r'gmbh & co\.?\s*ohg', r'ag & co\.?\s*ohg',
    r'gmbh & co\.?\s*kgaa', r'ag & co\.?\s*kgaa', r's\.a\.', r's\.p\.a\.',
    r'b\.v\.', r'n\.v\.',
]

# Longest alternatives first, so "gmbh & co. kg" wins over plain "gmbh".
# Lookarounds replace \b because \b never matches after a trailing "." at the
# end of the string, which left forms such as "e.K." or "S.A." in place.
_LEGAL_FORM_RE = re.compile(
    r'(?<![\w.])(?:' + '|'.join(sorted(_LEGAL_FORMS, key=len, reverse=True)) + r')(?!\w)',
    flags=re.IGNORECASE,
)


def normalize_company_name(name):
    """Return ``name`` lower-cased with legal forms and hyphens removed.

    Hyphens and en dashes become spaces and runs of whitespace are collapsed.
    Falsy input yields an empty string.
    """
    if not name:
        return ""
    normalized = _LEGAL_FORM_RE.sub('', name)
    normalized = re.sub(r'[\-–]', ' ', normalized)
    normalized = re.sub(r'\s+', ' ', normalized).strip()
    return normalized.lower()


def extract_numeric_value(raw_value, is_umsatz=False):
    """Parse the first number in ``raw_value`` and return it as an integer string.

    The number is read in German notation: "." is a thousands separator and
    "," the decimal mark. With ``is_umsatz=True`` the result is expressed in
    millions: "mrd"/"milliarden" multiplies by 1000, "mio"/"millionen" keeps
    the value, and anything else is divided by 1e6.

    Returns ``"k.A."`` for empty input or input without digits. If the number
    cannot be converted, the text (with "ca."/"circa"/"über" removed) is
    returned unchanged.
    """
    if raw_value is None:
        return "k.A."
    raw_value = str(raw_value).strip()
    if not raw_value:
        return "k.A."
    raw_value = re.sub(r'\b(ca\.?|circa|über)\b', '', raw_value, flags=re.IGNORECASE)
    raw = raw_value.lower().replace("\xa0", " ")
    # Require a leading digit: removing "ca" from "ca. 1,5" leaves a stray "."
    # that must not be picked up as the number.
    match = re.search(r'\d[\d.,]*', raw)
    if not match:
        debug_print(f"Keine numerischen Zeichen gefunden im Rohtext: '{raw_value}'")
        return "k.A."
    num_str = match.group(0).rstrip('.,')
    if ',' in num_str:
        num_str = num_str.replace('.', '').replace(',', '.')
    else:
        num_str = num_str.replace('.', '')
    try:
        num = float(num_str)
    except ValueError as e:
        debug_print(f"Fehler bei der Umwandlung von '{num_str}' (Rohtext: '{raw_value}'): {e}")
        return raw_value
    if is_umsatz:
        if "mrd" in raw or "milliarden" in raw:
            num *= 1000
        elif "mio" in raw or "millionen" in raw:
            pass  # already in millions
        else:
            num /= 1e6
    return str(int(round(num)))


def compare_umsatz_values(crm, wiki):
    """Compare two revenue figures and return a short verdict string.

    Returns ``"OK"`` when the relative deviation from the CRM value is below
    10 %, ``"Abweichung: <n> Mio €"`` otherwise, ``"CRM Umsatz 0"`` when the
    CRM value is zero, and ``"Daten unvollständig"`` when either value is not
    numeric.
    """
    debug_print(f"Vergleich CRM Umsatz: '{crm}' mit Wikipedia Umsatz: '{wiki}'")
    try:
        crm_val = float(crm)
        wiki_val = float(wiki)
    except (TypeError, ValueError) as e:
        debug_print(f"Fehler beim Umwandeln der Werte: CRM='{crm}', Wiki='{wiki}': {e}")
        return "Daten unvollständig"
    if crm_val == 0:
        return "CRM Umsatz 0"
    abs_diff = abs(crm_val - wiki_val)
    if abs_diff / crm_val < 0.1:
        return "OK"
    return f"Abweichung: {int(round(abs_diff))} Mio €"


def _load_openai_key(context=""):
    """Read ``api_key.txt`` into ``openai.api_key``; return True on success.

    ``context`` is appended in parentheses to the logged error message.
    """
    try:
        with open("api_key.txt", "r") as f:
            api_key = f.read().strip()
    except Exception as e:
        suffix = f" ({context})" if context else ""
        debug_print(f"Fehler beim Lesen des API-Tokens{suffix}: {e}")
        return False
    openai.api_key = api_key
    return True


def _ask_chatgpt(role, content):
    """Send a single-message chat request and return the stripped reply text."""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": role, "content": content}],
        temperature=0.0
    )
    return response.choices[0].message.content.strip()


def evaluate_umsatz_chatgpt(company_name, wiki_umsatz):
    """Ask ChatGPT for a revenue estimate in million euros.

    Returns the rounded number as a string, the raw reply when it is not
    numeric, or ``"k.A."`` when the key file or the API call fails.
    """
    if not _load_openai_key():
        return "k.A."
    prompt = (
        f"Bitte schätze den Umsatz in Mio. Euro für das Unternehmen '{company_name}'. "
        f"Die Wikipedia-Daten zeigen: '{wiki_umsatz}'. "
        "Antworte nur mit der Zahl."
    )
    try:
        result = _ask_chatgpt("user", prompt)
        debug_print(f"ChatGPT Umsatzschätzung: '{result}'")
        try:
            value = float(result.replace(',', '.'))
            return str(int(round(value)))
        except Exception as conv_e:
            debug_print(f"Fehler bei der Verarbeitung der Umsatzschätzung '{result}': {conv_e}")
            return result
    except Exception as e:
        debug_print(f"Fehler beim Aufruf der ChatGPT API für Umsatzschätzung: {e}")
        return "k.A."


def validate_article_with_chatgpt(crm_data, wiki_data):
    """Ask ChatGPT whether a CRM record and a Wikipedia record describe the same company.

    Both arguments are semicolon-separated value strings matching the header
    lines embedded in the prompt. Returns the model's reply, or ``"k.A."``
    when the key file or the API call fails.
    """
    crm_headers = "Firmenname;Website;Ort;Beschreibung;Aktuelle Branche;Beschreibung Branche extern;Anzahl Techniker;Umsatz (CRM);Anzahl Mitarbeiter (CRM)"
    wiki_headers = "Wikipedia URL;Wikipedia Absatz;Wikipedia Branche;Wikipedia Umsatz;Wikipedia Mitarbeiter;Wikipedia Kategorien"
    prompt_text = (
        "Bitte überprüfe, ob die folgenden beiden Datensätze grundsätzlich zum gleichen Unternehmen gehören. "
        "Berücksichtige dabei, dass leichte Abweichungen in Firmennamen (z. B. unterschiedliche Schreibweisen, Mutter-Tochter-Beziehungen) "
        "oder im Ort (z. B. 'Oberndorf' vs. 'Oberndorf/Neckar') tolerierbar sind. "
        "Vergleiche insbesondere den Firmennamen, den Ort und die Branche. Unterschiede im Umsatz können bis zu 10% abweichen. "
        "Wenn die Daten im Wesentlichen übereinstimmen, antworte ausschließlich mit 'OK'. "
        "Falls nicht, nenne bitte den wichtigsten Grund und eine kurze Begründung, warum die Abweichung plausibel sein könnte.\n\n"
        f"CRM-Daten:\n{crm_headers}\n{crm_data}\n\n"
        f"Wikipedia-Daten:\n{wiki_headers}\n{wiki_data}\n\n"
        "Antwort: "
    )
    if not _load_openai_key():
        return "k.A."
    try:
        result = _ask_chatgpt("system", prompt_text)
        debug_print(f"Validierungsantwort ChatGPT: '{result}'")
        return result
    except Exception as e:
        debug_print(f"Fehler beim Validierungs-API-Aufruf: {e}")
        return "k.A."


def evaluate_branche_chatgpt(crm_branche, beschreibung, wiki_branche, wiki_kategorien):
    """Ask ChatGPT to assign the company to one branch of the target schema.

    The target schema is read from the first column of
    ``ziel_Branchenschema.csv``; a load failure is logged and yields an empty
    schema. Returns a dict with the keys ``branch``, ``consistency`` and
    ``justification``. All three are ``"k.A."`` when the key file or the API
    call fails.
    """
    def load_target_branches():
        # Load the target branch schema from the CSV file.
        try:
            with open("ziel_Branchenschema.csv", "r", encoding="utf-8") as csvfile:
                return [row[0] for row in csv.reader(csvfile) if row]
        except Exception as e:
            debug_print(f"Fehler beim Laden des Ziel-Branchenschemas: {e}")
            return []

    target_branches_str = "\n".join(load_target_branches())
    focus_branches = [
        "Gutachter / Versicherungen > Baugutachter",
        "Gutachter / Versicherungen > Technische Gutachten",
        "Gutachter / Versicherungen > Versicherungsgutachten",
        "Gutachter / Versicherungen > Medizinische Gutachten",
        "Hersteller / Produzenten > Anlagenbau",
        "Hersteller / Produzenten > Automaten (Vending, Slot)",
        "Hersteller / Produzenten > Gebäudetechnik Allgemein",
        "Hersteller / Produzenten > Gebäudetechnik Heizung, Lüftung, Klima",
        "Hersteller / Produzenten > Maschinenbau",
        "Hersteller / Produzenten > Medizintechnik",
        "Service provider (Dienstleister) > Aufzüge und Rolltreppen",
        "Service provider (Dienstleister) > Feuer- und Sicherheitssysteme",
        "Service provider (Dienstleister) > Servicedienstleister / Reparatur ohne Produktion",
        "Service provider (Dienstleister) > Facility Management",
        "Versorger > Telekommunikation"
    ]
    focus_branches_str = "\n".join(focus_branches)
    if not _load_openai_key("Branche"):
        return {"branch": "k.A.", "consistency": "k.A.", "justification": "k.A."}
    additional_instruction = ""
    # str() keeps a None value from crashing the .strip() call.
    if str(wiki_branche).strip() == "k.A.":
        additional_instruction = (
            "Da keine Wikipedia-Branche vorliegt, berücksichtige bitte die Wikipedia-Kategorien mit erhöhter Gewichtung, "
            "insbesondere wenn Hinweise auf Personentransport oder öffentliche Verkehrsdienstleistungen vorliegen. "
        )
    system_prompt = (
        "Du bist ein Experte im Field Service Management. Deine Aufgabe ist es, ein Unternehmen basierend auf folgenden Angaben einer Branche zuzuordnen.\n\n"
        f"CRM-Branche (Spalte F): {crm_branche}\n"
        f"Branchenbeschreibung (Spalte G): {beschreibung}\n"
        f"Wikipedia-Branche (Spalte N): {wiki_branche}\n"
        f"Wikipedia-Kategorien (Spalte Q): {wiki_kategorien}\n\n"
        + additional_instruction +
        "Das Ziel-Branchenschema umfasst ALLE gültigen Branchen, also sowohl Fokusbranchen als auch weitere, z. B. 'Housing > Sozialbau Unternehmen'.\n"
        "Das vollständige Ziel-Branchenschema lautet:\n"
        f"{target_branches_str}\n\n"
        "Falls das Unternehmen mehreren Branchen zugeordnet werden könnte, wähle bitte bevorzugt eine Branche aus der folgenden Fokusliste, sofern zutreffend:\n"
        f"{focus_branches_str}\n\n"
        "Gewichtung der Angaben:\n"
        "1. Wikipedia-Branche (Spalte N) zusammen mit Wikipedia-Kategorien (Spalte Q) (höchste Priorität, wenn verifiziert, ansonsten erhöhte Gewichtung der Kategorien)\n"
        "2. Branchenbeschreibung (Spalte G)\n"
        "3. CRM-Branche (Spalte F)\n\n"
        "Ordne das Unternehmen exakt einer der oben genannten Branchen zu (es dürfen keine zusätzlichen Branchen erfunden werden). "
        "Bitte antworte in folgendem Format (ohne zusätzliche Informationen):\n"
        "Branche: \n"
        "Übereinstimmung: \n"
        "Begründung: "
    )
    try:
        result = _ask_chatgpt("system", system_prompt)
        debug_print(f"Branchenabgleich ChatGPT Antwort: '{result}'")
        branch = "k.A."
        consistency = "k.A."
        justification = ""
        for line in result.split("\n"):
            lowered = line.strip().lower()  # tolerate indented reply lines
            if lowered.startswith("branche:"):
                branch = line.split(":", 1)[1].strip()
            elif lowered.startswith("übereinstimmung:"):
                consistency = line.split(":", 1)[1].strip()
            elif lowered.startswith("begründung:"):
                justification = line.split(":", 1)[1].strip()
        return {"branch": branch, "consistency": consistency, "justification": justification}
    except Exception as e:
        debug_print(f"Fehler beim Aufruf der ChatGPT API für Branchenabgleich: {e}")
        return {"branch": "k.A.", "consistency": "k.A.", "justification": "k.A."}


def evaluate_fsm_suitability(company_name, company_data):
    """Ask ChatGPT whether the company suits a Field Service Management solution.

    ``company_data`` is accepted for interface compatibility and is not used.
    Returns a dict with ``suitability`` and ``justification``; both are
    ``"k.A."`` when the key file or the API call fails.
    """
    if not _load_openai_key("FSM"):
        return {"suitability": "k.A.", "justification": "k.A."}
    prompt = (
        f"Bitte bewerte, ob das Unternehmen '{company_name}' für den Einsatz einer Field Service Management Lösung geeignet ist. "
        "Antworte ausschließlich mit 'Ja' oder 'Nein' und gib eine kurze Begründung."
    )
    try:
        result = _ask_chatgpt("system", prompt)
        debug_print(f"FSM-Eignungsantwort ChatGPT: '{result}'")
        suitability = "k.A."
        justification = ""
        lines = result.split("\n")
        if len(lines) > 1:
            # Structured reply: look for labelled lines first.
            for line in lines:
                lowered = line.strip().lower()
                if lowered.startswith("eignung:"):
                    suitability = line.split(":", 1)[1].strip()
                elif lowered.startswith("begründung:"):
                    justification = line.split(":", 1)[1].strip()
        if suitability not in ("Ja", "Nein"):
            # Free-text reply: first word is the verdict, the rest the reason.
            # Trailing punctuation is stripped so "Ja," is stored as "Ja".
            tokens = result.split()
            if tokens:
                suitability = tokens[0].strip(".,;:!")
                justification = " ".join(tokens[1:])
        return {"suitability": suitability, "justification": justification}
    except Exception as e:
        debug_print(f"Fehler beim Aufruf der ChatGPT API für FSM-Eignungsprüfung: {e}")
        return {"suitability": "k.A.", "justification": "k.A."}


def evaluate_servicetechnicians_estimate(company_name, company_data):
    """Ask ChatGPT for a service-technician headcount category.

    ``company_data`` is accepted for interface compatibility and is not used.
    Returns the model's reply, or ``"k.A."`` when the key file or the API
    call fails.
    """
    # The SerpAPI key used to be read here although it was never used; a
    # missing serpApiKey.txt no longer blocks the estimate.
    if not _load_openai_key("Servicetechniker"):
        return "k.A."
    prompt = (
        f"Bitte schätze auf Basis öffentlich zugänglicher Informationen (vor allem verifizierte Wikipedia-Daten) "
        f"die Anzahl der Servicetechniker des Unternehmens '{company_name}' ein. "
        "Gib die Antwort ausschließlich in einer der folgenden Kategorien aus: "
        "'<50 Techniker', '>100 Techniker', '>200 Techniker', '>500 Techniker'."
    )
    try:
        result = _ask_chatgpt("system", prompt)
        debug_print(f"Schätzung Servicetechniker ChatGPT: '{result}'")
        return result
    except Exception as e:
        debug_print(f"Fehler beim Aufruf der ChatGPT API für Servicetechniker-Schätzung: {e}")
        return "k.A."
def evaluate_servicetechnicians_explanation(company_name, st_estimate, company_data): try: with open("api_key.txt", "r") as f: api_key = f.read().strip() except Exception as e: debug_print(f"Fehler beim Lesen des API-Tokens (ST-Erklärung): {e}") return "k.A." openai.api_key = api_key prompt = ( f"Bitte erkläre, warum du für das Unternehmen '{company_name}' die Anzahl der Servicetechniker als '{st_estimate}' geschätzt hast. " "Berücksichtige dabei öffentlich zugängliche Informationen wie Branche, Umsatz, Mitarbeiterzahl und andere relevante Daten." ) try: response = openai.ChatCompletion.create( model="gpt-3.5-turbo", messages=[{"role": "system", "content": prompt}], temperature=0.0 ) result = response.choices[0].message.content.strip() debug_print(f"Servicetechniker-Erklärung ChatGPT: '{result}'") return result except Exception as e: debug_print(f"Fehler beim Aufruf der ChatGPT API für Servicetechniker-Erklärung: {e}") return "k.A." def map_internal_technicians(value): try: num = int(value) except Exception: return "k.A." 
if num < 50: return "<50 Techniker" elif num < 100: return ">100 Techniker" elif num < 200: return ">200 Techniker" else: return ">500 Techniker" def wait_for_sheet_update(sheet, cell, expected_value, timeout=5): start_time = time.time() while time.time() - start_time < timeout: try: current_value = sheet.acell(cell).value if current_value == expected_value: return True except Exception as e: debug_print(f"Fehler beim Lesen von Zelle {cell}: {e}") time.sleep(0.5) return False # ==================== NEUE FUNKTION: LINKEDIN-KONTAKT-SUCHE (Einzelkontakt) ==================== def search_linkedin_contact(company_name, website, position_query): try: with open("serpApiKey.txt", "r") as f: serp_key = f.read().strip() except Exception as e: debug_print("Fehler beim Lesen des SerpAPI-Schlüssels: " + str(e)) return None query = f'site:linkedin.com/in "{position_query}" "{company_name}"' params = { "engine": "google", "q": query, "api_key": serp_key, "hl": "de" } try: response = requests.get("https://serpapi.com/search", params=params) data = response.json() if "organic_results" in data and len(data["organic_results"]) > 0: result = data["organic_results"][0] title = result.get("title", "") if "–" in title: parts = title.split("–") elif "-" in title: parts = title.split("-") else: parts = [title] if len(parts) >= 2: name_part = parts[0].strip() pos = parts[1].split("|")[0].strip() name_parts = name_part.split(" ", 1) if len(name_parts) == 2: firstname, lastname = name_parts else: firstname = name_part lastname = "" return {"Firmenname": company_name, "Website": website, "Vorname": firstname, "Nachname": lastname, "Position": pos} else: return {"Firmenname": company_name, "Website": website, "Vorname": "", "Nachname": "", "Position": title} else: return None except Exception as e: debug_print(f"Fehler bei der SerpAPI-Suche: {e}") return None def count_linkedin_contacts(company_name, website, position_query): try: with open("serpApiKey.txt", "r") as f: serp_key = 
f.read().strip() except Exception as e: debug_print("Fehler beim Lesen des SerpAPI-Schlüssels: " + str(e)) return 0 query = f'site:linkedin.com/in "{position_query}" "{company_name}"' params = { "engine": "google", "q": query, "api_key": serp_key, "hl": "de" } try: response = requests.get("https://serpapi.com/search", params=params) data = response.json() if "organic_results" in data: count = len(data["organic_results"]) debug_print(f"Anzahl Kontakte für Query '{query}': {count}") return count else: debug_print(f"Keine Ergebnisse für Query: {query}") return 0 except Exception as e: debug_print(f"Fehler bei der SerpAPI-Suche (Count): {e}") return 0 # ==================== VERIFIZIERUNGS-MODUS (Modus 51) ==================== def _process_verification_row(self, row_num, row_data): # Verarbeitung: Extrahiere relevante Daten für die Verifizierung company_name = row_data[1] if len(row_data) > 1 else "" website = row_data[3] if len(row_data) > 3 else "" crm_description = row_data[7] if len(row_data) > 7 else "" wiki_url = row_data[11] if len(row_data) > 11 and row_data[11].strip() not in ["", "k.A."] else "k.A." wiki_absatz = row_data[12] if len(row_data) > 12 else "k.A." wiki_categories = row_data[16] if len(row_data) > 16 else "k.A." 
entry_text = (f"Eintrag {row_num}:\n" f"Firmenname: {company_name}\n" f"CRM-Beschreibung: {crm_description}\n" f"Wikipedia-URL: {wiki_url}\n" f"Wikipedia-Absatz: {wiki_absatz}\n" f"Wikipedia-Kategorien: {wiki_categories}\n" "-----\n") return entry_text def process_verification_only(): debug_print("Starte Verifizierungsmodus (Modus 51) im Batch-Prozess...") gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name( Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"])) sh = gc.open_by_url(Config.SHEET_URL) main_sheet = sh.sheet1 data = main_sheet.get_all_values() batch_size = Config.BATCH_SIZE batch_entries = [] row_indices = [] for i, row in enumerate(data[1:], start=2): if len(row) <= 19 or row[18].strip() == "": entry_text = _process_verification_row(None, i, row) batch_entries.append(entry_text) row_indices.append(i) if len(batch_entries) == batch_size: break if not batch_entries: debug_print("Keine Einträge für die Verifizierung gefunden.") return aggregated_prompt = ("Du bist ein Experte in der Verifizierung von Wikipedia-Artikeln für Unternehmen. " "Für jeden der folgenden Einträge prüfe, ob der vorhandene Wikipedia-Artikel (URL, Absatz, Kategorien) plausibel passt. " "Gib für jeden Eintrag das Ergebnis im Format aus:\n" "Eintrag : \n" "Dabei gilt:\n" "- Wenn der Artikel passt, antworte mit 'OK'.\n" "- Wenn der Artikel unpassend ist, antworte mit 'Alternativer Wikipedia-Artikel vorgeschlagen: | X | '.\n" "- Wenn kein Artikel gefunden wurde, antworte mit 'Kein Wikipedia-Eintrag vorhanden.'\n\n") aggregated_prompt += "\n".join(batch_entries) debug_print("Aggregierter Prompt für Verifizierungs-Batch erstellt.") token_count = "n.v." 
if tiktoken: try: enc = tiktoken.encoding_for_model(Config.TOKEN_MODEL) token_count = len(enc.encode(aggregated_prompt)) debug_print(f"Token-Zahl für Batch: {token_count}") except Exception as e: debug_print(f"Fehler beim Token-Counting: {e}") try: with open("api_key.txt", "r") as f: api_key = f.read().strip() except Exception as e: debug_print(f"Fehler beim Lesen des API-Tokens (Verifizierung): {e}") return openai.api_key = api_key try: response = openai.ChatCompletion.create( model=Config.TOKEN_MODEL, messages=[{"role": "system", "content": aggregated_prompt}], temperature=0.0 ) result = response.choices[0].message.content.strip() debug_print(f"Antwort ChatGPT Verifizierung Batch: {result}") except Exception as e: debug_print(f"Fehler bei der ChatGPT Anfrage für Verifizierung: {e}") return answers = result.split("\n") for idx, row_num in enumerate(row_indices): answer = "k.A." for line in answers: if line.strip().startswith(f"Eintrag {row_num}:"): answer = line.split(":", 1)[1].strip() break if answer.upper() == "OK": wiki_confirm = "OK" alt_article = "" wiki_explanation = "" elif answer.upper() == "KEIN WIKIPEDIA-EINTRAG VORHANDEN.": wiki_confirm = "" alt_article = "Kein Wikipedia-Eintrag vorhanden." wiki_explanation = "" elif answer.startswith("Alternativer Wikipedia-Artikel vorgeschlagen:"): parts = answer.split(":", 1)[1].split("|") alt_article = parts[0].strip() if len(parts) > 0 else "k.A." wiki_explanation = parts[2].strip() if len(parts) > 2 else "" wiki_confirm = "X" else: wiki_confirm = "" alt_article = answer wiki_explanation = answer main_sheet.update(values=[[wiki_confirm]], range_name=f"S{row_num}") main_sheet.update(values=[[alt_article]], range_name=f"U{row_num}") main_sheet.update(values=[[wiki_explanation]], range_name=f"V{row_num}") crm_branch = data[row_num-1][6] if len(data[row_num-1]) > 6 else "k.A." ext_branch = data[row_num-1][7] if len(data[row_num-1]) > 7 else "k.A." 
wiki_branch = data[row_num-1][14] if len(data[row_num-1]) > 14 else "k.A." wiki_cats = data[row_num-1][17] if len(data[row_num-1]) > 17 else "k.A." branch_result = evaluate_branche_chatgpt(crm_branch, ext_branch, wiki_branch, wiki_cats) main_sheet.update(values=[[branch_result["branch"]]], range_name=f"W{row_num}") main_sheet.update(values=[[branch_result["consistency"]]], range_name=f"Y{row_num}") main_sheet.update(values=[[str(token_count)]], range_name=f"AQ{row_num}") current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S") main_sheet.update(values=[[current_dt]], range_name=f"AO{row_num}") main_sheet.update(values=[[Config.VERSION]], range_name=f"AP{row_num}") debug_print(f"Zeile {row_num} verifiziert: Antwort: {answer}") time.sleep(Config.RETRY_DELAY) debug_print("Verifizierungs-Batch abgeschlossen.") # ==================== GOOGLE SHEET HANDLER ==================== class GoogleSheetHandler: def __init__(self): self.sheet = None self.sheet_values = [] self._connect() def _connect(self): scope = ["https://www.googleapis.com/auth/spreadsheets"] creds = ServiceAccountCredentials.from_json_keyfile_name(Config.CREDENTIALS_FILE, scope) self.sheet = gspread.authorize(creds).open_by_url(Config.SHEET_URL).sheet1 self.sheet_values = self.sheet.get_all_values() def get_start_index(self): filled_n = [row[13] if len(row) > 13 else '' for row in self.sheet_values[1:]] return next((i + 1 for i, v in enumerate(filled_n, start=1) if not str(v).strip()), len(filled_n) + 1) # ==================== ALIGNMENT DEMO (Hauptblatt) ==================== def alignment_demo(sheet): new_headers = [ [ "ReEval Flag", # A "CRM Name", # B "CRM Kurzform", # C "CRM Website", # D "CRM Ort", # E "CRM Beschreibung", # F "CRM Branche", # G "CRM Beschreibung Branche extern", # H "CRM Anzahl Techniker", # I "CRM Umsatz", # J "CRM Anzahl Mitarbeiter", # K "CRM Vorschlag Wiki URL", # L "Wiki URL", # M "Wiki Absatz", # N "Wiki Branche", # O "Wiki Umsatz", # P "Wiki Mitarbeiter", # Q "Wiki Kategorien", # R 
"Chat Wiki Konsistenzprüfung", # S "Chat Begründung Wiki Inkonsistenz", # T "Chat Vorschlag Wiki Artikel", # U "Begründung bei Abweichung", # V "Chat Vorschlag Branche", # W "Chat Konsistenz Branche", # X "Chat Begründung Abweichung Branche", # Y "Chat Prüfung FSM Relevanz", # Z "Chat Begründung für FSM Relevanz", # AA "Chat Schätzung Anzahl Mitarbeiter", # AB "Chat Konsistenzprüfung Mitarbeiterzahl", # AC "Chat Begründung Abweichung Mitarbeiterzahl", # AD "Chat Einschätzung Anzahl Servicetechniker", # AE "Chat Begründung Abweichung Anzahl Servicetechniker", # AF "Chat Schätzung Umsatz", # AG "Chat Begründung Abweichung Umsatz", # AH "Linked Serviceleiter gefunden", # AI "Linked It-Leiter gefunden", # AJ "Linked Management gefunden", # AK "Linked Disponent gefunden", # AL "Contact Search Timestamp", # AM "Wikipedia Timestamp", # AN "Timestamp letzte Prüfung", # AO "Version", # AP "Tokens" # AQ ], [ # <- Hier das fehlende Komma eingefügt "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "Wikipediascraper", "Wikipediascraper", "Wikipediascraper", "Wikipediascraper", "Wikipediascraper", "Wikipediascraper", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "LinkedIn (via SerpApi)", "LinkedIn (via SerpApi)", "LinkedIn (via SerpApi)", "LinkedIn (via SerpApi)", "System", "System", "System", "System", "System" ] ] header_range = "A1:AQ2" sheet.update(values=[new_headers], range_name=header_range) print("Alignment-Demo abgeschlossen: Neue Spaltenüberschriften in Zeile 11200 geschrieben.") # ==================== WIKIPEDIA SCRAPER ==================== class WikipediaScraper: def __init__(self): wikipedia.set_lang(Config.LANG) def _get_full_domain(self, website): if not website: return "" website = website.lower().strip() website = 
re.sub(r'^https?:\/\/', '', website) website = re.sub(r'^www\.', '', website) return website.split('/')[0] def _generate_search_terms(self, company_name, website): terms = [] full_domain = self._get_full_domain(website) if full_domain: terms.append(full_domain) normalized_name = normalize_company_name(company_name) candidate = " ".join(normalized_name.split()[:2]).strip() if candidate and candidate not in terms: terms.append(candidate) if normalized_name and normalized_name not in terms: terms.append(normalized_name) debug_print(f"Generierte Suchbegriffe: {terms}") return terms def _validate_article(self, page, company_name, website): full_domain = self._get_full_domain(website) domain_found = False if full_domain: try: html_raw = requests.get(page.url).text soup = BeautifulSoup(html_raw, Config.HTML_PARSER) infobox = soup.find('table', class_=lambda c: c and 'infobox' in c.lower()) if infobox: links = infobox.find_all('a', href=True) for link in links: href = link.get('href').lower() if href.startswith('/wiki/datei:'): continue if full_domain in href: debug_print(f"Definitiver Link-Match in Infobox gefunden: {href}") domain_found = True break if not domain_found and hasattr(page, 'externallinks'): for ext_link in page.externallinks: if full_domain in ext_link.lower(): debug_print(f"Definitiver Link-Match in externen Links gefunden: {ext_link}") domain_found = True break except Exception as e: debug_print(f"Fehler beim Extrahieren von Links: {str(e)}") normalized_title = normalize_company_name(page.title) normalized_company = normalize_company_name(company_name) similarity = SequenceMatcher(None, normalized_title, normalized_company).ratio() debug_print(f"Ähnlichkeit (normalisiert): {similarity:.2f} ({normalized_title} vs {normalized_company})") threshold = 0.60 if domain_found else Config.SIMILARITY_THRESHOLD return similarity >= threshold def extract_first_paragraph(self, page_url): try: response = requests.get(page_url) soup = BeautifulSoup(response.text, 
Config.HTML_PARSER) paragraphs = soup.find_all('p') for p in paragraphs: text = clean_text(p.get_text()) if len(text) > 50: return text return "k.A." except Exception as e: debug_print(f"Fehler beim Extrahieren des ersten Absatzes: {e}") return "k.A." def extract_categories(self, soup): cat_div = soup.find('div', id="mw-normal-catlinks") if cat_div: ul = cat_div.find('ul') if ul: cats = [clean_text(li.get_text()) for li in ul.find_all('li')] return ", ".join(cats) return "k.A." def _extract_infobox_value(self, soup, target): infobox = soup.find('table', class_=lambda c: c and any(kw in c.lower() for kw in ['infobox', 'vcard', 'unternehmen'])) if not infobox: return "k.A." keywords_map = { 'branche': ['branche', 'industrie', 'tätigkeit', 'geschäftsfeld', 'sektor', 'produkte', 'leistungen', 'aktivitäten', 'wirtschaftszweig'], 'umsatz': ['umsatz', 'jahresumsatz', 'konzernumsatz', 'gesamtumsatz', 'erlöse', 'umsatzerlöse', 'einnahmen', 'ergebnis', 'jahresergebnis'], 'mitarbeiter': ['mitarbeiter', 'beschäftigte', 'personal', 'mitarbeiterzahl', 'angestellte', 'belegschaft', 'personalstärke'] } keywords = keywords_map.get(target, []) for row in infobox.find_all('tr'): header = row.find('th') if header: header_text = clean_text(header.get_text()).lower() if any(kw in header_text for kw in keywords): value = row.find('td') if value: raw_value = clean_text(value.get_text()) if target == 'branche': clean_val = re.sub(r'\[.*?\]|\(.*?\)', '', raw_value) return ' '.join(clean_val.split()).strip() if target == 'umsatz': return extract_numeric_value(raw_value, is_umsatz=True) if target == 'mitarbeiter': return extract_numeric_value(raw_value, is_umsatz=False) return "k.A." def extract_full_infobox(self, soup): infobox = soup.find('table', class_=lambda c: c and any(kw in c.lower() for kw in ['infobox', 'vcard', 'unternehmen'])) if not infobox: return "k.A." 
return clean_text(infobox.get_text(separator=' | ')) def extract_fields_from_infobox_text(self, infobox_text, field_names): result = {} tokens = [token.strip() for token in infobox_text.split("|") if token.strip()] for i, token in enumerate(tokens): for field in field_names: if field.lower() in token.lower(): j = i + 1 while j < len(tokens) and not tokens[j]: j += 1 result[field] = tokens[j] if j < len(tokens) else "k.A." return result def extract_company_data(self, page_url): if not page_url: return { 'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.', 'full_infobox': 'k.A.' } try: response = requests.get(page_url) soup = BeautifulSoup(response.text, Config.HTML_PARSER) full_infobox = self.extract_full_infobox(soup) extracted_fields = self.extract_fields_from_infobox_text(full_infobox, ['Branche', 'Umsatz', 'Mitarbeiter']) raw_branche = extracted_fields.get('Branche', self._extract_infobox_value(soup, 'branche')) raw_umsatz = extracted_fields.get('Umsatz', self._extract_infobox_value(soup, 'umsatz')) raw_mitarbeiter = extracted_fields.get('Mitarbeiter', self._extract_infobox_value(soup, 'mitarbeiter')) umsatz_val = extract_numeric_value(raw_umsatz, is_umsatz=True) mitarbeiter_val = extract_numeric_value(raw_mitarbeiter, is_umsatz=False) categories_val = self.extract_categories(soup) first_paragraph = self.extract_first_paragraph(page_url) return { 'url': page_url, 'first_paragraph': first_paragraph, 'branche': raw_branche, 'umsatz': umsatz_val, 'mitarbeiter': mitarbeiter_val, 'categories': categories_val, 'full_infobox': full_infobox } except Exception as e: debug_print(f"Extraktionsfehler: {str(e)}") return { 'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.', 'full_infobox': 'k.A.' 
} @retry_on_failure def search_company_article(self, company_name, website): search_terms = self._generate_search_terms(company_name, website) for term in search_terms: try: results = wikipedia.search(term, results=Config.WIKIPEDIA_SEARCH_RESULTS) debug_print(f"Suchergebnisse für '{term}': {results}") for title in results: try: page = wikipedia.page(title, auto_suggest=False) if self._validate_article(page, company_name, website): return page except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError) as e: debug_print(f"Seitenfehler: {str(e)}") continue except Exception as e: debug_print(f"Suchfehler: {str(e)}") continue return None # ==================== DATA PROCESSOR ==================== class DataProcessor: def __init__(self): self.sheet_handler = GoogleSheetHandler() self.wiki_scraper = WikipediaScraper() def process_rows(self, num_rows=None): if MODE == "2": print("Re-Evaluierungsmodus: Verarbeitung aller Zeilen mit 'x' in Spalte A.") for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2): if row[0].strip().lower() == "x": self._process_single_row(i, row) elif MODE == "3": print("Alignment-Demo-Modus: Schreibe neue Spaltenüberschriften in Hauptblatt und Contacts.") alignment_demo_full() elif MODE == "4": for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2): if len(row) <= 39 or row[39].strip() == "": self._process_single_row(i, row, process_wiki=True, process_chatgpt=False) elif MODE == "5": for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2): if len(row) <= 40 or row[40].strip() == "": self._process_single_row(i, row, process_wiki=False, process_chatgpt=True) elif MODE == "51": for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2): if len(row) <= 25 or row[24].strip() == "": self._process_verification_row(i, row) elif MODE == "8": process_batch_token_count() else: start_index = self.sheet_handler.get_start_index() print(f"Starte bei Zeile {start_index+1}") rows_processed 
= 0 for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2): if i < start_index: continue if num_rows is not None and rows_processed >= num_rows: break self._process_single_row(i, row) rows_processed += 1 def _process_single_row(self, row_num, row_data, process_wiki=True, process_chatgpt=True): company_name = row_data[1] if len(row_data) > 1 else "" website = row_data[2] if len(row_data) > 2 else "" wiki_update_range = f"K{row_num}:Q{row_num}" dt_wiki_range = f"AN{row_num}" dt_chat_range = f"AO{row_num}" ver_range = f"AP{row_num}" print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {row_num}: {company_name}") current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S") if process_wiki: if len(row_data) <= 39 or row_data[39].strip() == "": if len(row_data) > 10 and row_data[10].strip() not in ["", "k.A."]: wiki_url = row_data[10].strip() try: company_data = self.wiki_scraper.extract_company_data(wiki_url) except Exception as e: debug_print(f"Fehler beim Laden des vorgeschlagenen Wikipedia-Artikels: {e}") article = self.wiki_scraper.search_company_article(company_name, website) company_data = self.wiki_scraper.extract_company_data(article.url) if article else { 'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.', 'full_infobox': 'k.A.' } else: article = self.wiki_scraper.search_company_article(company_name, website) company_data = self.wiki_scraper.extract_company_data(article.url) if article else { 'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.', 'mitarbeiter': 'k.A.', 'categories': 'k.A.', 'full_infobox': 'k.A.' 
} wiki_values = [ row_data[10] if len(row_data) > 10 and row_data[10].strip() not in ["", "k.A."] else "k.A.", company_data.get('url', 'k.A.'), company_data.get('first_paragraph', 'k.A.'), company_data.get('branche', 'k.A.'), company_data.get('umsatz', 'k.A.'), company_data.get('mitarbeiter', 'k.A.'), company_data.get('categories', 'k.A.') ] self.sheet_handler.sheet.update(values=[wiki_values], range_name=wiki_update_range) self.sheet_handler.sheet.update(values=[[current_dt]], range_name=dt_wiki_range) else: debug_print(f"Zeile {row_num}: Wikipedia-Timestamp bereits gesetzt – überspringe Wiki-Auswertung.") if process_chatgpt: if len(row_data) <= 40 or row_data[40].strip() == "": crm_umsatz = row_data[8] if len(row_data) > 8 else "k.A." abgleich_result = compare_umsatz_values(crm_umsatz, company_data.get('umsatz', 'k.A.')) self.sheet_handler.sheet.update(values=[[abgleich_result]], range_name=f"AG{row_num}") crm_data = ";".join(row_data[1:10]) wiki_data_str = ";".join(row_data[11:18]) valid_result = validate_article_with_chatgpt(crm_data, wiki_data_str) self.sheet_handler.sheet.update(values=[[valid_result]], range_name=f"R{row_num}") fsm_result = evaluate_fsm_suitability(company_name, company_data) self.sheet_handler.sheet.update(values=[[fsm_result["suitability"]]], range_name=f"Y{row_num}") self.sheet_handler.sheet.update(values=[[fsm_result["justification"]]], range_name=f"Z{row_num}") st_estimate = evaluate_servicetechnicians_estimate(company_name, company_data) self.sheet_handler.sheet.update(values=[[st_estimate]], range_name=f"AD{row_num}") internal_value = row_data[7] if len(row_data) > 7 else "k.A." internal_category = map_internal_technicians(internal_value) if internal_value != "k.A." else "k.A." if internal_category != "k.A." 
and st_estimate != internal_category: explanation = evaluate_servicetechnicians_explanation(company_name, st_estimate, company_data) discrepancy = explanation else: discrepancy = "ok" self.sheet_handler.sheet.update(values=[[discrepancy]], range_name=f"AF{row_num}") self.sheet_handler.sheet.update(values=[[current_dt]], range_name=dt_chat_range) else: debug_print(f"Zeile {row_num}: ChatGPT-Timestamp bereits gesetzt – überspringe ChatGPT-Auswertung.") self.sheet_handler.sheet.update(values=[[current_dt]], range_name=ver_range) self.sheet_handler.sheet.update(values=[[Config.VERSION]], range_name=ver_range) debug_print(f"✅ Aktualisiert: URL: {company_data.get('url', 'k.A.')}, " f"Branche: {company_data.get('branche', 'k.A.')}, Umsatz-Abgleich: {abgleich_result}, " f"Validierung: {valid_result}, " f"FSM: {fsm_result['suitability']}, Servicetechniker-Schätzung: {st_estimate}") time.sleep(Config.RETRY_DELAY) # ==================== GOOGLE SHEET HANDLER (für Hauptdaten) ==================== class GoogleSheetHandler: def __init__(self): self.sheet = None self.sheet_values = [] self._connect() def _connect(self): scope = ["https://www.googleapis.com/auth/spreadsheets"] creds = ServiceAccountCredentials.from_json_keyfile_name(Config.CREDENTIALS_FILE, scope) self.sheet = gspread.authorize(creds).open_by_url(Config.SHEET_URL).sheet1 self.sheet_values = self.sheet.get_all_values() def get_start_index(self): filled_n = [row[13] if len(row) > 13 else '' for row in self.sheet_values[1:]] return next((i + 1 for i, v in enumerate(filled_n, start=1) if not str(v).strip()), len(filled_n) + 1) # ==================== ALIGNMENT DEMO (Hauptblatt und Contacts) ==================== def alignment_demo_full(): alignment_demo(GoogleSheetHandler().sheet) gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name( Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"])) sh = gc.open_by_url(Config.SHEET_URL) try: contacts_sheet = sh.worksheet("Contacts") except 
gspread.exceptions.WorksheetNotFound: contacts_sheet = sh.add_worksheet(title="Contacts", rows="1000", cols="10") header = ["Firmenname", "Website", "Kurzform", "Vorname", "Nachname", "Position", "Anrede", "E-Mail"] contacts_sheet.update(values=[header], range_name="A1:H1") debug_print("Neues Blatt 'Contacts' erstellt und Header eingetragen.") alignment_demo(contacts_sheet) debug_print("Alignment-Demo für Hauptblatt und Contacts abgeschlossen.") # ==================== NEUER MODUS: CONTACT RESEARCH (via SerpAPI) ==================== def process_contact_research(): debug_print("Starte Contact Research (Modus 6)...") gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name( Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"])) sh = gc.open_by_url(Config.SHEET_URL) main_sheet = sh.sheet1 data = main_sheet.get_all_values() for i, row in enumerate(data[1:], start=2): company_name = row[1] if len(row) > 1 else "" search_name = row[2].strip() if len(row) > 2 and row[2].strip() not in ["", "k.A."] else company_name website = row[3] if len(row) > 3 else "" if not company_name or not website: continue count_service = count_linkedin_contacts(search_name, website, "Serviceleiter") count_it = count_linkedin_contacts(search_name, website, "IT-Leiter") count_management = count_linkedin_contacts(search_name, website, "Geschäftsführer") count_disponent = count_linkedin_contacts(search_name, website, "Disponent") current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S") main_sheet.update(values=[[str(count_service)]], range_name=f"AI{i}") main_sheet.update(values=[[str(count_it)]], range_name=f"AJ{i}") main_sheet.update(values=[[str(count_management)]], range_name=f"AK{i}") main_sheet.update(values=[[str(count_disponent)]], range_name=f"AL{i}") main_sheet.update(values=[[current_dt]], range_name=f"AM{i}") debug_print(f"Zeile {i}: Serviceleiter {count_service}, IT-Leiter {count_it}, Management {count_management}, Disponent {count_disponent} – 
Contact Search Timestamp gesetzt.") time.sleep(Config.RETRY_DELAY * 1.5) debug_print("Contact Research abgeschlossen.") # ==================== NEUER MODUS: CONTACTS (LinkedIn) ==================== def process_contacts(): debug_print("Starte LinkedIn-Kontaktsuche...") gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name( Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"])) sh = gc.open_by_url(Config.SHEET_URL) try: contacts_sheet = sh.worksheet("Contacts") except gspread.exceptions.WorksheetNotFound: contacts_sheet = sh.add_worksheet(title="Contacts", rows="1000", cols="10") header = ["Firmenname", "Website", "Kurzform", "Vorname", "Nachname", "Position", "Anrede", "E-Mail"] contacts_sheet.update(values=[header], range_name="A1:H1") debug_print("Neues Blatt 'Contacts' erstellt und Header eingetragen.") main_sheet = sh.sheet1 data = main_sheet.get_all_values() positions = ["Serviceleiter", "IT-Leiter", "Leiter After Sales", "Leiter Einsatzplanung"] new_rows = [] for idx, row in enumerate(data[1:], start=2): company_name = row[1] if len(row) > 1 else "" search_name = row[2].strip() if len(row) > 2 and row[2].strip() not in ["", "k.A."] else company_name website = row[3] if len(row) > 3 else "" if not company_name or not website: continue for pos in positions: debug_print(f"Suche nach Position: '{pos}' bei '{search_name}'") contact = search_linkedin_contact(search_name, website, pos) if contact: new_rows.append([contact["Firmenname"], website, search_name, contact["Vorname"], contact["Nachname"], contact["Position"], "", ""]) else: debug_print(f"Kein Kontakt für Position '{pos}' bei '{search_name}' gefunden.") if new_rows: last_row = len(contacts_sheet.get_all_values()) + 1 range_str = f"A{last_row}:H{last_row + len(new_rows) - 1}" contacts_sheet.update(values=new_rows, range_name=range_str) debug_print(f"{len(new_rows)} Kontakte in 'Contacts' hinzugefügt.") else: debug_print("Keine Kontakte gefunden.") # ==================== 
# NEW MODE: BATCH PROCESSING WITH TOKEN COUNTING (mode 8) ====================
def _build_batch_prompt(batch_rows):
    """Join columns B..G of each row with "; " and emit one line per row."""
    lines = []
    for row in batch_rows:
        # Columns: company name, short name, website, city, description,
        # current industry. Rows that are too short contribute fewer fields.
        lines.append("; ".join(row[1:7]) + "\n")
    return "".join(lines)


def process_batch_token_count(batch_size=10):
    """Count the prompt tokens per batch of rows and write them to column AQ.

    The data rows of the main sheet are split into consecutive batches. For
    each batch, columns B..G of all rows are combined into one text, its token
    count is determined with tiktoken, and that count is written to column AQ
    of every row of the batch.

    Args:
        batch_size: Number of sheet rows per batch; must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        ImportError: If the optional dependency tiktoken is not installed.
    """
    # Validate before any network access so a bad argument fails fast.
    if batch_size < 1:
        raise ValueError("batch_size muss mindestens 1 sein")
    try:
        import tiktoken
    except ImportError as exc:
        raise ImportError("tiktoken wird für Modus 8 benötigt (pip install tiktoken)") from exc

    debug_print("Starte Batch-Token-Zählung (Modus 8)...")
    # The encoding does not change between batches, so resolve it once.
    encoding = tiktoken.encoding_for_model(Config.TOKEN_MODEL)

    gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
        Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
    main_sheet = gc.open_by_url(Config.SHEET_URL).sheet1
    data = main_sheet.get_all_values()

    # data[0] is the header (sheet row 1); sheet row r lives at data[r - 1].
    for first_row in range(2, len(data) + 1, batch_size):
        batch_rows = data[first_row - 1:first_row - 1 + batch_size]
        token_count = len(encoding.encode(_build_batch_prompt(batch_rows)))
        debug_print(f"Batch beginnend in Zeile {first_row}: {token_count} Tokens")
        last_row = first_row + len(batch_rows) - 1
        # One range write per batch instead of one write per row.
        main_sheet.update(values=[[str(token_count)] for _ in batch_rows],
                          range_name=f"AQ{first_row}:AQ{last_row}")
        time.sleep(Config.RETRY_DELAY)
    debug_print("Batch-Token-Zählung abgeschlossen.")


# ==================== MAIN PROGRAM ====================
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", type=str, help="Modus: 1,2,3,4,5,6,7,51 oder 8")
    parser.add_argument("--num_rows", type=int, default=0,
                        help="Anzahl der zu bearbeitenden Zeilen (nur für Modus 1)")
    args = parser.parse_args()

    if not args.mode:
        print("Modi:")
        print("1 = Regulärer Modus")
        print("2 = Re-Evaluierungsmodus (nur Zeilen mit 'x' in Spalte A)")
        print("3 = Alignment-Demo (Header in Hauptblatt und Contacts)")
        print("4 = Nur Wikipedia-Suche (Zeilen ohne Wikipedia-Timestamp)")
        print("5 = Nur ChatGPT-Bewertung (Zeilen ohne ChatGPT-Timestamp)")
        print("6 = Contact Research (via SerpAPI)")
        print("7 = Contacts (LinkedIn)")
        print("8 = Batch-Token-Zählung")
        print("51 = Nur Verifizierung (Wikipedia + Brancheneinordnung)")
        args.mode = input("Wählen Sie den Modus: ").strip()

    # DataProcessor.process_rows reads this module-level name.
    MODE = args.mode

    if MODE == "1":
        try:
            num_rows = args.num_rows if args.num_rows > 0 else int(
                input("Wieviele Zeilen sollen überprüft werden? "))
        except (ValueError, EOFError):
            print("Ungültige Eingabe. Bitte eine Zahl eingeben.")
            raise SystemExit(1)
        DataProcessor().process_rows(num_rows)
    elif MODE in ("2", "3", "4", "5", "51"):
        # process_rows already implements the row selection for these modes,
        # so the loops are not repeated here.
        DataProcessor().process_rows()
    elif MODE == "6":
        process_contact_research()
    elif MODE == "7":
        process_contacts()
    elif MODE == "8":
        process_batch_token_count()
    else:
        # An unknown mode used to fall through and report success.
        print(f"Unbekannter Modus: {MODE}")
        raise SystemExit(1)

    print(f"\n✅ Auswertung abgeschlossen ({Config.VERSION})")