import os
import time
import re
import csv
import unicodedata
from datetime import datetime
from difflib import SequenceMatcher

import gspread
import wikipedia
import requests
import openai
from bs4 import BeautifulSoup
from oauth2client.service_account import ServiceAccountCredentials

# ==================== CONFIGURATION ====================
class Config:
    VERSION = "v1.3.13"  # v1.3.13: new mode 8 (batch token count in column AQ) & mode 51 (verification only)
    LANG = "de"
    CREDENTIALS_FILE = "service_account.json"
    SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
    MAX_RETRIES = 3
    RETRY_DELAY = 5
    LOG_CSV = "gpt_antworten_log.csv"
    SIMILARITY_THRESHOLD = 0.65
    DEBUG = True
    WIKIPEDIA_SEARCH_RESULTS = 5
    HTML_PARSER = "html.parser"

# ==================== RETRY DECORATOR ====================
def retry_on_failure(func):
    """Retry a function up to Config.MAX_RETRIES times; return None if all attempts fail."""
    def wrapper(*args, **kwargs):
        for attempt in range(Config.MAX_RETRIES):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f"⚠️ Fehler bei {func.__name__} (Versuch {attempt+1}): {str(e)[:100]}")
                time.sleep(Config.RETRY_DELAY)
        return None
    return wrapper

# ==================== LOGGING & HELPER FUNCTIONS ====================
if not os.path.exists("Log"):
    os.makedirs("Log")
LOG_FILE = os.path.join(
    "Log",
    f"{datetime.now().strftime('%d-%m-%Y_%H-%M')}_{Config.VERSION.replace('.', '')}.txt"
)

def debug_print(message):
    if Config.DEBUG:
        print(f"[DEBUG] {message}")
        try:
            with open(LOG_FILE, "a", encoding="utf-8") as f:
                f.write(f"[DEBUG] {message}\n")
        except Exception as e:
            print(f"[DEBUG] Log-Schreibfehler: {e}")

def clean_text(text):
    if not text:
        return "k.A."
    text = unicodedata.normalize("NFKC", str(text))
    text = re.sub(r'\[\d+\]', '', text)  # strip footnote markers like [1]
    text = re.sub(r'\s+', ' ', text).strip()
    return text if text else "k.A."

def normalize_company_name(name):
    """Strip legal forms (GmbH, AG, KG, ...) and punctuation so company names can be compared."""
    if not name:
        return ""
    forms = [
        r'gmbh', r'g\.m\.b\.h\.', r'ug', r'u\.g\.', r'ug \(haftungsbeschränkt\)',
        r'u\.g\. \(haftungsbeschränkt\)', r'ag', r'a\.g\.', r'ohg', r'o\.h\.g\.',
        r'kg', r'k\.g\.', r'gmbh & co\.?\s*kg', r'g\.m\.b\.h\. & co\.?\s*k\.g\.',
        r'ag & co\.?\s*kg', r'a\.g\. & co\.?\s*k\.g\.', r'e\.k\.', r'e\.kfm\.',
        r'e\.kfr\.', r'ltd\.', r'ltd & co\.?\s*kg', r's\.a r\.l\.', r'stiftung',
        r'genossenschaft', r'ggmbh', r'gug', r'partg', r'partgmbb', r'kgaa', r'se',
        r'og', r'o\.g\.', r'e\.u\.', r'ges\.n\.b\.r\.', r'genmbh', r'verein',
        r'kollektivgesellschaft', r'kommanditgesellschaft', r'einzelfirma',
        r'sàrl', r'sa', r'sagl', r'gmbh & co\.?\s*ohg', r'ag & co\.?\s*ohg',
        r'gmbh & co\.?\s*kgaa', r'ag & co\.?\s*kgaa', r's\.a\.', r's\.p\.a\.',
        r'b\.v\.', r'n\.v\.'
    ]
    pattern = r'\b(' + '|'.join(forms) + r')\b'
    normalized = re.sub(pattern, '', name, flags=re.IGNORECASE)
    normalized = re.sub(r'[\-–]', ' ', normalized)
    normalized = re.sub(r'\s+', ' ', normalized).strip()
    return normalized.lower()

def extract_numeric_value(raw_value, is_umsatz=False):
    raw_value = raw_value.strip()
    if not raw_value:
        return "k.A."
    raw_value = re.sub(r'\b(ca\.?|circa|über)\b', '', raw_value, flags=re.IGNORECASE)
    raw = raw_value.lower().replace("\xa0", " ")
    match = re.search(r'([\d.,]+)', raw, flags=re.UNICODE)
    if not match or not match.group(1).strip():
        debug_print(f"Keine numerischen Zeichen gefunden im Rohtext: '{raw_value}'")
        return "k.A."
    num_str = match.group(1)
    if ',' in num_str:
        # German number format: '.' is the thousands separator, ',' the decimal separator
        num_str = num_str.replace('.', '').replace(',', '.')
        try:
            num = float(num_str)
        except Exception as e:
            debug_print(f"Fehler bei der Umwandlung von '{num_str}' (Rohtext: '{raw_value}'): {e}")
            return raw_value
    else:
        num_str = num_str.replace(' ', '').replace('.', '')
        try:
            num = float(num_str)
        except Exception as e:
            debug_print(f"Fehler bei der Umwandlung von '{num_str}' (Rohtext: '{raw_value}'): {e}")
            return raw_value
    if is_umsatz:
        # Normalize revenue to millions of euros
        if "mrd" in raw or "milliarden" in raw:
            num *= 1000
        elif "mio" in raw or "millionen" in raw:
            pass
        else:
            num /= 1e6  # no unit found: assume plain euros
    return str(int(round(num)))

def compare_umsatz_values(crm, wiki):
    debug_print(f"Vergleich CRM Umsatz: '{crm}' mit Wikipedia Umsatz: '{wiki}'")
    try:
        crm_val = float(crm)
        wiki_val = float(wiki)
    except Exception as e:
        debug_print(f"Fehler beim Umwandeln der Werte: CRM='{crm}', Wiki='{wiki}': {e}")
        return "Daten unvollständig"
    if crm_val == 0:
        return "CRM Umsatz 0"
    diff = abs(crm_val - wiki_val) / crm_val
    if diff < 0.1:
        return "OK"
    diff_mio = abs(crm_val - wiki_val)
    return f"Abweichung: {int(round(diff_mio))} Mio €"

def evaluate_umsatz_chatgpt(company_name, wiki_umsatz):
    try:
        with open("api_key.txt", "r") as f:
            api_key = f.read().strip()
    except Exception as e:
        debug_print(f"Fehler beim Lesen des API-Tokens: {e}")
        return "k.A."
    openai.api_key = api_key
    prompt = (
        f"Bitte schätze den Umsatz in Mio. Euro für das Unternehmen '{company_name}'. "
        f"Die Wikipedia-Daten zeigen: '{wiki_umsatz}'. "
        "Antworte nur mit der Zahl."
    )
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0
        )
        result = response.choices[0].message.content.strip()
        debug_print(f"ChatGPT Umsatzschätzung: '{result}'")
        try:
            value = float(result.replace(',', '.'))
            return str(int(round(value)))
        except Exception as conv_e:
            debug_print(f"Fehler bei der Verarbeitung der Umsatzschätzung '{result}': {conv_e}")
            return result
    except Exception as e:
        debug_print(f"Fehler beim Aufruf der ChatGPT API für Umsatzschätzung: {e}")
        return "k.A."

def validate_article_with_chatgpt(crm_data, wiki_data):
    crm_headers = ("Firmenname;Website;Ort;Beschreibung;Aktuelle Branche;"
                   "Beschreibung Branche extern;Anzahl Techniker;Umsatz (CRM);Anzahl Mitarbeiter (CRM)")
    wiki_headers = ("Wikipedia URL;Wikipedia Absatz;Wikipedia Branche;"
                    "Wikipedia Umsatz;Wikipedia Mitarbeiter;Wikipedia Kategorien")
    prompt_text = (
        "Bitte überprüfe, ob die folgenden beiden Datensätze grundsätzlich zum gleichen Unternehmen gehören. "
        "Berücksichtige dabei, dass leichte Abweichungen in Firmennamen (z. B. unterschiedliche Schreibweisen, "
        "Mutter-Tochter-Beziehungen) oder im Ort (z. B. 'Oberndorf' vs. 'Oberndorf/Neckar') tolerierbar sind. "
        "Vergleiche insbesondere den Firmennamen, den Ort und die Branche. Unterschiede im Umsatz können bis zu 10% abweichen. "
        "Wenn die Daten im Wesentlichen übereinstimmen, antworte ausschließlich mit 'OK'. "
        "Falls nicht, nenne bitte den wichtigsten Grund und eine kurze Begründung, warum die Abweichung plausibel sein könnte.\n\n"
        f"CRM-Daten:\n{crm_headers}\n{crm_data}\n\n"
        f"Wikipedia-Daten:\n{wiki_headers}\n{wiki_data}\n\n"
        "Antwort: "
    )
    try:
        with open("api_key.txt", "r") as f:
            api_key = f.read().strip()
    except Exception as e:
        debug_print(f"Fehler beim Lesen des API-Tokens: {e}")
        return "k.A."
    openai.api_key = api_key
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "system", "content": prompt_text}],
            temperature=0.0
        )
        result = response.choices[0].message.content.strip()
        debug_print(f"Validierungsantwort ChatGPT: '{result}'")
        return result
    except Exception as e:
        debug_print(f"Fehler beim Validierungs-API-Aufruf: {e}")
        return "k.A."

def evaluate_branche_chatgpt(crm_branche, beschreibung, wiki_branche, wiki_kategorien):
    prompt_text = (
        "Du bist ein Experte im Field Service Management. Analysiere die folgenden Branchenangaben und ordne das Unternehmen "
        "einer der gültigen Branchen zu. Nutze ausschließlich die vorhandenen Informationen.\n\n"
        f"CRM-Branche: {crm_branche}\n"
        f"Beschreibung Branche extern: {beschreibung}\n"
        f"Wikipedia-Branche: {wiki_branche}\n"
        f"Wikipedia-Kategorien: {wiki_kategorien}\n\n"
        "Ordne das Unternehmen exakt einer der gültigen Branchen zu und gib aus:\n"
        "Branche: \n"
        "Übereinstimmung: \n"
        "Begründung: "
    )
    try:
        with open("api_key.txt", "r") as f:
            api_key = f.read().strip()
    except Exception as e:
        debug_print(f"Fehler beim Lesen des API-Tokens (Branche): {e}")
        return {"branch": "k.A.", "consistency": "k.A.", "justification": "k.A."}
    openai.api_key = api_key
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "system", "content": prompt_text}],
            temperature=0.0
        )
        result = response.choices[0].message.content.strip()
        debug_print(f"Branchenabgleich ChatGPT Antwort: '{result}'")
        branch = "k.A."
        consistency = "k.A."
        justification = ""
        # Parse the "Branche:/Übereinstimmung:/Begründung:" lines requested in the prompt
        for line in result.split("\n"):
            if line.lower().startswith("branche:"):
                branch = line.split(":", 1)[1].strip()
            elif line.lower().startswith("übereinstimmung:"):
                consistency = line.split(":", 1)[1].strip()
            elif line.lower().startswith("begründung:"):
                justification = line.split(":", 1)[1].strip()
        return {"branch": branch, "consistency": consistency, "justification": justification}
    except Exception as e:
        debug_print(f"Fehler beim Aufruf der ChatGPT API für Branchenabgleich: {e}")
        return {"branch": "k.A.", "consistency": "k.A.", "justification": "k.A."}

def evaluate_fsm_suitability(company_name, company_data):
    try:
        with open("api_key.txt", "r") as f:
            api_key = f.read().strip()
    except Exception as e:
        debug_print(f"Fehler beim Lesen des API-Tokens (FSM): {e}")
        return {"suitability": "k.A.", "justification": "k.A."}
    openai.api_key = api_key
    prompt = (
        f"Bitte bewerte, ob das Unternehmen '{company_name}' für den Einsatz einer Field Service Management Lösung geeignet ist. "
        "Antworte ausschließlich mit 'Ja' oder 'Nein' und gib eine kurze Begründung."
    )
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "system", "content": prompt}],
            temperature=0.0
        )
        result = response.choices[0].message.content.strip()
        debug_print(f"FSM-Eignungsantwort ChatGPT: '{result}'")
        suitability = "k.A."
        justification = ""
        lines = result.split("\n")
        if len(lines) == 1:
            # Single-line answer: first word is 'Ja'/'Nein', the rest is the justification
            parts = result.split(" ", 1)
            suitability = parts[0].strip()
            justification = parts[1].strip() if len(parts) > 1 else ""
        else:
            for line in lines:
                if line.lower().startswith("eignung:"):
                    suitability = line.split(":", 1)[1].strip()
                elif line.lower().startswith("begründung:"):
                    justification = line.split(":", 1)[1].strip()
        if suitability not in ["Ja", "Nein"]:
            parts = result.split(" ", 1)
            suitability = parts[0].strip()
            justification = " ".join(result.split()[1:]).strip()
        return {"suitability": suitability, "justification": justification}
    except Exception as e:
        debug_print(f"Fehler beim Aufruf der ChatGPT API für FSM-Eignungsprüfung: {e}")
        return {"suitability": "k.A.", "justification": "k.A."}

def evaluate_servicetechnicians_estimate(company_name, company_data):
    # Note: the SerpAPI key is read here for consistency with the other research helpers,
    # but the estimate below currently relies on ChatGPT alone.
    try:
        with open("serpApiKey.txt", "r") as f:
            serp_key = f.read().strip()
    except Exception as e:
        debug_print(f"Fehler beim Lesen des SerpAPI-Schlüssels (Servicetechniker): {e}")
        return "k.A."
    try:
        with open("api_key.txt", "r") as f:
            api_key = f.read().strip()
    except Exception as e:
        debug_print(f"Fehler beim Lesen des API-Tokens (Servicetechniker): {e}")
        return "k.A."
    openai.api_key = api_key
    prompt = (
        f"Bitte schätze die Anzahl der Servicetechniker des Unternehmens '{company_name}' in einer der folgenden Kategorien: "
        "'<50 Techniker', '>100 Techniker', '>200 Techniker', '>500 Techniker'."
    )
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "system", "content": prompt}],
            temperature=0.0
        )
        result = response.choices[0].message.content.strip()
        debug_print(f"Schätzung Servicetechniker ChatGPT: '{result}'")
        return result
    except Exception as e:
        debug_print(f"Fehler beim Aufruf der ChatGPT API für Servicetechniker-Schätzung: {e}")
        return "k.A."

def evaluate_servicetechnicians_explanation(company_name, st_estimate, company_data):
    try:
        with open("api_key.txt", "r") as f:
            api_key = f.read().strip()
    except Exception as e:
        debug_print(f"Fehler beim Lesen des API-Tokens (ST-Erklärung): {e}")
        return "k.A."
    openai.api_key = api_key
    prompt = (
        f"Bitte erkläre, warum du für das Unternehmen '{company_name}' die Anzahl der Servicetechniker als '{st_estimate}' geschätzt hast."
    )
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "system", "content": prompt}],
            temperature=0.0
        )
        result = response.choices[0].message.content.strip()
        debug_print(f"Servicetechniker-Erklärung ChatGPT: '{result}'")
        return result
    except Exception as e:
        debug_print(f"Fehler beim Aufruf der ChatGPT API für Servicetechniker-Erklärung: {e}")
        return "k.A."

def map_internal_technicians(value):
    try:
        num = int(value)
    except Exception:
        return "k.A."
    # Map an internal headcount to the estimate categories used above
    if num < 50:
        return "<50 Techniker"
    elif num < 100:
        return ">100 Techniker"
    elif num < 200:
        return ">200 Techniker"
    else:
        return ">500 Techniker"

def wait_for_sheet_update(sheet, cell, expected_value, timeout=5):
    """Poll a cell until it contains the expected value or the timeout expires."""
    start_time = time.time()
    while time.time() - start_time < timeout:
        try:
            current_value = sheet.acell(cell).value
            if current_value == expected_value:
                return True
        except Exception as e:
            debug_print(f"Fehler beim Lesen von Zelle {cell}: {e}")
        time.sleep(0.5)
    return False

# ==================== NEW FUNCTION: LINKEDIN CONTACT SEARCH (single contact) ====================
def search_linkedin_contact(company_name, website, position_query):
    try:
        with open("serpApiKey.txt", "r") as f:
            serp_key = f.read().strip()
    except Exception as e:
        debug_print("Fehler beim Lesen des SerpAPI-Schlüssels: " + str(e))
        return None
    search_name = company_name  # The short form of the name could be used here instead, if available.
    query = f'site:linkedin.com/in "{position_query}" "{search_name}"'
    debug_print(f"Erstelle LinkedIn-Query: {query}")
    params = {
        "engine": "google",
        "q": query,
        "api_key": serp_key,
        "hl": "de"
    }
    try:
        response = requests.get("https://serpapi.com/search", params=params)
        data = response.json()
        debug_print(f"SerpAPI-Response für Query '{query}': {data.get('organic_results', [])[:1]}")
        if "organic_results" in data and len(data["organic_results"]) > 0:
            result = data["organic_results"][0]
            title = result.get("title", "")
            debug_print(f"LinkedIn-Suchergebnis-Titel: {title}")
            # LinkedIn result titles usually look like "Name – Position | Company"
            if "–" in title:
                parts = title.split("–")
            elif "-" in title:
                parts = title.split("-")
            else:
                parts = [title]
            if len(parts) >= 2:
                name_part = parts[0].strip()
                pos = parts[1].split("|")[0].strip()
                name_parts = name_part.split(" ", 1)
                if len(name_parts) == 2:
                    firstname, lastname = name_parts
                else:
                    firstname = name_part
                    lastname = ""
                debug_print(f"Kontakt gefunden: {firstname} {lastname}, Position: {pos}")
                return {"Firmenname": company_name, "Website": website,
                        "Vorname": firstname, "Nachname": lastname, "Position": pos}
            else:
                debug_print(f"Kontakt gefunden, aber unvollständige Informationen: {title}")
                return {"Firmenname": company_name, "Website": website,
                        "Vorname": "", "Nachname": "", "Position": title}
        else:
            debug_print(f"Keine LinkedIn-Ergebnisse für Query: {query}")
            return None
    except Exception as e:
        debug_print(f"Fehler bei der SerpAPI-Suche: {e}")
        return None

def count_linkedin_contacts(company_name, website, position_query):
    try:
        with open("serpApiKey.txt", "r") as f:
            serp_key = f.read().strip()
    except Exception as e:
        debug_print("Fehler beim Lesen des SerpAPI-Schlüssels: " + str(e))
        return 0
    query = f'site:linkedin.com/in "{position_query}" "{company_name}"'
    debug_print(f"Erstelle LinkedIn-Query (Count): {query}")
    params = {
        "engine": "google",
        "q": query,
        "api_key": serp_key,
        "hl": "de"
    }
    try:
        response = requests.get("https://serpapi.com/search", params=params)
        data = response.json()
        if "organic_results" in data:
            # Counts only the first page of results returned by SerpAPI
            count = len(data["organic_results"])
            debug_print(f"Anzahl Kontakte für Query '{query}': {count}")
            return count
        else:
            debug_print(f"Keine Ergebnisse für Query: {query}")
            return 0
    except Exception as e:
        debug_print(f"Fehler bei der SerpAPI-Suche (Count): {e}")
        return 0

# ==================== NEW FUNCTION: _process_verification_row ====================
def _process_verification_row(self, row_num, row_data):
    # Processing only up to column Y (justification for industry deviation)
    company_name = row_data[1] if len(row_data) > 1 else ""
    website = row_data[3] if len(row_data) > 3 else ""
    current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    if len(row_data) > 11 and row_data[11].strip() not in ["", "k.A."]:
        wiki_url = row_data[11].strip()
        try:
            wiki_data = self.wiki_scraper.extract_company_data(wiki_url)
        except Exception as e:
            debug_print(f"Fehler beim Laden des vorgeschlagenen Wikipedia-Artikels: {e}")
            article = self.wiki_scraper.search_company_article(company_name, website)
            wiki_data = self.wiki_scraper.extract_company_data(article.url) if article else {
                'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.',
                'mitarbeiter': 'k.A.', 'categories': 'k.A.', 'full_infobox': 'k.A.'
            }
    else:
        article = self.wiki_scraper.search_company_article(company_name, website)
        wiki_data = self.wiki_scraper.extract_company_data(article.url) if article else {
            'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.',
            'mitarbeiter': 'k.A.', 'categories': 'k.A.', 'full_infobox': 'k.A.'
        }
    wiki_values = [
        row_data[11] if len(row_data) > 11 and row_data[11].strip() not in ["", "k.A."] else "k.A.",
        wiki_data.get('url', 'k.A.'),
        wiki_data.get('first_paragraph', 'k.A.'),
        wiki_data.get('branche', 'k.A.'),
        wiki_data.get('umsatz', 'k.A.'),
        wiki_data.get('mitarbeiter', 'k.A.'),
        wiki_data.get('categories', 'k.A.')
    ]
    self.sheet_handler.sheet.update(values=[wiki_values], range_name=f"L{row_num}:R{row_num}")
    crm_branche = row_data[6] if len(row_data) > 6 else "k.A."
    beschreibung = row_data[7] if len(row_data) > 7 else "k.A."
    wiki_branche = wiki_data.get('branche', 'k.A.')
    wiki_kategorien = wiki_data.get('categories', 'k.A.')
    branche_result = evaluate_branche_chatgpt(crm_branche, beschreibung, wiki_branche, wiki_kategorien)
    self.sheet_handler.sheet.update(values=[[branche_result["branch"]]], range_name=f"V{row_num}")
    self.sheet_handler.sheet.update(values=[[branche_result["consistency"]]], range_name=f"W{row_num}")
    self.sheet_handler.sheet.update(values=[[branche_result["justification"]]], range_name=f"X{row_num}")
    crm_data = ";".join(row_data[1:11])
    wiki_data_str = ";".join(row_data[11:18])
    valid_result = validate_article_with_chatgpt(crm_data, wiki_data_str)
    self.sheet_handler.sheet.update(values=[[valid_result]], range_name=f"Y{row_num}")
    self.sheet_handler.sheet.update(values=[[current_dt]], range_name=f"Z{row_num}")
    self.sheet_handler.sheet.update(values=[[Config.VERSION]], range_name=f"AA{row_num}")
    debug_print(f"Zeile {row_num} verifiziert: URL: {wiki_data.get('url', 'k.A.')}, Branche: {wiki_data.get('branche', 'k.A.')}")
    time.sleep(Config.RETRY_DELAY)

# This method is attached to DataProcessor after the class definition
# (see below, following the class body).

# ==================== NEW MODE 8: BATCH PROCESSING WITH TOKEN COUNT ====================
def process_batch_token_count(batch_size=10):
    import tiktoken

    def count_tokens(text, model="gpt-3.5-turbo"):
        encoding = tiktoken.encoding_for_model(model)
        tokens = encoding.encode(text)
        return len(tokens)

    debug_print("Starte Batch-Token-Zählung (Modus 8)...")
    gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
        Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
    sh = gc.open_by_url(Config.SHEET_URL)
    main_sheet = sh.sheet1
    data = main_sheet.get_all_values()
    for i in range(2, len(data)+1, batch_size):
        batch_rows = data[i-1:i-1+batch_size]
        aggregated_prompt = ""
        for row in batch_rows:
            info = []
            if len(row) > 1:
                info.append(row[1])  # Firmenname
            if len(row) > 2:
                info.append(row[2])  # Kurzform
            if len(row) > 3:
                info.append(row[3])  # Website
            if len(row) > 4:
                info.append(row[4])  # Ort
            if len(row) > 5:
                info.append(row[5])  # Beschreibung
            if len(row) > 6:
                info.append(row[6])  # Aktuelle Branche
            aggregated_prompt += "; ".join(info) + "\n"
        token_count = count_tokens(aggregated_prompt)
        debug_print(f"Batch beginnend in Zeile {i}: {token_count} Tokens")
        for j in range(i, min(i+batch_size, len(data)+1)):
            main_sheet.update(values=[[str(token_count)]], range_name=f"AQ{j}")
        time.sleep(Config.RETRY_DELAY)
    debug_print("Batch-Token-Zählung abgeschlossen.")

# ==================== NEW MODE: ALIGNMENT DEMO (main sheet and Contacts) ====================
def alignment_demo_full():
    alignment_demo(GoogleSheetHandler().sheet)
    gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
        Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
    sh = gc.open_by_url(Config.SHEET_URL)
    try:
        contacts_sheet = sh.worksheet("Contacts")
    except gspread.exceptions.WorksheetNotFound:
        contacts_sheet = sh.add_worksheet(title="Contacts", rows="1000", cols="10")
        header = ["Firmenname", "Website", "Kurzform", "Vorname", "Nachname", "Position", "Anrede", "E-Mail"]
        contacts_sheet.update(values=[header], range_name="A1:H1")
        debug_print("Neues Blatt 'Contacts' erstellt und Header eingetragen.")
    alignment_demo(contacts_sheet)
    debug_print("Alignment-Demo für Hauptblatt und Contacts abgeschlossen.")

# ==================== ALIGNMENT DEMO (main sheet) ====================
def alignment_demo(sheet):
    new_headers = [
        "Spalte A (ReEval Flag)", "Spalte B (Firmenname)", "Spalte C (Kurzform des Firmennamens)",
        "Spalte D (Website)", "Spalte E (Ort)", "Spalte F (Beschreibung)",
        "Spalte G (Aktuelle Branche)", "Spalte H (Beschreibung Branche extern)",
        "Spalte I (Anzahl Techniker CRM)", "Spalte J (Umsatz CRM)",
        "Spalte K (Anzahl Mitarbeiter CRM)", "Spalte L (Vorschlag Wiki URL)",
        "Spalte M (Wikipedia URL)", "Spalte N (Wikipedia Absatz)",
        "Spalte O (Wikipedia Branche)", "Spalte P (Wikipedia Umsatz)",
        "Spalte Q (Wikipedia Mitarbeiter)", "Spalte R (Wikipedia Kategorien)",
        "Spalte S (Konsistenzprüfung)", "Spalte T (Begründung bei Inkonsistenz)",
        "Spalte U (Vorschlag Wiki Artikel ChatGPT)", "Spalte V (Begründung bei Abweichung)",
        "Spalte W (Vorschlag neue Branche)", "Spalte X (Konsistenzprüfung Branche)",
        "Spalte Y (Begründung Abweichung Branche)", "Spalte Z (Timestamp Verifizierung)",
        "Spalte AA (Version)"
    ]
    header_range = "A11200:AA11200"
    sheet.update(values=[new_headers], range_name=header_range)
    print("Alignment-Demo abgeschlossen: Neue Spaltenüberschriften in Zeile 11200 geschrieben.")

# ==================== WIKIPEDIA SCRAPER ====================
class WikipediaScraper:
    def __init__(self):
        wikipedia.set_lang(Config.LANG)

    def _get_full_domain(self, website):
        if not website:
            return ""
        website = website.lower().strip()
        website = re.sub(r'^https?:\/\/', '', website)
        website = re.sub(r'^www\.', '', website)
        return website.split('/')[0]

    def _generate_search_terms(self, company_name, website):
        terms = []
        full_domain = self._get_full_domain(website)
        if full_domain:
            terms.append(full_domain)
        normalized_name = normalize_company_name(company_name)
        candidate = " ".join(normalized_name.split()[:2]).strip()
        if candidate and candidate not in terms:
            terms.append(candidate)
        if normalized_name and normalized_name not in terms:
            terms.append(normalized_name)
        debug_print(f"Generierte Suchbegriffe: {terms}")
        return terms

    def _validate_article(self, page, company_name, website):
        full_domain = self._get_full_domain(website)
        domain_found = False
        if full_domain:
            try:
                html_raw = requests.get(page.url).text
                soup = BeautifulSoup(html_raw, Config.HTML_PARSER)
                infobox = soup.find('table', class_=lambda c: c and 'infobox' in c.lower())
                if infobox:
                    links = infobox.find_all('a', href=True)
                    for link in links:
                        href = link.get('href').lower()
                        if href.startswith('/wiki/datei:'):
                            continue
                        if full_domain in href:
                            debug_print(f"Definitiver Link-Match in Infobox gefunden: {href}")
                            domain_found = True
                            break
                if not domain_found and hasattr(page, 'externallinks'):
                    for ext_link in page.externallinks:
                        if full_domain in ext_link.lower():
                            debug_print(f"Definitiver Link-Match in externen Links gefunden: {ext_link}")
                            domain_found = True
                            break
            except Exception as e:
                debug_print(f"Fehler beim Extrahieren von Links: {str(e)}")
        normalized_title = normalize_company_name(page.title)
        normalized_company = normalize_company_name(company_name)
        similarity = SequenceMatcher(None, normalized_title, normalized_company).ratio()
        debug_print(f"Ähnlichkeit (normalisiert): {similarity:.2f} ({normalized_title} vs {normalized_company})")
        # A confirmed domain match lowers the required title similarity
        threshold = 0.60 if domain_found else Config.SIMILARITY_THRESHOLD
        return similarity >= threshold

    def extract_first_paragraph(self, page_url):
        try:
            response = requests.get(page_url)
            soup = BeautifulSoup(response.text, Config.HTML_PARSER)
            paragraphs = soup.find_all('p')
            for p in paragraphs:
                text = clean_text(p.get_text())
                if len(text) > 50:
                    return text
            return "k.A."
        except Exception as e:
            debug_print(f"Fehler beim Extrahieren des ersten Absatzes: {e}")
            return "k.A."

    def extract_categories(self, soup):
        cat_div = soup.find('div', id="mw-normal-catlinks")
        if cat_div:
            ul = cat_div.find('ul')
            if ul:
                cats = [clean_text(li.get_text()) for li in ul.find_all('li')]
                return ", ".join(cats)
        return "k.A."

    def _extract_infobox_value(self, soup, target):
        infobox = soup.find('table', class_=lambda c: c and any(
            kw in c.lower() for kw in ['infobox', 'vcard', 'unternehmen']))
        if not infobox:
            return "k.A."
        keywords_map = {
            'branche': ['branche', 'industrie', 'tätigkeit', 'geschäftsfeld', 'sektor',
                        'produkte', 'leistungen', 'aktivitäten', 'wirtschaftszweig'],
            'umsatz': ['umsatz', 'jahresumsatz', 'konzernumsatz', 'gesamtumsatz', 'erlöse',
                       'umsatzerlöse', 'einnahmen', 'ergebnis', 'jahresergebnis'],
            'mitarbeiter': ['mitarbeiter', 'beschäftigte', 'personal', 'mitarbeiterzahl',
                            'angestellte', 'belegschaft', 'personalstärke']
        }
        keywords = keywords_map.get(target, [])
        for row in infobox.find_all('tr'):
            header = row.find('th')
            if header:
                header_text = clean_text(header.get_text()).lower()
                if any(kw in header_text for kw in keywords):
                    value = row.find('td')
                    if value:
                        raw_value = clean_text(value.get_text())
                        if target == 'branche':
                            clean_val = re.sub(r'\[.*?\]|\(.*?\)', '', raw_value)
                            return ' '.join(clean_val.split()).strip()
                        if target == 'umsatz':
                            return extract_numeric_value(raw_value, is_umsatz=True)
                        if target == 'mitarbeiter':
                            return extract_numeric_value(raw_value, is_umsatz=False)
        return "k.A."

    def extract_full_infobox(self, soup):
        infobox = soup.find('table', class_=lambda c: c and any(
            kw in c.lower() for kw in ['infobox', 'vcard', 'unternehmen']))
        if not infobox:
            return "k.A."
        return clean_text(infobox.get_text(separator=' | '))

    def extract_fields_from_infobox_text(self, infobox_text, field_names):
        result = {}
        tokens = [token.strip() for token in infobox_text.split("|") if token.strip()]
        for i, token in enumerate(tokens):
            for field in field_names:
                if field.lower() in token.lower():
                    j = i + 1
                    while j < len(tokens) and not tokens[j]:
                        j += 1
                    result[field] = tokens[j] if j < len(tokens) else "k.A."
        return result

    def extract_company_data(self, page_url):
        if not page_url:
            return {
                'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.',
                'mitarbeiter': 'k.A.', 'categories': 'k.A.', 'full_infobox': 'k.A.'
            }
        try:
            response = requests.get(page_url)
            soup = BeautifulSoup(response.text, Config.HTML_PARSER)
            full_infobox = self.extract_full_infobox(soup)
            extracted_fields = self.extract_fields_from_infobox_text(
                full_infobox, ['Branche', 'Umsatz', 'Mitarbeiter'])
            raw_branche = extracted_fields.get('Branche', self._extract_infobox_value(soup, 'branche'))
            raw_umsatz = extracted_fields.get('Umsatz', self._extract_infobox_value(soup, 'umsatz'))
            raw_mitarbeiter = extracted_fields.get('Mitarbeiter', self._extract_infobox_value(soup, 'mitarbeiter'))
            umsatz_val = extract_numeric_value(raw_umsatz, is_umsatz=True)
            mitarbeiter_val = extract_numeric_value(raw_mitarbeiter, is_umsatz=False)
            categories_val = self.extract_categories(soup)
            first_paragraph = self.extract_first_paragraph(page_url)
            return {
                'url': page_url,
                'first_paragraph': first_paragraph,
                'branche': raw_branche,
                'umsatz': umsatz_val,
                'mitarbeiter': mitarbeiter_val,
                'categories': categories_val,
                'full_infobox': full_infobox
            }
        except Exception as e:
            debug_print(f"Extraktionsfehler: {str(e)}")
            return {
                'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.',
                'mitarbeiter': 'k.A.', 'categories': 'k.A.', 'full_infobox': 'k.A.'
            }

    @retry_on_failure
    def search_company_article(self, company_name, website):
        search_terms = self._generate_search_terms(company_name, website)
        for term in search_terms:
            try:
                results = wikipedia.search(term, results=Config.WIKIPEDIA_SEARCH_RESULTS)
                debug_print(f"Suchergebnisse für '{term}': {results}")
                for title in results:
                    try:
                        page = wikipedia.page(title, auto_suggest=False)
                        if self._validate_article(page, company_name, website):
                            return page
                    except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError) as e:
                        debug_print(f"Seitenfehler: {str(e)}")
                        continue
            except Exception as e:
                debug_print(f"Suchfehler: {str(e)}")
                continue
        return None

# ==================== GOOGLE SHEET HANDLER (main data) ====================
class GoogleSheetHandler:
    def __init__(self):
        self.sheet = None
        self.sheet_values = []
        self._connect()

    def _connect(self):
        scope = ["https://www.googleapis.com/auth/spreadsheets"]
        creds = ServiceAccountCredentials.from_json_keyfile_name(Config.CREDENTIALS_FILE, scope)
        self.sheet = gspread.authorize(creds).open_by_url(Config.SHEET_URL).sheet1
        self.sheet_values = self.sheet.get_all_values()

    def get_start_index(self):
        # Column AN (index 39) holds the Wikipedia timestamp in regular mode
        filled_n = [row[39] if len(row) > 39 else '' for row in self.sheet_values[1:]]
        return next((i + 1 for i, v in enumerate(filled_n, start=1) if not str(v).strip()), len(filled_n) + 1)

# ==================== DATA PROCESSOR ====================
class DataProcessor:
    def __init__(self):
        self.sheet_handler = GoogleSheetHandler()
        self.wiki_scraper = WikipediaScraper()

    def process_rows(self, num_rows=None):
        if MODE == "2":
            print("Re-Evaluierungsmodus: Verarbeitung aller Zeilen mit 'x' in Spalte A.")
            for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
                if row[0].strip().lower() == "x":
                    self._process_single_row(i, row, force_all=True)
        elif MODE == "3":
            print("Alignment-Demo-Modus: Schreibe neue Spaltenüberschriften in Hauptblatt und Contacts.")
            alignment_demo_full()
        elif MODE == "4":
            # Only rows without a Wikipedia timestamp (column AN, index 39)
            for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
                if len(row) <= 39 or row[39].strip() == "":
                    self._process_single_row(i, row, process_wiki=True, process_chatgpt=False)
        elif MODE == "5":
            # Only rows without a ChatGPT timestamp (column AO, index 40)
            for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
                if len(row) <= 40 or row[40].strip() == "":
                    self._process_single_row(i, row, process_wiki=False, process_chatgpt=True)
        elif MODE == "51":
            # Only rows without a verification result (column Y, index 24)
            for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
                if len(row) <= 24 or row[24].strip() == "":
                    self._process_verification_row(i, row)
        elif MODE == "8":
            process_batch_token_count()
        else:
            start_index = self.sheet_handler.get_start_index()
            print(f"Starte bei Zeile {start_index+1}")
            rows_processed = 0
            for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2):
                if i < start_index:
                    continue
                if num_rows is not None and rows_processed >= num_rows:
                    break
                self._process_single_row(i, row)
                rows_processed += 1

    def _process_single_row(self, row_num, row_data, force_all=False, process_wiki=True, process_chatgpt=True):
        company_name = row_data[1] if len(row_data) > 1 else ""
        website = row_data[3] if len(row_data) > 3 else ""
        wiki_update_range = f"L{row_num}:R{row_num}"
        dt_wiki_range = f"AN{row_num}"
        dt_chat_range = f"AO{row_num}"
        ver_range = f"AP{row_num}"
        print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {row_num}: {company_name}")
        current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        if force_all or process_wiki:
            if len(row_data) <= 39 or row_data[39].strip() == "":
                if len(row_data) > 11 and row_data[11].strip() not in ["", "k.A."]:
                    wiki_url = row_data[11].strip()
                    try:
                        wiki_data = self.wiki_scraper.extract_company_data(wiki_url)
                    except Exception as e:
                        debug_print(f"Fehler beim Laden des vorgeschlagenen Wikipedia-Artikels: {e}")
                        article = self.wiki_scraper.search_company_article(company_name, website)
                        wiki_data = self.wiki_scraper.extract_company_data(article.url) if article else {
                            'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.',
                            'mitarbeiter': 'k.A.', 'categories': 'k.A.', 'full_infobox': 'k.A.'
                        }
                else:
                    article = self.wiki_scraper.search_company_article(company_name, website)
                    wiki_data = self.wiki_scraper.extract_company_data(article.url) if article else {
                        'url': 'k.A.', 'first_paragraph': 'k.A.', 'branche': 'k.A.', 'umsatz': 'k.A.',
                        'mitarbeiter': 'k.A.', 'categories': 'k.A.', 'full_infobox': 'k.A.'
                    }
                wiki_values = [
                    row_data[11] if len(row_data) > 11 and row_data[11].strip() not in ["", "k.A."] else "k.A.",
                    wiki_data.get('url', 'k.A.'),
                    wiki_data.get('first_paragraph', 'k.A.'),
                    wiki_data.get('branche', 'k.A.'),
                    wiki_data.get('umsatz', 'k.A.'),
                    wiki_data.get('mitarbeiter', 'k.A.'),
                    wiki_data.get('categories', 'k.A.')
                ]
                self.sheet_handler.sheet.update(values=[wiki_values], range_name=wiki_update_range)
                self.sheet_handler.sheet.update(values=[[current_dt]], range_name=dt_wiki_range)
            else:
                debug_print(f"Zeile {row_num}: Wikipedia-Timestamp bereits gesetzt – überspringe Wiki-Auswertung.")
        if force_all or process_chatgpt:
            if len(row_data) <= 40 or row_data[40].strip() == "":
                crm_umsatz = row_data[9] if len(row_data) > 9 else "k.A."
                abgleich_result = compare_umsatz_values(
                    crm_umsatz, wiki_data.get('umsatz', 'k.A.') if 'wiki_data' in locals() else "k.A.")
                self.sheet_handler.sheet.update(values=[[abgleich_result]], range_name=f"AG{row_num}")
                crm_data = ";".join(row_data[1:11])
                wiki_data_str = ";".join(row_data[11:18])
                valid_result = validate_article_with_chatgpt(crm_data, wiki_data_str)
                self.sheet_handler.sheet.update(values=[[valid_result]], range_name=f"R{row_num}")
                fsm_result = evaluate_fsm_suitability(company_name, wiki_data if 'wiki_data' in locals() else {})
                self.sheet_handler.sheet.update(values=[[fsm_result["suitability"]]], range_name=f"Y{row_num}")
                self.sheet_handler.sheet.update(values=[[fsm_result["justification"]]], range_name=f"Z{row_num}")
                st_estimate = evaluate_servicetechnicians_estimate(company_name, wiki_data if 'wiki_data' in locals() else {})
                self.sheet_handler.sheet.update(values=[[st_estimate]], range_name=f"AE{row_num}")
                internal_value = row_data[8] if len(row_data) > 8 else "k.A."
                internal_category = map_internal_technicians(internal_value) if internal_value != "k.A." else "k.A."
                if internal_category != "k.A." and st_estimate != internal_category:
                    explanation = evaluate_servicetechnicians_explanation(
                        company_name, st_estimate, wiki_data if 'wiki_data' in locals() else {})
                    discrepancy = explanation
                else:
                    discrepancy = "ok"
                self.sheet_handler.sheet.update(values=[[discrepancy]], range_name=f"AF{row_num}")
                self.sheet_handler.sheet.update(values=[[current_dt]], range_name=dt_chat_range)
            else:
                debug_print(f"Zeile {row_num}: ChatGPT-Timestamp bereits gesetzt – überspringe ChatGPT-Auswertung.")
        # Column AP (ver_range) holds the version stamp; the earlier duplicate write of the
        # timestamp to the same cell was immediately overwritten and has been dropped.
        self.sheet_handler.sheet.update(values=[[Config.VERSION]], range_name=ver_range)
        debug_print(f"✅ Aktualisiert: URL: {(wiki_data.get('url', 'k.A.') if 'wiki_data' in locals() else 'k.A.')}, "
                    f"Branche: {(wiki_data.get('branche', 'k.A.') if 'wiki_data' in locals() else 'k.A.')}, "
                    f"Umsatz-Abgleich: {abgleich_result if 'abgleich_result' in locals() else 'k.A.'}, "
                    f"Validierung: {valid_result if 'valid_result' in locals() else 'k.A.'}, "
                    f"FSM: {fsm_result['suitability'] if 'fsm_result' in locals() else 'k.A.'}, "
                    f"Servicetechniker-Schätzung: {st_estimate if 'st_estimate' in locals() else 'k.A.'}")
        time.sleep(Config.RETRY_DELAY)

# _process_verification_row is attached to DataProcessor here, after the class definition.
DataProcessor._process_verification_row = _process_verification_row

# ==================== NEW MODE 6: CONTACT RESEARCH (via SerpAPI) ====================
def process_contact_research():
    debug_print("Starte Contact Research (Modus 6)...")
    gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
        Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
    sh = gc.open_by_url(Config.SHEET_URL)
    main_sheet = sh.sheet1
    data = main_sheet.get_all_values()
    for i, row in enumerate(data[1:], start=2):
        company_name = row[1] if len(row) > 1 else ""
        search_name = row[2].strip() if len(row) > 2 and row[2].strip() not in ["", "k.A."] else company_name
        website = row[3] if len(row) > 3 else ""
        if not company_name or not website:
            continue
        count_service = count_linkedin_contacts(search_name, website, "Serviceleiter")
        count_it = count_linkedin_contacts(search_name, website, "IT-Leiter")
        count_management = count_linkedin_contacts(search_name, website, "Geschäftsführer")
        count_disponent = count_linkedin_contacts(search_name, website, "Disponent")
        current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        main_sheet.update(values=[[str(count_service)]], range_name=f"AI{i}")
        main_sheet.update(values=[[str(count_it)]], range_name=f"AJ{i}")
        main_sheet.update(values=[[str(count_management)]], range_name=f"AK{i}")
        main_sheet.update(values=[[str(count_disponent)]], range_name=f"AL{i}")
        main_sheet.update(values=[[current_dt]], range_name=f"AM{i}")
        debug_print(f"Zeile {i}: Serviceleiter {count_service}, IT-Leiter {count_it}, "
                    f"Management {count_management}, Disponent {count_disponent} – Contact Search Timestamp gesetzt.")
        time.sleep(Config.RETRY_DELAY * 1.5)
    debug_print("Contact Research abgeschlossen.")

# ==================== NEW MODE 7: CONTACTS (LinkedIn) ====================
def process_contacts():
    debug_print("Starte LinkedIn-Kontaktsuche...")
    gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(
        Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"]))
    sh = gc.open_by_url(Config.SHEET_URL)
    try:
        contacts_sheet = sh.worksheet("Contacts")
    except gspread.exceptions.WorksheetNotFound:
        contacts_sheet = sh.add_worksheet(title="Contacts", rows="1000", cols="10")
        header = ["Firmenname", "Website", "Kurzform", "Vorname", "Nachname", "Position", "Anrede", "E-Mail"]
        contacts_sheet.update(values=[header], range_name="A1:H1")
        debug_print("Neues Blatt 'Contacts' erstellt und Header eingetragen.")
    main_sheet = sh.sheet1
    data = main_sheet.get_all_values()
    positions = ["Serviceleiter", "IT-Leiter", "Leiter After Sales", "Leiter Einsatzplanung"]
    new_rows = []
    for idx, row in enumerate(data[1:], start=2):
        company_name = row[1] if len(row) > 1 else ""
        search_name = row[2].strip() if len(row) > 2 and row[2].strip() not in ["", "k.A."] else company_name
        website = row[3] if len(row) > 3 else ""
        debug_print(f"Verarbeite Firma: '{company_name}' (Zeile {idx}), Website: '{website}'")
        if not company_name or not website:
            debug_print("Überspringe, da Firmenname oder Website fehlt.")
            continue
        for pos in positions:
            debug_print(f"Suche nach Position: '{pos}' bei '{search_name}'")
            contact = search_linkedin_contact(search_name, website, pos)
            if contact:
                debug_print(f"Kontakt gefunden: {contact}")
                new_rows.append([contact["Firmenname"], website, search_name, contact["Vorname"],
                                 contact["Nachname"], contact["Position"], "", ""])
            else:
                debug_print(f"Kein Kontakt für Position '{pos}' bei '{search_name}' gefunden.")
    if new_rows:
        last_row = len(contacts_sheet.get_all_values()) + 1
        range_str = f"A{last_row}:H{last_row + len(new_rows) - 1}"
        contacts_sheet.update(values=new_rows, range_name=range_str)
        debug_print(f"{len(new_rows)} Kontakte in 'Contacts' hinzugefügt.")
    else:
        debug_print("Keine Kontakte gefunden in der Haupttabelle.")

# ==================== MAIN PROGRAM ====================
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", type=str, help="Modus: 1,2,3,4,5,6,7,51 oder 8")
    parser.add_argument("--num_rows", type=int, default=0, help="Anzahl der zu bearbeitenden Zeilen (nur für Modus 1)")
    args = parser.parse_args()
    if not args.mode:
        print("Modi:")
        print("1 = Regulärer Modus")
        print("2 = Re-Evaluierungsmodus (nur Zeilen mit 'x' in Spalte A)")
        print("3 = Alignment-Demo (Header in Hauptblatt und Contacts)")
        print("4 = Nur Wikipedia-Suche (Zeilen ohne Wikipedia-Timestamp)")
        print("5 = Nur ChatGPT-Bewertung (Zeilen ohne ChatGPT-Timestamp)")
        print("6 = Contact Research (via SerpAPI)")
        print("7 = Contacts (LinkedIn)")
        print("8 = Batch-Token-Zählung")
        print("51 = Nur Verifizierung (Wikipedia + Brancheneinordnung)")
        args.mode = input("Wählen Sie den Modus: ").strip()
    MODE = args.mode
    if MODE == "1":
        num_rows = args.num_rows if args.num_rows > 0 else int(input("Wieviele Zeilen sollen überprüft werden? "))
        processor = DataProcessor()
        processor.process_rows(num_rows)
    elif MODE in ["2", "3", "4", "5", "51"]:
        # process_rows dispatches on the global MODE for these modes; this also covers
        # mode 51, which previously called an undefined process_verification_only().
        processor = DataProcessor()
        processor.process_rows()
    elif MODE == "6":
        process_contact_research()
    elif MODE == "7":
        process_contacts()
    elif MODE == "8":
        process_batch_token_count()
    print(f"\n✅ Auswertung abgeschlossen ({Config.VERSION})")