From 0ebfb2c979018880d3626662eb4d969128d891cd Mon Sep 17 00:00:00 2001 From: Floke Date: Wed, 2 Apr 2025 19:02:40 +0000 Subject: [PATCH] v1.3.6: LinkedIn-Contacts via SerpApi, Bugfix Zeilenlimit & Startindex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Neuer Modus (4) zur Suche von LinkedIn-Kontakten via SerpApi, Ausgabe im Tabellenblatt "Contacts" mit den Spalten Firmenname, Website, Vorname, Nachname, Position. Bugfix: Im regulären Modus (1) wird jetzt ab dem letzten Timestamp in Spalte AH nur noch die angegebene Anzahl an Zeilen verarbeitet. Bugfix: Bestehende Datensätze (mit Timestamp in AH) werden nicht überschrieben, es wird ab der ersten leeren Zeile gestartet. Weitere Anpassungen bei den bisherigen Funktionen (FSM, Servicetechniker-Schätzung) bleiben erhalten. --- brancheneinstufung.py | 200 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 162 insertions(+), 38 deletions(-) diff --git a/brancheneinstufung.py b/brancheneinstufung.py index c2ec8e93..dd41aafe 100644 --- a/brancheneinstufung.py +++ b/brancheneinstufung.py @@ -14,7 +14,7 @@ import csv # ==================== KONFIGURATION ==================== class Config: - VERSION = "v1.3.5" # v1.3.5: FSM-Eignungsprüfung & Servicetechniker-Explanation, Sheet-Update-Check, automatische Log-Datei. + VERSION = "v1.3.6" # v1.3.6: LinkedIn-Contacts über SerpApi integriert, Bugfixes zu Zeilenbegrenzung und Startindex. LANG = "de" CREDENTIALS_FILE = "service_account.json" SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo" @@ -185,9 +185,7 @@ def validate_article_with_chatgpt(crm_data, wiki_data): except Exception as e: debug_print(f"Fehler beim Lesen des API-Tokens: {e}") return "k.A." - openai.api_key = api_key - try: response = openai.ChatCompletion.create( model="gpt-3.5-turbo", @@ -201,8 +199,6 @@ def validate_article_with_chatgpt(crm_data, wiki_data): debug_print(f"Fehler beim Validierungs-API-Aufruf: {e}") return "k.A." - -# ==================== NEUE FUNKTION: FSM-EIGNUNGSPRÜFUNG ==================== def evaluate_fsm_suitability(company_name, company_data): try: with open("api_key.txt", "r") as f: @@ -247,8 +243,14 @@ def evaluate_fsm_suitability(company_name, company_data): debug_print(f"Fehler beim Aufruf der ChatGPT API für FSM-Eignungsprüfung: {e}") return {"suitability": "k.A.", "justification": "k.A."} -# ==================== NEUE FUNKTION: SCHÄTZUNG DER ANZAHL SERVICETECHNIKER ==================== def evaluate_servicetechnicians_estimate(company_name, company_data): + try: + with open("serpApiKey.txt", "r") as f: + serp_key = f.read().strip() + except Exception as e: + debug_print(f"Fehler beim Lesen des SerpAPI-Schlüssels (Servicetechniker): {e}") + return "k.A." + # Wir nutzen SerpApi hier nicht, sondern die ChatGPT-Funktion try: with open("api_key.txt", "r") as f: api_key = f.read().strip() @@ -314,7 +316,6 @@ def map_internal_technicians(value): else: return ">500 Techniker" -# ==================== WARTEN BIS ZELLE AKTUALISIERT IST ==================== def wait_for_sheet_update(sheet, cell, expected_value, timeout=5): start_time = time.time() while time.time() - start_time < timeout: @@ -327,6 +328,82 @@ def wait_for_sheet_update(sheet, cell, expected_value, timeout=5): time.sleep(0.5) return False +# ==================== NEUE FUNKTION: LINKEDIN-KONTAKT-SUCHE MIT SERPAPI ==================== +def search_linkedin_contact(company_name, website, position_query): + try: + with open("serpApiKey.txt", "r") as f: + serp_key = f.read().strip() + except Exception as e: + debug_print("Fehler beim Lesen des SerpAPI-Schlüssels: " + str(e)) + return None + query = f'site:linkedin.com/in "{position_query}" "{company_name}"' + params = { + "engine": "google", + "q": query, + "api_key": serp_key, + "hl": "de" + } + try: + response = requests.get("https://serpapi.com/search", params=params) + data = response.json() + if "organic_results" in data and len(data["organic_results"]) > 0: + result = data["organic_results"][0] + title = result.get("title", "") + if "–" in title: + parts = title.split("–") + elif "-" in title: + parts = title.split("-") + else: + parts = [title] + if len(parts) >= 2: + name_part = parts[0].strip() + pos = parts[1].split("|")[0].strip() + name_parts = name_part.split(" ", 1) + if len(name_parts) == 2: + firstname, lastname = name_parts + else: + firstname = name_part + lastname = "" + return {"Firmenname": company_name, "Website": website, "Vorname": firstname, "Nachname": lastname, "Position": pos} + else: + return {"Firmenname": company_name, "Website": website, "Vorname": "", "Nachname": "", "Position": title} + else: + return None + except Exception as e: + debug_print(f"Fehler bei der SerpAPI-Suche: {e}") + return None + +def process_contacts(): + # Öffne das Spreadsheet und erhalte das Arbeitsblatt "Contacts" (erstelle es, falls nicht vorhanden) + gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"])) + sh = gc.open_by_url(Config.SHEET_URL) + try: + contacts_sheet = sh.worksheet("Contacts") + except gspread.exceptions.WorksheetNotFound: + contacts_sheet = sh.add_worksheet(title="Contacts", rows="1000", cols="10") + header = ["Firmenname", "Website", "Vorname", "Nachname", "Position", "Anrede", "E-Mail"] + contacts_sheet.update("A1:G1", [header]) + main_sheet = sh.sheet1 + data = main_sheet.get_all_values() + positions = ["Serviceleiter", "IT-Leiter", "Leiter After Sales", "Leiter Einsatzplanung"] + new_rows = [] + for row in data[1:]: + company_name = row[1] if len(row) > 1 else "" + website = row[2] if len(row) > 2 else "" + if not company_name or not website: + continue + for pos in positions: + contact = search_linkedin_contact(company_name, website, pos) + if contact: + new_rows.append([contact["Firmenname"], contact["Website"], contact["Vorname"], contact["Nachname"], contact["Position"], "", ""]) + if new_rows: + last_row = len(contacts_sheet.get_all_values()) + 1 + range_str = f"A{last_row}:G{last_row + len(new_rows) - 1}" + contacts_sheet.update(range_str, new_rows) + debug_print(f"{len(new_rows)} Kontakte in 'Contacts' hinzugefügt.") + else: + debug_print("Keine Kontakte gefunden.") + # ==================== BRANCHENABGLEICH PER CHATGPT ==================== def load_target_branches(): try: @@ -520,12 +597,12 @@ class WikipediaScraper: debug_print(f"Definitiver Link-Match in Infobox gefunden: {href}") domain_found = True break - if not domain_found and hasattr(page, 'externallinks'): - for ext_link in page.externallinks: - if full_domain in ext_link.lower(): - debug_print(f"Definitiver Link-Match in externen Links gefunden: {ext_link}") - domain_found = True - break + if not domain_found and hasattr(page, 'externallinks'): + for ext_link in page.externallinks: + if full_domain in ext_link.lower(): + debug_print(f"Definitiver Link-Match in externen Links gefunden: {ext_link}") + domain_found = True + break except Exception as e: debug_print(f"Fehler beim Extrahieren von Links: {str(e)}") normalized_title = normalize_company_name(page.title) @@ -668,20 +745,23 @@ class DataProcessor: def process_rows(self, num_rows=None): if MODE == "2": print("Re-Evaluierungsmodus: Verarbeitung aller Zeilen mit 'x' in Spalte A.") + for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2): + if row[0].strip().lower() == "x": + self._process_single_row(i, row) elif MODE == "3": print("Alignment-Demo-Modus: Schreibe neue Spaltenüberschriften in Zeile 11200.") alignment_demo(self.sheet_handler.sheet) - return else: start_index = self.sheet_handler.get_start_index() print(f"Starte bei Zeile {start_index+1}") - for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2): - if MODE == "2": - if row[0].strip().lower() == "x": - self._process_single_row(i, row) - else: - if i >= self.sheet_handler.get_start_index(): - self._process_single_row(i, row) + rows_processed = 0 + for i, row in enumerate(self.sheet_handler.sheet_values[1:], start=2): + if i < start_index: + continue + if num_rows is not None and rows_processed >= num_rows: + break + self._process_single_row(i, row) + rows_processed += 1 def _process_single_row(self, row_num, row_data): company_name = row_data[1] if len(row_data) > 1 else "" website = row_data[2] if len(row_data) > 2 else "" @@ -693,7 +773,6 @@ class DataProcessor: ver_range = f"AI{row_num}" print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {row_num}: {company_name}") - # Prüfen: Wikipedia-Vorschlag in Spalte K? if len(row_data) > 10 and row_data[10].strip() not in ["", "k.A."]: wiki_url = row_data[10].strip() try: @@ -727,21 +806,17 @@ class DataProcessor: wait_for_sheet_update(self.sheet_handler.sheet, f"K{row_num}", wiki_values[0]) time.sleep(3) - # Umsatz-Schätzung: Spalte AF soll "XX" erhalten self.sheet_handler.sheet.update(values=[["XX"]], range_name=chatgpt_range) - # Umsatz-Abgleich (Spalte AG) crm_umsatz = row_data[8] if len(row_data) > 8 else "k.A." abgleich_result = compare_umsatz_values(crm_umsatz, company_data.get('umsatz', 'k.A.')) self.sheet_handler.sheet.update(values=[[abgleich_result]], range_name=abgleich_range) - # Validierung crm_data = ";".join(row_data[1:10]) wiki_data = ";".join(row_data[11:17]) valid_result = validate_article_with_chatgpt(crm_data, wiki_data) self.sheet_handler.sheet.update(values=[[valid_result]], range_name=valid_range) - # Branchenabgleich crm_branche = row_data[5] if len(row_data) > 5 else "k.A." beschreibung_branche = row_data[6] if len(row_data) > 6 else "k.A." wiki_branche = company_data.get('branche', 'k.A.') @@ -754,12 +829,10 @@ class DataProcessor: self.sheet_handler.sheet.update(values=[[branche_result["consistency"]]], range_name=branche_w_range) self.sheet_handler.sheet.update(values=[[branche_result["justification"]]], range_name=branche_x_range) - # FSM-Eignungsprüfung (Spalte Y/Z) fsm_result = evaluate_fsm_suitability(company_name, company_data) self.sheet_handler.sheet.update(values=[[fsm_result["suitability"]]], range_name=f"Y{row_num}") self.sheet_handler.sheet.update(values=[[fsm_result["justification"]]], range_name=f"Z{row_num}") - # Servicetechniker-Schätzung (Spalte AD) und Vergleich (Spalte AE) st_estimate = evaluate_servicetechnicians_estimate(company_name, company_data) self.sheet_handler.sheet.update(values=[[st_estimate]], range_name=f"AD{row_num}") internal_value = row_data[7] if len(row_data) > 7 else "k.A." @@ -771,27 +844,75 @@ class DataProcessor: discrepancy = "ok" self.sheet_handler.sheet.update(values=[[discrepancy]], range_name=f"AE{row_num}") - # Spalten AF und AG: "XX" self.sheet_handler.sheet.update(values=[["XX"]], range_name="AF" + str(row_num)) self.sheet_handler.sheet.update(values=[["XX"]], range_name="AG" + str(row_num)) - # Timestamp und Version current_dt = datetime.now().strftime("%Y-%m-%d %H:%M:%S") self.sheet_handler.sheet.update(values=[[current_dt]], range_name=dt_range) self.sheet_handler.sheet.update(values=[[Config.VERSION]], range_name=ver_range) debug_print(f"✅ Aktualisiert: URL: {company_data.get('url', 'k.A.')}, " - f"Branche: {company_data.get('branche', 'k.A.')}, Umsatz-Abgleich: {abgleich_result}, " - f"Validierung: {valid_result}, Branchenvorschlag: {branche_result['branch']}, " - f"FSM: {fsm_result['suitability']}, Servicetechniker-Schätzung: {st_estimate}") + f"Branche: {company_data.get('branche', 'k.A.')}, Umsatz-Abgleich: {abgleich_result}, " + f"Validierung: {valid_result}, Branchenvorschlag: {branche_result['branch']}, " + f"FSM: {fsm_result['suitability']}, Servicetechniker-Schätzung: {st_estimate}") time.sleep(Config.RETRY_DELAY) +# ==================== GOOGLE SHEET HANDLER (für Hauptdaten) ==================== +class GoogleSheetHandler: + def __init__(self): + self.sheet = None + self.sheet_values = [] + self._connect() + def _connect(self): + scope = ["https://www.googleapis.com/auth/spreadsheets"] + creds = ServiceAccountCredentials.from_json_keyfile_name(Config.CREDENTIALS_FILE, scope) + self.sheet = gspread.authorize(creds).open_by_url(Config.SHEET_URL).sheet1 + self.sheet_values = self.sheet.get_all_values() + def get_start_index(self): + filled_n = [row[13] if len(row) > 13 else '' for row in self.sheet_values[1:]] + return next((i + 1 for i, v in enumerate(filled_n, start=1) if not str(v).strip()), len(filled_n) + 1) + +# ==================== NEUER MODUS: CONTACTS ==================== +def process_contacts(): + gc = gspread.authorize(ServiceAccountCredentials.from_json_keyfile_name(Config.CREDENTIALS_FILE, ["https://www.googleapis.com/auth/spreadsheets"])) + sh = gc.open_by_url(Config.SHEET_URL) + try: + contacts_sheet = sh.worksheet("Contacts") + except gspread.exceptions.WorksheetNotFound: + contacts_sheet = sh.add_worksheet(title="Contacts", rows="1000", cols="10") + header = ["Firmenname", "Website", "Vorname", "Nachname", "Position", "Anrede", "E-Mail"] + contacts_sheet.update("A1:G1", [header]) + main_sheet = sh.sheet1 + data = main_sheet.get_all_values() + positions = ["Serviceleiter", "IT-Leiter", "Leiter After Sales", "Leiter Einsatzplanung"] + new_rows = [] + for row in data[1:]: + company_name = row[1] if len(row) > 1 else "" + website = row[2] if len(row) > 2 else "" + if not company_name or not website: + continue + for pos in positions: + contact = search_linkedin_contact(company_name, website, pos) + if contact: + new_rows.append([contact["Firmenname"], contact["Website"], contact["Vorname"], contact["Nachname"], contact["Position"], "", ""]) + if new_rows: + last_row = len(contacts_sheet.get_all_values()) + 1 + range_str = f"A{last_row}:G{last_row + len(new_rows) - 1}" + contacts_sheet.update(range_str, new_rows) + debug_print(f"{len(new_rows)} Kontakte in 'Contacts' hinzugefügt.") + else: + debug_print("Keine Kontakte gefunden.") + +# ==================== MAIN PROGRAMM ==================== if __name__ == "__main__": - mode_input = input("Wählen Sie den Modus: 1 für normalen Modus, 2 für Re-Evaluierungsmodus, 3 für Alignment-Demo: ").strip() + print("Modi: 1 = regulärer Modus, 2 = Re-Evaluierungsmodus, 3 = Alignment-Demo, 4 = LinkedIn Contacts") + mode_input = input("Wählen Sie den Modus: ").strip() if mode_input == "2": MODE = "2" elif mode_input == "3": MODE = "3" + elif mode_input == "4": + MODE = "4" else: MODE = "1" if MODE == "1": @@ -800,8 +921,11 @@ if __name__ == "__main__": except Exception as e: print("Ungültige Eingabe. Bitte eine Zahl eingeben.") exit(1) - else: - num_rows = None - processor = DataProcessor() - processor.process_rows(num_rows) + processor = DataProcessor() + processor.process_rows(num_rows) + elif MODE in ["2", "3"]: + processor = DataProcessor() + processor.process_rows() + elif MODE == "4": + process_contacts() print(f"\n✅ Wikipedia-Auswertung abgeschlossen ({Config.VERSION})")