From e7c2d7c612f2322d50d3d26b0e18cd3bf6a41974 Mon Sep 17 00:00:00 2001
From: Floke
Date: Fri, 18 Apr 2025 16:53:40 +0000
Subject: [PATCH] bugfix

---
 brancheneinstufung.py | 300 ++++++++++++++++++++++++++++--------------
 1 file changed, 202 insertions(+), 98 deletions(-)

diff --git a/brancheneinstufung.py b/brancheneinstufung.py
index 6706bbed..c50e584a 100644
--- a/brancheneinstufung.py
+++ b/brancheneinstufung.py
@@ -1512,7 +1512,9 @@ def process_contact_research(sheet_handler): # unverändert
         time.sleep(Config.RETRY_DELAY)
     debug_print("Contact Research abgeschlossen.")
 
-def alignment_demo(sheet): # unverändert
+# ==================== ALIGNMENT DEMO (Hauptblatt) ====================
+def alignment_demo(sheet):
+    """Schreibt die Header-Struktur (Zeilen 1-5, jetzt bis Spalte AX) ins angegebene Sheet."""
     new_headers = [ # Spalten A bis AX
         ["ReEval Flag", "CRM Name", "CRM Kurzform", "CRM Website", "CRM Ort", "CRM Beschreibung", "CRM Branche", "CRM Beschreibung Branche extern", "CRM Anzahl Techniker", "CRM Umsatz", "CRM Anzahl Mitarbeiter", "CRM Vorschlag Wiki URL", "Wiki URL", "Wiki Absatz", "Wiki Branche", "Wiki Umsatz", "Wiki Mitarbeiter", "Wiki Kategorien", "Chat Wiki Konsistenzprüfung", "Chat Begründung Wiki Inkonsistenz", "Chat Vorschlag Wiki Artikel", "Begründung bei Abweichung", "Chat Vorschlag Branche", "Chat Konsistenz Branche", "Chat Begründung Abweichung Branche", "Chat Prüfung FSM Relevanz", "Chat Begründung für FSM Relevanz", "Chat Schätzung Anzahl Mitarbeiter", "Chat Konsistenzprüfung Mitarbeiterzahl", "Chat Begründung Abweichung Mitarbeiterzahl", "Chat Einschätzung Anzahl Servicetechniker", "Chat Begründung Abweichung Anzahl Servicetechniker", "Chat Schätzung Umsatz", "Chat Begründung Abweichung Umsatz", "Linked Serviceleiter gefunden", "Linked It-Leiter gefunden", "Linked Management gefunden", "Linked Disponent gefunden", "Contact Search Timestamp", "Wikipedia Timestamp", "Timestamp letzte Prüfung", "Version", "Tokens", "Website Rohtext", "Website Zusammenfassung", "Website Scrape Timestamp", "Geschätzter Techniker Bucket", "Finaler Umsatz (Wiki>CRM)", "Finaler Mitarbeiter (Wiki>CRM)", "Wiki Verif. Timestamp"],
         ["CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "CRM", "Wikipediascraper", "Wikipediascraper", "Wikipediascraper", "Wikipediascraper", "Wikipediascraper", "Wikipediascraper", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "Chat GPT API", "LinkedIn (via SerpApi)", "LinkedIn (via SerpApi)", "LinkedIn (via SerpApi)", "LinkedIn (via SerpApi)", "System", "System", "System", "System", "System", "Web Scraper", "Chat GPT API", "System", "ML Modell / Skript", "Skript (Wiki/CRM)", "Skript (Wiki/CRM)", "System"],
@@ -1521,12 +1523,26 @@ def alignment_demo(sheet): # unverändert
         ["Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Datenquelle", "Wird durch Wikipedia Scraper bereitgestellt", "Wird zunächst nicht verwendet...", "Wird u.a. zur finalen Ermittlung...", "Wird u.a. mit CRM-Umsatz...", "Wird u.a. mit CRM-Anzahl...", "Wenn Website-Daten fehlen...", "\"Es soll durch ChatGPT geprüft werden...", "\"Liegt eine Inkonsistenz...", "\"Sollte durch die Wikipedia-Suche...", "XXX derzeit nicht verwendet...", "\"ChatGPT soll anhand der vorliegenden...", "Die in Spalte CRM festgelegte...", "Weicht die von ChatGPT ermittelte...", "ChatGPT soll anhand der vorliegenden Daten prüfen...", "Die in 'Chat Begründung für FSM Relevanz'...", "Nur wenn kein Wikipedia-Eintrag...", "Entspricht die durch ChatGPT ermittelte...", "Weicht die von ChatGPT geschätzte...", "ChatGPT soll auf Basis öffentlich...", "Weicht die von ChatGPT geschätzte...", "Nur wenn kein Wikipedia-Eintrag...", "ChatGPT soll signifikante Umsatzabweichungen...", "Über SerpAPI wird zusammen...", "Über SerpAPI wird zusammen...", "Über SerpAPI wird zusammen...", "Über SerpAPI wird zusammen...", "Wenn die Kontaktsuche gestartet wird...", "Wenn die Wikipedia-Suche gestartet wird...", "Wenn die ChatGPT-Bewertung gestartet wird...", "Wird durch das System befüllt", "Wird durch tiktoken berechnet", "Wird durch Web Scraper...", "Wird durch ChatGPT API...", "Timestamp wird gesetzt, wenn Website Rohtext/Zusammenfassung geschrieben werden.", "Ergebnis der Schätzung durch das trainierte ML-Modell.", "Vom Skript berechneter Wert, priorisiert Wiki > CRM...", "Vom Skript berechneter Wert, priorisiert Wiki > CRM...", "Timestamp wird gesetzt, wenn Wiki-Verifikation (S-Y) durchgeführt wurde."]
     ]
     num_cols = len(new_headers[0])
-    def colnum_string(n): string = ""; while n > 0: n, remainder = divmod(n - 1, 26); string = chr(65 + remainder) + string; return string
-    end_col_letter = colnum_string(num_cols); header_range = f"A1:{end_col_letter}{len(new_headers)}"
-    try: sheet.update(values=new_headers, range_name=header_range); print(f"Alignment-Demo: Header {header_range} OK."); debug_print(f"Alignment-Demo: Header {header_range} geschrieben.")
-    except Exception as e: print(f"FEHLER Alignment-Demo Header: {e}"); debug_print(f"FEHLER Alignment-Demo Header: {e}")
-# --- DataProcessor Klasse (unverändert außer prepare_data Methode) ---
+    # --- KORRIGIERTE Innere Funktion ---
+    def colnum_string(n):
+        string = ""
+        while n > 0:
+            n, remainder = divmod(n - 1, 26)
+            string = chr(65 + remainder) + string
+        return string
+    # --- ENDE KORRIGIERTE Innere Funktion ---
+
+    end_col_letter = colnum_string(num_cols)
+    header_range = f"A1:{end_col_letter}{len(new_headers)}"
+    try:
+        sheet.update(values=new_headers, range_name=header_range)
+        print(f"Alignment-Demo: Header in Bereich {header_range} geschrieben.")
+        debug_print(f"Alignment-Demo: Header in Bereich {header_range} geschrieben.")
+    except Exception as e:
+        print(f"FEHLER beim Schreiben der Alignment-Demo Header: {e}")
+        debug_print(f"FEHLER beim Schreiben der Alignment-Demo Header: {e}")
+
+# --- DataProcessor Klasse (Rest der Implementierung) ---
 
 class DataProcessor:
     """
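The corrected inner function replaces a one-line def that was a SyntaxError (a while loop cannot follow the def header on the same line), which is presumably the "bugfix" of the commit subject. A standalone sanity check of the column-letter logic, same code as in the patch, assertions added here for review:

    def colnum_string(n):
        string = ""
        while n > 0:
            n, remainder = divmod(n - 1, 26)
            string = chr(65 + remainder) + string
        return string

    assert colnum_string(1) == "A"
    assert colnum_string(26) == "Z"
    assert colnum_string(27) == "AA"
    assert colnum_string(50) == "AX"  # 50 header columns -> range A1:AX5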
@@ -1549,7 +1565,6 @@ class DataProcessor:
         """
         Verarbeitet die Daten für eine einzelne Zeile, prüft Timestamps für jeden Teilbereich
         und stellt sicher, dass aktuelle Wiki-Daten für Branch-Eval verwendet werden.
-        (Implementierung aus v1.6.5 angenommen)
         """
         debug_print(f"--- Starte Verarbeitung Zeile {row_num_in_sheet} ---")
         updates = []; now_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S"); any_processing_done = False
@@ -1570,7 +1585,8 @@ class DataProcessor:
             if not website_url or website_url.strip().lower() == "k.a.":
                 new_website = serp_website_lookup(company_name)
                 if new_website != "k.A.": website_url = new_website;
-            if website_url != original_website: updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["CRM Website"] + 1)}{row_num_in_sheet}', 'values': [[website_url]]}) # Use helper
+            # Use helper function to get column letter
+            if website_url != original_website: updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["CRM Website"] + 1)}{row_num_in_sheet}', 'values': [[website_url]]})
             if website_url and website_url.strip().lower() != "k.a.":
                 new_website_raw = get_website_raw(website_url); new_website_summary = summarize_website_content(new_website_raw)
                 if new_website_raw != website_raw: updates.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Rohtext"] + 1)}{row_num_in_sheet}', 'values': [[new_website_raw]]}); website_raw = new_website_raw
@@ -1598,12 +1614,12 @@ class DataProcessor:
             if valid_crm_wiki_url:
                 debug_print(f" -> Prüfe CRM Vorschlag L: {valid_crm_wiki_url}")
                 try: # Use try-except for page loading
-                    # Preload=True can sometimes fail on redirects or special pages
-                    page = wikipedia.page(valid_crm_wiki_url.split('/')[-1].replace('_', ' '), auto_suggest=False, preload=False)
-                    # Attempt to load content after getting page object
-                    _ = page.content # Access content to trigger load
+                    # Get page title from URL for wikipedia.page()
+                    page_title = unquote(valid_crm_wiki_url.split('/wiki/', 1)[-1]).replace('_', ' ')
+                    page = wikipedia.page(page_title, auto_suggest=False, preload=False) # Use preload=False initially
+                    _ = page.content # Access content to trigger load, may raise exception
                 except Exception as page_load_error:
-                    debug_print(f" -> Fehler beim Laden der Seite für CRM Vorschlag: {page_load_error}")
+                    debug_print(f" -> Fehler beim Laden der Seite für CRM Vorschlag '{valid_crm_wiki_url}': {page_load_error}")
                     page = None
                 if page and self.wiki_scraper._validate_article(page, company_name, current_website_for_validation): article_page = page
                 else: debug_print(f" -> CRM Vorschlag L nicht validiert. Starte Suche..."); article_page = self.wiki_scraper.search_company_article(company_name, current_website_for_validation)
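The new lookup derives the article title from the stored URL instead of splitting on every '/' (the removed variant broke on titles containing a slash and left percent-encoding in place). A standalone sketch of the title extraction, using a hypothetical URL:

    from urllib.parse import unquote

    url = "https://de.wikipedia.org/wiki/K%C3%A4rcher_(Unternehmen)"
    page_title = unquote(url.split('/wiki/', 1)[-1]).replace('_', ' ')
    print(page_title)  # Kärcher (Unternehmen)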
@@ -1652,7 +1668,6 @@ class DataProcessor:
         debug_print(f"--- Verarbeitung Zeile {row_num_in_sheet} abgeschlossen ---")
         time.sleep(0.05) # Minimale Pause
 
-
     def process_rows_sequentially(self, start_row_index, num_rows_to_process, process_wiki=True, process_chatgpt=True, process_website=True): # unverändert
         data_rows = self.sheet_handler.get_data(); header_rows = Config.HEADER_ROWS
         if start_row_index >= len(data_rows): debug_print("Startindex hinter Datenende."); return
@@ -1662,7 +1677,11 @@ class DataProcessor:
         for i in range(start_row_index, end_row_index):
             if i >= len(data_rows): debug_print(f"WARNUNG: Index {i} > Datenlänge ({len(data_rows)})."); break
             row_data = data_rows[i]; row_num_in_sheet = i + header_rows + 1
-            self._process_single_row(row_num_in_sheet, row_data, process_wiki, process_chatgpt, process_website)
+            try: # Add try-except around single row processing
+                self._process_single_row(row_num_in_sheet, row_data, process_wiki, process_chatgpt, process_website)
+            except Exception as e:
+                debug_print(f"!! FEHLER in _process_single_row für Zeile {row_num_in_sheet}: {e}")
+                debug_print(traceback.format_exc()) # Print traceback for detailed error info
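The loop now isolates failures per row: a failing row is logged with its full stack trace and the batch continues. A minimal sketch of the same pattern (toy data, stdlib only):

    import traceback

    def process_row(row):
        return 1 / row  # fails for row == 0

    for i, row in enumerate([1, 0, 2]):
        try:
            process_row(row)
        except Exception as e:
            # Log and keep going instead of aborting the whole batch
            print(f"Row {i} failed: {e}")
            print(traceback.format_exc())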
 
     def process_reevaluation_rows(self, row_limit=None, clear_flag=True): # unverändert
         debug_print(f"Starte Re-Eval Modus (A = 'x'). Max: {row_limit if row_limit is not None else 'Alle'}")
@@ -1682,14 +1701,19 @@ class DataProcessor:
             if row_limit is not None and processed_count >= row_limit: debug_print(f"Limit ({row_limit}) erreicht."); break
             row_num = task['row_num']; row_data = task['data']; debug_print(f"--- Re-Evaluiere Z{row_num} ---")
             try:
+                # Ensure all processes run for re-evaluation
                 self._process_single_row(row_num, row_data, process_wiki=True, process_chatgpt=True, process_website=True)
                 processed_count += 1
                 if clear_flag:
                     flag_col_letter = self.sheet_handler._get_col_letter(reeval_col_idx + 1)
                     updates_clear_flag.append({'range': f'{flag_col_letter}{row_num}', 'values': [['']]})
-            except Exception as e_proc: debug_print(f"FEHLER Re-Eval Z{row_num}: {e_proc}")
+            except Exception as e_proc:
+                debug_print(f"FEHLER Re-Eval Z{row_num}: {e_proc}")
+                debug_print(traceback.format_exc()) # Print traceback
+                # Do not clear flag on error to allow retry
+
         if clear_flag and updates_clear_flag:
-            debug_print(f"Lösche ReEval-Flags für {len(updates_clear_flag)} Zeilen...")
+            debug_print(f"Lösche ReEval-Flags für {len(updates_clear_flag)} erfolgreich verarbeitete Zeilen...")
             success = self.sheet_handler.batch_update_cells(updates_clear_flag)
             if not success: debug_print("FEHLER Löschen ReEval-Flags.")
         debug_print(f"Re-Eval beendet. {processed_count} verarbeitet (Limit: {row_limit}).")
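Cell writes throughout the script are accumulated as range/values dicts and flushed through the script's batch_update_cells helper; its implementation is not part of this patch, but it is assumed to forward the list to the Sheets API in a single batch call rather than one write per cell. The payload shape, illustrated with made-up cells:

    # Hypothetical payload; 'values' is a list of rows, each row a list of cells
    row_num = 12
    updates = [
        {'range': f'A{row_num}', 'values': [['']]},                     # clear ReEval flag
        {'range': f'D{row_num}', 'values': [['https://example.com']]},  # write website URL
    ]
    # sheet_handler.batch_update_cells(updates)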
@@ -1697,26 +1721,49 @@ class DataProcessor:
     def process_website_details_for_marked_rows(self): # unverändert
         debug_print("Starte Modus 23: Website Detail Extraction (A='x').")
         data_rows = self.sheet_handler.get_data(); header_rows = Config.HEADER_ROWS; rows_processed = 0
-        reeval_col_idx = COLUMN_MAP.get("ReEval Flag"); website_col_idx = COLUMN_MAP.get("CRM Website"); details_col = "AR" # Assume AR for now
-        if reeval_col_idx is None or website_col_idx is None: debug_print("FEHLER: Spalten Modus 23 fehlen."); return
+        reeval_col_idx = COLUMN_MAP.get("ReEval Flag"); website_col_idx = COLUMN_MAP.get("CRM Website")
+        # Decide where to write details. AR (43) is Rohtext. Maybe new column needed? Using AR for now.
+        details_col_key = "Website Rohtext"
+        details_col_idx = COLUMN_MAP.get(details_col_key)
+
+        if reeval_col_idx is None or website_col_idx is None or details_col_idx is None:
+            debug_print(f"FEHLER: Benötigte Spalten für Modus 23 nicht in COLUMN_MAP gefunden (ReEval, CRM Website, {details_col_key}).")
+            return
+
+        details_col_letter = self.sheet_handler._get_col_letter(details_col_idx + 1)
+
         for i, row in enumerate(data_rows):
             row_num_in_sheet = i + header_rows + 1
             if len(row) > reeval_col_idx and row[reeval_col_idx].strip().lower() == "x":
                 website_url = row[website_col_idx] if len(row) > website_col_idx else ""
                 if not website_url or website_url.strip().lower() == "k.a.": debug_print(f"Z{row_num_in_sheet}: Keine Website (D), skip."); continue
                 debug_print(f"Z{row_num_in_sheet}: Extrahiere Details von {website_url}...")
-                # Annahme: Funktion scrape_website_details existiert und gibt String zurück
-                # details = scrape_website_details(website_url)
-                details = "Platzhalter: Details für " + website_url # Platzhalter
-                details_col_letter = self.sheet_handler._get_col_letter(COLUMN_MAP.get(details_col, 43) + 1) # Get letter for AR (index 43)
+                # Define or import scrape_website_details function
+                # def scrape_website_details(url): return f"Details placeholder for {url}"
+                try:
+                    # Placeholder for the actual detail scraping function
+                    details = f"Details placeholder for {website_url}" # scrape_website_details(website_url)
+                except Exception as e_detail:
+                    debug_print(f"Fehler beim Extrahieren der Details für {website_url}: {e_detail}")
+                    details = "k.A. (Detail Extraktion Fehler)"
+
                 update_data = [{'range': f'{details_col_letter}{row_num_in_sheet}', 'values': [[details]]}]
+                # Optionally set a timestamp (e.g., in AT if it's related)
+                # ts_col_letter = self.sheet_handler._get_col_letter(COLUMN_MAP["Website Scrape Timestamp"] + 1)
+                # update_data.append({'range': f'{ts_col_letter}{row_num_in_sheet}', 'values': [[datetime.now().strftime("%Y-%m-%d %H:%M:%S")]]})
+
                 self.sheet_handler.batch_update_cells(update_data)
-                debug_print(f"Z{row_num_in_sheet}: Details in {details_col} geschrieben.")
-                rows_processed += 1; time.sleep(Config.RETRY_DELAY)
+                debug_print(f"Z{row_num_in_sheet}: Details in {details_col_letter} geschrieben.")
+                rows_processed += 1; time.sleep(Config.RETRY_DELAY) # Pause between detail scrapes if needed
         debug_print(f"Modus 23 beendet. {rows_processed} verarbeitet.")
 
     def process_serp_website_lookup_for_empty(self): # unverändert
         debug_print("Starte Modus 22: SERP Website Lookup (D leer).")
+        # Ensure data is loaded before accessing get_data
+        if not self.sheet_handler.load_data():
+            debug_print("FEHLER: Laden der Daten für Modus 22 fehlgeschlagen.")
+            return
+
         data_rows = self.sheet_handler.get_data(); header_rows = Config.HEADER_ROWS; rows_processed = 0
         website_col_idx = COLUMN_MAP.get("CRM Website"); name_col_idx = COLUMN_MAP.get("CRM Name")
         if website_col_idx is None or name_col_idx is None: debug_print("FEHLER: Spalten Modus 22 fehlen."); return
@@ -1724,14 +1771,20 @@ class DataProcessor:
         for i, row in enumerate(data_rows):
             row_num_in_sheet = i + header_rows + 1
-            current_website = row[website_col_idx] if len(row) > website_col_idx else ""
+            current_website = ""
+            # Check if row has enough columns before accessing index
+            if len(row) > website_col_idx:
+                current_website = row[website_col_idx]
+
             if not current_website or current_website.strip().lower() == "k.a.":
-                company_name = row[name_col_idx] if len(row) > name_col_idx else ""
+                company_name = ""
+                if len(row) > name_col_idx:
+                    company_name = row[name_col_idx]
+
                 if not company_name: debug_print(f"Z{row_num_in_sheet}: Skip (kein Firmenname)."); continue
                 debug_print(f"Z{row_num_in_sheet}: Suche Website für '{company_name}'...")
-                new_website = serp_website_lookup(company_name) # Annahme: Funktion existiert
+                new_website = serp_website_lookup(company_name) # Assumes serp_website_lookup is defined and works
                 if new_website != "k.A.":
                     update_data = [{'range': f'{website_col_letter}{row_num_in_sheet}', 'values': [[new_website]]}]
                     self.sheet_handler.batch_update_cells(update_data)
@@ -1739,11 +1792,11 @@ class DataProcessor:
                     rows_processed += 1
                 else:
                     debug_print(f"Z{row_num_in_sheet}: Keine Website gefunden.")
-                    # Optional: Markieren, dass Suche fehlgeschlagen ist?
+                    # Optional: Mark failure explicitly
                     # update_data = [{'range': f'{website_col_letter}{row_num_in_sheet}', 'values': [['k.A. (SERP failed)']]}]
                     # self.sheet_handler.batch_update_cells(update_data)
-                time.sleep(Config.RETRY_DELAY) # Pause zwischen SERP API Calls
+                time.sleep(Config.RETRY_DELAY) # Pause between SERP API Calls
         debug_print(f"Modus 22 beendet. {rows_processed} Websites ergänzt.")
@@ -1752,55 +1805,76 @@ class DataProcessor:
     def prepare_data_for_modeling(self):
         """
         Lädt Daten aus dem Google Sheet über den sheet_handler,
-        bereitet sie für das Decision Tree Modell vor. (Implementierung aus v1.6.4 angenommen)
+        bereitet sie für das Decision Tree Modell vor. (Implementierung aus v1.6.5)
         """
         debug_print("Starte Datenvorbereitung für Modellierung...")
         try:
             # --- 1. Daten laden & Spalten auswählen ---
             if not self.sheet_handler or not self.sheet_handler.sheet_values:
-                debug_print("Fehler: Sheet Handler nicht initialisiert oder keine Daten geladen.")
-                return None
+                # Attempt to load data if not already loaded (guard against missing handler)
+                if not self.sheet_handler or not self.sheet_handler.load_data():
+                    debug_print("Fehler: Sheet Handler nicht initialisiert oder Daten konnten nicht geladen werden.")
+                    return None
+                # Check again after loading
+                if not self.sheet_handler.sheet_values:
+                    debug_print("Fehler: Keine Daten nach erneutem Laden.")
+                    return None
 
-            all_data = self.sheet_handler.sheet_values
+            all_data = self.sheet_handler.sheet_values # Use the loaded data
             if len(all_data) <= Config.HEADER_ROWS: # Verwende Config.HEADER_ROWS
-                debug_print("Fehler: Nicht genügend Datenzeilen im Sheet gefunden.")
+                debug_print(f"Fehler: Nicht genügend Datenzeilen ({len(all_data)}) im Sheet gefunden (benötigt > {Config.HEADER_ROWS}).")
                 return None
 
             headers = all_data[0]
             data_rows = all_data[Config.HEADER_ROWS:] # Verwende Config.HEADER_ROWS
+            # Check if headers is a list and not empty
+            if not isinstance(headers, list) or not headers:
+                debug_print("FEHLER: Header-Zeile ist ungültig oder leer.")
+                return None
+
             df = pd.DataFrame(data_rows, columns=headers)
             debug_print(f"DataFrame erstellt mit {len(df)} Zeilen und {len(df.columns)} Spalten.")
 
             # Finde die tatsächlichen Spaltennamen anhand der COLUMN_MAP
             col_indices = {}
             tech_col_key = "CRM Anzahl Techniker" # <- ANPASSEN, FALLS NÖTIG
-            try:
-                col_indices = {
-                    "name": all_data[0][COLUMN_MAP["CRM Name"]],
-                    "branche": all_data[0][COLUMN_MAP["CRM Branche"]],
-                    "umsatz_crm": all_data[0][COLUMN_MAP["CRM Umsatz"]],
-                    "umsatz_wiki": all_data[0][COLUMN_MAP["Wiki Umsatz"]],
-                    "ma_crm": all_data[0][COLUMN_MAP["CRM Anzahl Mitarbeiter"]],
-                    "ma_wiki": all_data[0][COLUMN_MAP["Wiki Mitarbeiter"]],
-                    "techniker": all_data[0][COLUMN_MAP[tech_col_key]]
-                }
-                cols_to_select = list(col_indices.values())
-            except KeyError as e:
-                debug_print(f"FEHLER: Konnte Mapping für Schlüssel '{e}' in COLUMN_MAP nicht finden oder Spalte nicht im Header.")
-                return None
-            except IndexError as e:
-                debug_print(f"FEHLER: Spaltenindex aus COLUMN_MAP ist außerhalb der Grenzen der Header-Zeile: {e}")
+            required_map_keys = ["CRM Name", "CRM Branche", "CRM Umsatz", "Wiki Umsatz",
+                                 "CRM Anzahl Mitarbeiter", "Wiki Mitarbeiter", tech_col_key]
+            actual_col_names = {}
+            missing_keys = []
+
+            for key in required_map_keys:
+                col_idx = COLUMN_MAP.get(key)
+                if col_idx is None:
+                    missing_keys.append(key)
+                    continue
+                try:
+                    # Get actual column name from header row using the index
+                    actual_name = headers[col_idx]
+                    actual_col_names[key] = actual_name
+                except IndexError:
+                    debug_print(f"FEHLER: Index {col_idx} für Key '{key}' ist außerhalb der Header-Grenzen (Länge {len(headers)}).")
+                    missing_keys.append(f"{key} (Index Error)")
+
+            if missing_keys:
+                debug_print(f"FEHLER: Folgende Keys/Spalten fehlen in COLUMN_MAP oder Header: {missing_keys}")
+                debug_print(f"Verfügbare Header: {headers}")
                 return None
 
+            # Select using actual column names
+            cols_to_select = list(actual_col_names.values())
             df_subset = df[cols_to_select].copy()
-            rename_map = {v: k for k, v in col_indices.items()}
-            df_subset.rename(columns=rename_map, inplace=True)
+
+            # Rename columns to shorter keys for easier access
+            rename_map_inv = {v: k for k, v in actual_col_names.items()} # Map actual name back to key
+            df_subset.rename(columns=rename_map_inv, inplace=True)
             debug_print(f"Benötigte Spalten ausgewählt und umbenannt: {list(df_subset.columns)}")
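Section 1 turns the raw sheet values (a list of row lists) into a DataFrame keyed by the header row, then resolves each required column through its COLUMN_MAP index and renames it back to the map key. A toy version of that flow (indices and data invented here; the real sheet has Config.HEADER_ROWS header rows, this sketch uses one):

    import pandas as pd

    sheet_values = [
        ["ReEval Flag", "CRM Name", "CRM Umsatz"],   # header row
        ["", "ACME GmbH", "12 Mio"],
        ["x", "Beispiel AG", "k.A."],
    ]
    COLUMN_MAP = {"CRM Name": 1, "CRM Umsatz": 2}    # hypothetical 0-based indices

    headers = sheet_values[0]
    df = pd.DataFrame(sheet_values[1:], columns=headers)

    actual_col_names = {key: headers[idx] for key, idx in COLUMN_MAP.items()}
    df_subset = df[list(actual_col_names.values())].copy()
    df_subset.rename(columns={v: k for k, v in actual_col_names.items()}, inplace=True)
    print(df_subset.columns.tolist())  # ['CRM Name', 'CRM Umsatz']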
+
             # --- 2. Features konsolidieren ---
-            def get_valid_numeric_ml(value_str, final_col): # Use different name to avoid conflict?
-                # Implementation from v1.6.5 assumed here
+            def get_valid_numeric_ml(value_str, final_col): # Separate helper for ML prep
                 if pd.isna(value_str) or value_str == '': return np.nan
                 text = str(value_str).strip()
                 text = re.sub(r'(?i)^(ca\.?|circa|über|unter|rund|etwa|mehr als|weniger als|bis zu)\s*', '', text)
@@ -1815,27 +1889,46 @@ class DataProcessor:
                 elif "mio" in text_lower or "millionen" in text_lower or "mill." in text_lower:
                     multiplier = 1.0; num_part = re.sub(r'(?i)\s*(mio\.?|millionen|mill\.?)\b.*', '', text).strip()
                 elif "tsd" in text_lower or "tausend" in text_lower:
-                    multiplier = 0.001 if 'Umsatz' in final_col else 1000.0
+                    # Determine if Umsatz or Mitarbeiter based on final_col name
+                    is_umsatz_target = 'Umsatz' in final_col
+                    multiplier = 0.001 if is_umsatz_target else 1000.0
                     num_part = re.sub(r'(?i)\s*(tsd\.?|tausend)\b.*', '', text).strip()
 
-                num_part_match = re.match(r'([\d.\-]+)', num_part)
+                # Match numeric part more robustly
+                num_part_match = re.search(r'([\d.,]+)', num_part) # Find first number group
                 if not num_part_match: return np.nan
                 num_part_str = num_part_match.group(1)
+                # Clean again after potential suffix removal
+                if '.' in num_part_str and ',' in num_part_str: num_part_str = num_part_str.replace('.', '').replace(',', '.')
+                elif ',' in num_part_str: num_part_str = num_part_str.replace(',', '.')
+                if '.' in num_part_str and num_part_str.count('.') > 1: num_part_str = num_part_str.replace('.', '')
 
-                try: val = float(num_part_str) * multiplier; return val if val > 0 else np.nan # Only positive values for modeling features?
+                try:
+                    val = float(num_part_str) * multiplier
+                    # Allow 0? For modeling maybe not useful, filter later if needed.
+                    # Keep 0 for now, filter >0 for target variable later.
+                    return val if not pd.isna(val) else np.nan # Return NaN if calculation results in NaN
                 except ValueError: return np.nan
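get_valid_numeric_ml normalizes German-formatted figures (leading "ca."/"rund", thousands dots, decimal commas) before the float conversion. A simplified standalone version of just the cleaning steps (the Mio/Tsd suffix handling from the patch is omitted here):

    import re

    def parse_german_number(value_str):
        text = str(value_str).strip()
        text = re.sub(r'(?i)^(ca\.?|circa|über|unter|rund|etwa|mehr als|weniger als|bis zu)\s*', '', text)
        match = re.search(r'([\d.,]+)', text)
        if not match:
            return None
        num = match.group(1)
        if '.' in num and ',' in num:
            num = num.replace('.', '').replace(',', '.')   # 1.234,56 -> 1234.56
        elif ',' in num:
            num = num.replace(',', '.')                    # 12,5 -> 12.5
        elif num.count('.') > 1:
            num = num.replace('.', '')                     # 1.234.567 -> 1234567
        try:
            return float(num)
        except ValueError:
            return None

    print(parse_german_number("ca. 1.234,56"))  # 1234.56
    print(parse_german_number("rund 12,5"))     # 12.5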
+
             cols_to_process = {
-                'Umsatz': ('umsatz_wiki', 'umsatz_crm', 'Finaler_Umsatz'),
-                'Mitarbeiter': ('ma_wiki', 'ma_crm', 'Finaler_Mitarbeiter')
+                'Umsatz': ('Wiki Umsatz', 'CRM Umsatz', 'Finaler_Umsatz'),
+                'Mitarbeiter': ('Wiki Mitarbeiter', 'CRM Anzahl Mitarbeiter', 'Finaler_Mitarbeiter')
             }
-            for base_name, (wiki_col, crm_col, final_col) in cols_to_process.items():
+            for base_name, (wiki_key, crm_key, final_col) in cols_to_process.items():
                 debug_print(f"Verarbeite '{base_name}'...")
-                if wiki_col not in df_subset.columns: df_subset[wiki_col] = np.nan
-                if crm_col not in df_subset.columns: df_subset[crm_col] = np.nan
+                # Use the renamed short keys
+                wiki_col_short = wiki_key # Already renamed via rename_map_inv
+                crm_col_short = crm_key # Already renamed via rename_map_inv
+
+                if wiki_col_short not in df_subset.columns: df_subset[wiki_col_short] = np.nan
+                if crm_col_short not in df_subset.columns: df_subset[crm_col_short] = np.nan
+
                 # Pass final_col name to helper function
-                wiki_numeric = df_subset[wiki_col].apply(lambda x: get_valid_numeric_ml(x, final_col))
-                crm_numeric = df_subset[crm_col].apply(lambda x: get_valid_numeric_ml(x, final_col))
+                wiki_numeric = df_subset[wiki_col_short].apply(lambda x: get_valid_numeric_ml(x, final_col))
+                crm_numeric = df_subset[crm_col_short].apply(lambda x: get_valid_numeric_ml(x, final_col))
+
+                # Prioritize Wiki > CRM > NaN
                 df_subset[final_col] = np.where(
                     wiki_numeric.notna(), wiki_numeric,
@@ -1843,23 +1936,26 @@ class DataProcessor:
                 )
                 debug_print(f" -> {df_subset[final_col].notna().sum()} gültige '{final_col}' Werte erstellt.")
 
+
             # --- 3. Zielvariable vorbereiten ---
-            techniker_col = "techniker"
-            debug_print(f"Verarbeite Zielvariable '{techniker_col}'...")
-            df_subset['Anzahl_Servicetechniker_Numeric'] = pd.to_numeric(df_subset[techniker_col], errors='coerce')
+            techniker_col_short = tech_col_key # Use the short key from rename_map_inv
+            debug_print(f"Verarbeite Zielvariable '{techniker_col_short}' (Original: '{tech_col_key}')...")
+            df_subset['Anzahl_Servicetechniker_Numeric'] = pd.to_numeric(df_subset[techniker_col_short], errors='coerce')
 
             initial_rows = len(df_subset)
+            # Filter: Target variable must be > 0 for modeling
             df_filtered = df_subset[
                 df_subset['Anzahl_Servicetechniker_Numeric'].notna() &
-                (df_subset['Anzahl_Servicetechniker_Numeric'] > 0) # Must be > 0 for modeling
+                (df_subset['Anzahl_Servicetechniker_Numeric'] > 0)
             ].copy()
             filtered_rows = len(df_filtered)
             debug_print(f"{initial_rows - filtered_rows} Zeilen entfernt (fehlende/ungültige/<=0 Technikerzahl).")
             debug_print(f"Verbleibende Zeilen für Modellierung: {filtered_rows}")
             if filtered_rows == 0: return None
 
+
             # --- 4. Techniker-Buckets erstellen ---
             # Use labels compatible with file names and variable names
-            bins = [-1, 0, 19, 49, 99, 249, 499, float('inf')]
+            bins = [-1, 0, 19, 49, 99, 249, 499, float('inf')] # -1 to include 0 if needed, but we filter >0 above
             labels = ['B1_0', 'B2_1_19', 'B3_20_49', 'B4_50_99', 'B5_100_249', 'B6_250_499', 'B7_500plus']
             df_filtered['Techniker_Bucket'] = pd.cut(
                 df_filtered['Anzahl_Servicetechniker_Numeric'],
@@ -1868,30 +1964,36 @@ class DataProcessor:
             )
             debug_print("Techniker-Buckets erstellt.")
             debug_print(f"Verteilung der Buckets:\n{df_filtered['Techniker_Bucket'].value_counts(normalize=True).round(3)}")
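Section 4 bins the technician counts into ordered buckets with pd.cut; the bins/labels pair defines seven right-closed intervals. A quick check with toy counts:

    import pandas as pd

    bins = [-1, 0, 19, 49, 99, 249, 499, float('inf')]
    labels = ['B1_0', 'B2_1_19', 'B3_20_49', 'B4_50_99', 'B5_100_249', 'B6_250_499', 'B7_500plus']

    counts = pd.Series([5, 20, 120, 800])
    print(pd.cut(counts, bins=bins, labels=labels).tolist())
    # ['B2_1_19', 'B3_20_49', 'B5_100_249', 'B7_500plus']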
 
-            # --- 5. Kategoriale Features vorbereiten (Branche) ---
-            branche_col = "branche"
-            debug_print(f"Verarbeite kategoriales Feature '{branche_col}'...")
-            # Clean branch names before encoding
-            df_filtered[branche_col] = df_filtered[branche_col].astype(str).fillna('Unbekannt').str.strip()
-            # Remove prefix if present
-            df_filtered[branche_col] = df_filtered[branche_col].apply(lambda x: x.split(' > ')[-1] if ' > ' in x else x)
-            # Sanitize branch names for column headers (replace spaces, special chars)
-            df_filtered[branche_col] = df_filtered[branche_col].str.replace(r'\s+', '_', regex=True).str.replace(r'[^\w-]', '', regex=True)
-            df_encoded = pd.get_dummies(df_filtered, columns=[branche_col], prefix='Branche', dummy_na=False)
+            # --- 5. Kategoriale Features vorbereiten (Branche) ---
+            branche_col_short = "CRM Branche" # Use the short key
+            debug_print(f"Verarbeite kategoriales Feature '{branche_col_short}'...")
+            df_filtered[branche_col_short] = df_filtered[branche_col_short].astype(str).fillna('Unbekannt').str.strip()
+            # Remove prefix if present (e.g., "Hersteller / Produzenten > Maschinenbau" -> "Maschinenbau")
+            df_filtered[branche_col_short] = df_filtered[branche_col_short].apply(lambda x: x.split(' > ')[-1] if ' > ' in x else x)
+            # Sanitize branch names for column headers (replace spaces, special chars)
+            df_filtered['Branche_Cleaned'] = df_filtered[branche_col_short].str.replace(r'\s+', '_', regex=True).str.replace(r'[^\w-]', '', regex=True)
+
+            # Perform One-Hot Encoding on the cleaned branch names
+            df_encoded = pd.get_dummies(df_filtered, columns=['Branche_Cleaned'], prefix='Branche', dummy_na=False) # Use the cleaned column
             debug_print(f"One-Hot Encoding für Branche durchgeführt.")
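Section 5 strips the category prefix, sanitizes the branch names, and one-hot encodes them into Branche_* feature columns. A toy run of the same chain (sample branch strings invented):

    import pandas as pd

    branchen = pd.Series(['Hersteller / Produzenten > Maschinenbau', 'IT & Software', 'Maschinenbau'])
    cleaned = (branchen
               .apply(lambda x: x.split(' > ')[-1] if ' > ' in x else x)
               .str.replace(r'\s+', '_', regex=True)
               .str.replace(r'[^\w-]', '', regex=True))
    print(pd.get_dummies(cleaned, prefix='Branche').columns.tolist())
    # ['Branche_IT__Software', 'Branche_Maschinenbau']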
 
+            # --- 6. Finale Auswahl ---
+            # Features: Alle 'Branche_' Spalten plus die numerischen
             feature_columns = [col for col in df_encoded.columns if col.startswith('Branche_')]
             feature_columns.extend(['Finaler_Umsatz', 'Finaler_Mitarbeiter'])
             target_column = 'Techniker_Bucket'
-            # Keep original data columns for reference if needed later (e.g., for analysis)
-            original_data_cols = ['name', 'Anzahl_Servicetechniker_Numeric']
+
+            # Keep original data columns for reference/analysis if needed (optional)
+            original_data_cols = ['CRM Name', 'Anzahl_Servicetechniker_Numeric', 'CRM Branche'] # Keep original CRM Name and Branch
 
             # Ensure only required columns are in the final dataframe for modeling
             final_cols_for_model = feature_columns + [target_column]
+
+            # Select the final columns needed for modeling + original data cols for reference
             df_model_ready = df_encoded[final_cols_for_model + original_data_cols].copy()
 
-            # Convert features to numeric, coercing errors (redundant if get_valid_numeric works perfectly, but safe)
+            # Convert numeric features again just to be safe (should already be float/NaN)
             for col in ['Finaler_Umsatz', 'Finaler_Mitarbeiter']:
                 df_model_ready[col] = pd.to_numeric(df_model_ready[col], errors='coerce')
 
@@ -1909,14 +2011,12 @@ class DataProcessor:
             return None
 
-
 # ==================== MAIN FUNCTION ====================
 def main():
     global LOG_FILE
     # --- Initialisierung ---
     parser = argparse.ArgumentParser(description="Firmen-Datenanreicherungs-Skript")
-    # NEU: 'update_wiki' hinzugefügt
     valid_modes = ["combined", "wiki", "website", "branch", "summarize", "reeval",
                    "website_lookup", "website_details", "contacts", "full_run", "alignment",
                    "train_technician_model", "update_wiki"]
@@ -1934,12 +2034,12 @@ def main():
     if args.mode and args.mode.lower() in valid_modes: mode = args.mode.lower(); print(f"Betriebsmodus (aus Kommandozeile): {mode}")
     else: # Interaktive Abfrage
         print("Bitte wählen Sie den Betriebsmodus:")
-        print("  combined: Wiki(AX), Website-Scrape(AR), Summarize(AS), Branch(AO) (Batch, Start bei leerem AO, Branch Forced)") # Info angepasst
+        print("  combined: Wiki(AX), Website-Scrape(AR), Summarize(AS), Branch(AO) (Batch, Start bei leerem AO, Branch Forced)")
         print("  wiki: Nur Wikipedia-Verifizierung (AX) (Batch, Start bei leerem AX)")
         print("  website: Nur Website-Scraping Rohtext (AR) (Batch, Start bei leerem AR)")
         print("  summarize: Nur Website-Zusammenfassung (AS) (Batch, Start bei leerem AS)")
-        print("  branch: Nur Branchen-Einschätzung (AO) (Batch, Start bei leerem AO, mit TS Check)") # Info angepasst
+        print("  branch: Nur Branchen-Einschätzung (AO) (Batch, Start bei leerem AO, mit TS Check)")
-        print("  update_wiki: Wiki-URL aus Spalte U übernehmen, löscht TS für Reeval") # Info angepasst
+        print("  update_wiki: Wiki-URL aus Spalte U übernehmen, löscht TS für Reeval")
         print("  reeval: Verarbeitet Zeilen mit 'x' (volle Verarbeitung, alle TS prüfen)")
         print("  website_lookup: Sucht fehlende Websites (D)")
         print("  website_details: Extrahiert Details für Zeilen mit 'x' (AR)")
@@ -2003,15 +2103,17 @@ def main():
     elif mode == "full_run":
         if row_limit == 0: debug_print("Limit 0 -> Skip full_run.")
         else:
-            # full_run startet immer bei der ersten Zeile ohne AO
             start_index = sheet_handler.get_start_row_index(check_column_key="Timestamp letzte Prüfung")
             if start_index != -1 and start_index < len(sheet_handler.get_data()):
                 num_available = len(sheet_handler.get_data()) - start_index
-                num_to_process = min(row_limit, num_available) if row_limit is not None and row_limit >= 0 else num_available
+                # Use row_limit if set and positive, otherwise process all available
+                num_to_process = num_available
+                if row_limit is not None and row_limit >= 0:
+                    num_to_process = min(row_limit, num_available)
+
                 if num_to_process > 0:
-                    # process_rows_sequentially ruft _process_single_row auf, das intern alle TS prüft
                     data_processor.process_rows_sequentially(start_index, num_to_process, process_wiki=True, process_chatgpt=True, process_website=True)
-                else: debug_print("Keine Zeilen für 'full_run' zu verarbeiten.")
+                else: debug_print("Keine Zeilen für 'full_run' zu verarbeiten (Limit/Startindex).")
             else: debug_print(f"Startindex {start_index} für 'full_run' ungültig oder alle Zeilen bereits verarbeitet.")
     elif mode == "alignment":
         print("\nACHTUNG: Überschreibt A1:AX5!");
@@ -2026,7 +2128,7 @@ def main():
         process_wiki_updates_from_chatgpt(sheet_handler, data_processor, row_limit=row_limit)
         # --- Ende Wiki Update Modus ---
 
-    # Block für Modelltraining (unverändert von v1.6.4)
+    # Block für Modelltraining (unverändert von v1.6.5)
     elif mode == "train_technician_model":
         debug_print(f"Starte Modus: {mode}")
         # Nutze die Methode aus dem DataProcessor
@@ -2040,14 +2142,16 @@ def main():
             X = prepared_df[feature_columns]
             y = prepared_df['Techniker_Bucket']
             # Behalte Originaldaten für spätere Referenz oder Analyse falls nötig
-            original_data = prepared_df[['name', 'Anzahl_Servicetechniker_Numeric']]
+            original_data_cols = ['CRM Name', 'Anzahl_Servicetechniker_Numeric', 'CRM Branche'] # Use correct keys
+            original_data = prepared_df[original_data_cols]
             X_train, X_test, y_train, y_test, orig_train, orig_test = train_test_split(
                 X, y, original_data, test_size=0.25, random_state=42, stratify=y
             )
             debug_print(f"Trainingsdaten: {X_train.shape[0]} Zeilen, Testdaten: {X_test.shape[0]} Zeilen.")
             split_successful = True
-        except Exception as e: debug_print(f"FEHLER Split: {e}"); split_successful = False
+        except Exception as e: debug_print(f"FEHLER Split: {e}"); split_successful = False; debug_print(traceback.format_exc())
+
 
         if split_successful:
             debug_print("Imputation...")
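The training block splits stratified by bucket so train and test keep the same bucket proportions, and the next hunk fits the median imputer on the training rows only, merely transforming the test rows, which avoids leaking test statistics into the model. A compact sketch of both steps with toy data:

    import numpy as np
    import pandas as pd
    from sklearn.impute import SimpleImputer
    from sklearn.model_selection import train_test_split

    X = pd.DataFrame({'Finaler_Umsatz': [1.0, np.nan, 3.0, 4.0, 5.0, np.nan, 7.0, 8.0],
                      'Branche_Maschinenbau': [1, 0, 1, 0, 1, 0, 1, 0]})
    y = pd.Series(['B2_1_19', 'B3_20_49'] * 4)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y)

    imputer = SimpleImputer(strategy='median')
    X_train.loc[:, ['Finaler_Umsatz']] = imputer.fit_transform(X_train[['Finaler_Umsatz']])  # fit on train only
    X_test.loc[:, ['Finaler_Umsatz']] = imputer.transform(X_test[['Finaler_Umsatz']])        # reuse train medians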
@@ -2055,13 +2159,15 @@ def main():
             try:
                 imputer = SimpleImputer(strategy='median')
                 # Wichtig: Imputer nur auf Trainingsdaten fitten!
+                # Use .loc to avoid SettingWithCopyWarning
                 X_train.loc[:, numeric_features] = imputer.fit_transform(X_train[numeric_features])
                 # Testdaten nur transformieren
                 X_test.loc[:, numeric_features] = imputer.transform(X_test[numeric_features])
                 imputer_filename = args.imputer_out; pickle.dump(imputer, open(imputer_filename, 'wb'))
                 debug_print(f"Imputer gespeichert: '{imputer_filename}'.")
                 imputation_successful = True
-            except Exception as e: debug_print(f"FEHLER Imputation: {e}"); imputation_successful = False
+            except Exception as e: debug_print(f"FEHLER Imputation: {e}"); imputation_successful = False; debug_print(traceback.format_exc())
+
 
             if imputation_successful:
                 debug_print("Starte Training/GridSearchCV...")
@@ -2145,6 +2251,4 @@ def main():
 
 # Führt die main-Funktion aus, wenn das Skript direkt gestartet wird
 if __name__ == '__main__':
-    # Stelle sicher, dass OpenAI Key geladen ist bevor etwas anderes passiert (wird auch in Config gemacht)
-    # Config.load_api_keys() # Bereits in main() am Anfang
     main()
\ No newline at end of file