#!/usr/bin/env python3
# --- START OF FILE brancheneinstufung.py ---
"""
brancheneinstufung.py - main script v1.8.0

Main entry point of the project for automated enrichment, validation and
standardization of company data. It parses command-line arguments,
initializes the required handlers and the DataProcessor, and starts the
selected processing mode.

Author: Christian Godelmann
Version: v1.8.0
"""

import argparse
import inspect
import logging
import time
from datetime import datetime

# Project modules
from config import Config
from helpers import create_log_filename, load_target_schema, alignment_demo
from google_sheet_handler import GoogleSheetHandler
from wikipedia_scraper import WikipediaScraper
from data_processor import DataProcessor

logger = logging.getLogger(__name__)

_LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s'

# Processing steps selectable via --steps for the 'reeval' / 'full_run' modes.
_VALID_STEPS = ['wiki', 'chat', 'web', 'ml_predict']

# Known modes grouped by category; the grouping drives both the --mode help
# text and the interactive selection menu.
_MODE_CATEGORIES = {
    "Batch-Verarbeitung": ["wiki_verify", "website_scraping", "summarize_website", "branch_eval", "suggest_parents"],
    "Sequentielle Verarbeitung": ["full_run"],
    "Re-Evaluation": ["reeval"],
    "Dienstprogramme": ["find_wiki_serp", "check_urls", "contacts", "update_wiki_suggestions", "wiki_reextract_missing_an", "website_details", "train_technician_model", "alignment", "reparatur_sitz", "plausi_check_data"],
    "Kombinierte Läufe": ["combined_all"]
}
_VALID_MODES = frozenset(mode for modes in _MODE_CATEGORIES.values() for mode in modes)


def _configure_console_logging():
    """Set the root log level and attach a console handler if none exists.

    Returns:
        The log level in use (DEBUG when ``Config.DEBUG`` is truthy, else INFO).
    """
    log_level = logging.DEBUG if Config.DEBUG else logging.INFO
    # An empty handler list keeps basicConfig from installing its own default
    # handler; the console handler is attached explicitly below.
    logging.basicConfig(level=log_level, format=_LOG_FORMAT, handlers=[])
    root_logger = logging.getLogger('')
    if not any(isinstance(h, logging.StreamHandler) for h in root_logger.handlers):
        console_handler = logging.StreamHandler()
        console_handler.setLevel(log_level)
        console_handler.setFormatter(logging.Formatter(_LOG_FORMAT))
        root_logger.addHandler(console_handler)
    return log_level


def _build_arg_parser():
    """Build the command-line parser with all supported options."""
    parser = argparse.ArgumentParser(
        description=f"Firmen-Datenanreicherungs-Skript {Config.VERSION}.",
        formatter_class=argparse.RawTextHelpFormatter
    )
    help_parts = ["Betriebsmodus. Waehlen Sie einen der folgenden:\n"]
    for category, modes in _MODE_CATEGORIES.items():
        help_parts.append(f"\n{category}:\n")
        help_parts.extend(f" - {mode}\n" for mode in modes)

    parser.add_argument("--mode", type=str, help="".join(help_parts))
    parser.add_argument("--limit", type=int, help="Maximale Anzahl zu verarbeitender Zeilen.", default=None)
    parser.add_argument("--start_sheet_row", type=int, help="Startzeile im Sheet (1-basiert).", default=None)
    parser.add_argument("--end_sheet_row", type=int, help="Endzeile im Sheet (1-basiert).", default=None)
    parser.add_argument("--steps", type=str, help=f"Schritte für 'reeval'/'full_run' (z.B. 'wiki,chat'). Optionen: {', '.join(_VALID_STEPS)}.", default=','.join(_VALID_STEPS))
    parser.add_argument("--min_umsatz", type=float, help="Mindestumsatz in MIO € für 'find_wiki_serp'.", default=200.0)
    parser.add_argument("--min_employees", type=int, help="Mindest-MA für 'find_wiki_serp'.", default=500)
    return parser


def _prompt_for_mode():
    """Ask the user interactively for a mode.

    The user may enter either the menu number or the mode name.

    Returns:
        The chosen mode name, or ``None`` when the user aborts (entering ``0``,
        EOF, or Ctrl-C).
    """
    print("\nBitte waehlen Sie den Betriebsmodus:")
    mode_map = {}
    counter = 1
    for category, modes in _MODE_CATEGORIES.items():
        print(f"\n{category}:")
        for mode in modes:
            print(f" {counter}: {mode}")
            mode_map[str(counter)] = mode
            mode_map[mode] = mode
            counter += 1
    print("\n 0: Abbrechen")
    mode_map['0'] = 'exit'

    while True:
        try:
            choice = input("Geben Sie den Modusnamen oder die Zahl ein: ").strip().lower()
        except (EOFError, KeyboardInterrupt):
            print("\nAbgebrochen.")
            return None
        selected = mode_map.get(choice)
        if selected is None:
            print("Ungueltige Eingabe.")
            continue
        if selected == 'exit':
            print("Abgebrochen.")
            return None
        return selected


def _parse_steps(raw_steps, valid_steps=_VALID_STEPS):
    """Turn a comma-separated step string into a set of valid step names.

    Entries are stripped and lower-cased *before* they are checked against
    ``valid_steps``, so ``"Wiki, CHAT"`` yields ``{"wiki", "chat"}``. Unknown
    entries are dropped. An empty/missing string selects all valid steps.
    """
    if not raw_steps:
        return set(valid_steps)
    normalized = (step.strip().lower() for step in raw_steps.split(','))
    return {step for step in normalized if step in valid_steps}


def _accepted_kwargs(method, candidates):
    """Return the subset of ``candidates`` that ``method`` declares as parameters.

    Uses the callable's signature rather than ``__code__.co_varnames``, because
    the latter also lists local variables of the function body.
    """
    parameters = inspect.signature(method).parameters
    return {name: value for name, value in candidates.items() if name in parameters}


def _resolve_full_run_start_row(args, sheet_handler):
    """Determine the first sheet row for 'full_run'.

    An explicit ``--start_sheet_row`` wins. Otherwise the row is derived from
    ``sheet_handler.get_start_row_index(...)`` plus the number of header rows.

    Raises:
        ValueError: if no start row was given and the header row count is
            unavailable.
    """
    if args.start_sheet_row:
        return args.start_sheet_row
    # NOTE(review): the original code referenced an undefined name
    # `header_rows` here (NameError at runtime). Presumably the value lives in
    # Config.HEADER_ROWS - confirm against config.py.
    header_rows = getattr(Config, "HEADER_ROWS", None)
    if header_rows is None:
        raise ValueError(
            "Anzahl der Header-Zeilen unbekannt (Config.HEADER_ROWS fehlt); "
            "bitte --start_sheet_row angeben."
        )
    return sheet_handler.get_start_row_index("Timestamp letzte Pruefung") + header_rows + 1


def _dispatch_mode(selected_mode, args, sheet_handler, data_processor):
    """Run the processing routine that belongs to ``selected_mode``.

    Modes without a dedicated branch are resolved dynamically to
    ``data_processor.process_<mode>`` and, failing that,
    ``data_processor.run_<mode>``.

    Returns:
        True when a routine was found and executed, False for an unknown mode.
    """
    steps = _parse_steps(args.steps)
    step_flags = {
        "process_wiki_steps": 'wiki' in steps,
        "process_chatgpt_steps": 'chat' in steps,
        "process_website_steps": 'web' in steps,
        "process_ml_steps": 'ml_predict' in steps,
    }

    if selected_mode == "full_run":
        start_row = _resolve_full_run_start_row(args, sheet_handler)
        num_to_process = args.limit or (len(sheet_handler.get_all_data_with_headers()) - start_row + 1)
        data_processor.process_rows_sequentially(
            start_sheet_row=start_row, num_to_process=num_to_process,
            **step_flags
        )
    elif selected_mode == "reeval":
        data_processor.process_reevaluation_rows(
            row_limit=args.limit, clear_flag=True,
            **step_flags
        )
    elif selected_mode == "alignment":
        alignment_demo(sheet_handler.sheet)
    elif selected_mode == "train_technician_model":
        data_processor.train_technician_model()
    elif hasattr(data_processor, f"process_{selected_mode}"):
        # Dynamic call for most batch modes; only pass the CLI options the
        # target method actually declares.
        method_to_call = getattr(data_processor, f"process_{selected_mode}")
        cli_options = {
            "limit": args.limit,
            "start_sheet_row": args.start_sheet_row,
            "end_sheet_row": args.end_sheet_row,
            "min_umsatz": args.min_umsatz,
            "min_employees": args.min_employees,
        }
        method_to_call(**_accepted_kwargs(method_to_call, cli_options))
    elif hasattr(data_processor, f"run_{selected_mode}"):
        # Fallback for methods named run_<mode>.
        method_to_call = getattr(data_processor, f"run_{selected_mode}")
        method_to_call(start_sheet_row=args.start_sheet_row, end_sheet_row=args.end_sheet_row, limit=args.limit)
    else:
        logger.error(f"Unbekannter Modus '{selected_mode}' im Dispatcher.")
        return False
    return True


def main():
    """
    Main entry point of the script.

    Parses command-line arguments, sets up console and file logging,
    initializes the components and dispatches to the selected mode. If no
    ``--mode`` is given, the mode is requested interactively.
    """
    # --- Initial logging setup (before the file handler exists) ---
    log_level = _configure_console_logging()
    logger.debug("Initiales Konsolen-Logging konfiguriert.")

    # --- Argument parsing ---
    args = _build_arg_parser().parse_args()

    # --- Mode selection (interactive when not supplied via CLI) ---
    selected_mode = args.mode.lower() if args.mode else None
    if not selected_mode:
        selected_mode = _prompt_for_mode()
        if selected_mode is None:
            return
    elif selected_mode not in _VALID_MODES:
        # Not fatal: the dispatcher may still resolve the mode dynamically.
        logger.warning(f"Modus '{selected_mode}' ist nicht in der Liste bekannter Modi; Dispatch wird dennoch versucht.")

    # --- Finish logging configuration (add the file handler) ---
    log_file_path = create_log_filename(selected_mode)
    if log_file_path:
        file_handler = logging.FileHandler(log_file_path, mode='a', encoding='utf-8')
        file_handler.setLevel(log_level)
        file_handler.setFormatter(logging.Formatter(_LOG_FORMAT))
        logging.getLogger('').addHandler(file_handler)

    logger.info(f"===== Skript gestartet: Modus '{selected_mode}' =====")
    logger.info(f"Version: {Config.VERSION}")
    logger.info(f"Logdatei: {log_file_path or 'FEHLER - Keine Logdatei'}")
    logger.info(f"CLI Argumente: {args}")

    # --- Main logic ---
    try:
        # --- Preparation ---
        Config.load_api_keys()
        load_target_schema()

        sheet_handler = GoogleSheetHandler()
        wiki_scraper = WikipediaScraper()
        data_processor = DataProcessor(sheet_handler=sheet_handler, wiki_scraper=wiki_scraper)

        # --- Mode dispatching ---
        start_time = time.time()
        handled = _dispatch_mode(selected_mode, args, sheet_handler, data_processor)
        duration = time.time() - start_time
        if handled:
            logger.info(f"Verarbeitung im Modus '{selected_mode}' abgeschlossen. Dauer: {duration:.2f} Sekunden.")

    except (KeyboardInterrupt, EOFError):
        logger.warning("Skript durch Benutzer unterbrochen.")
        print("\n! Skript wurde manuell beendet.")
    except Exception as e:
        # Top-level boundary: log with traceback and tell the user where to look.
        logger.critical(f"FATAL: Unerwarteter Fehler im Hauptprozess: {e}", exc_info=True)
        print(f"\n! Ein kritischer Fehler ist aufgetreten: {e}")
        print(f"Bitte pruefen Sie die Logdatei fuer Details: {log_file_path}")
    finally:
        logger.info("===== Skript beendet =====")
        logging.shutdown()
        if log_file_path:
            print(f"\nVerarbeitung abgeschlossen. Logfile: {log_file_path}")


if __name__ == '__main__':
    main()

# --- END OF FILE brancheneinstufung.py ---