From 61b89383dad8ea26d5009185e1b77c0b953d9600 Mon Sep 17 00:00:00 2001 From: Floke Date: Wed, 27 Aug 2025 14:20:50 +0000 Subject: [PATCH] brancheneinstufung2.py aktualisiert --- brancheneinstufung2.py | 185 ++++++++++++++++++++++++----------------- 1 file changed, 108 insertions(+), 77 deletions(-) diff --git a/brancheneinstufung2.py b/brancheneinstufung2.py index ae76c3bd..6d68ecdc 100644 --- a/brancheneinstufung2.py +++ b/brancheneinstufung2.py @@ -12,6 +12,7 @@ Version: v1.8.0 """ import logging +import os import argparse import time from datetime import datetime @@ -23,6 +24,7 @@ from helpers import create_log_filename, initialize_target_schema, alignment_dem from google_sheet_handler import GoogleSheetHandler from wikipedia_scraper import WikipediaScraper from data_processor import DataProcessor +from sync_manager import SyncManager import helpers import google_sheet_handler @@ -51,12 +53,26 @@ def main(): Verarbeitet Kommandozeilen-Argumente, richtet Logging ein, initialisiert Komponenten und dispatchet zu den passenden Modi. """ + # --- Importe innerhalb der Funktion, um Abhängigkeiten klar zu halten --- + import argparse + import time + import logging + import os # <<< NEU: für Dateipfad-Prüfung + from config import Config, log_module_versions, create_log_filename + from google_sheet_handler import GoogleSheetHandler + from wikipedia_scraper import WikipediaScraper + from data_processor import DataProcessor + from sync_manager import SyncManager # <<< NEU: SyncManager importieren + import helpers + import google_sheet_handler # Für Version-Logging + # --- Argument Parser --- parser = argparse.ArgumentParser( description=f"Firmen-Datenanreicherungs-Skript {Config.VERSION}.", formatter_class=argparse.RawTextHelpFormatter ) mode_categories = { + "Daten-Synchronisation": ["sync"], # <<< NEU: Eigene Kategorie für den Sync "Batch-Verarbeitung": ["wiki_verify", "website_scraping", "summarize_website", "branch_eval", "suggest_parents", "fsm_pitch"], "Sequentielle Verarbeitung": ["full_run"], "Re-Evaluation": ["reeval"], @@ -79,6 +95,9 @@ def main(): parser.add_argument("--min_umsatz", type=float, help="Mindestumsatz in MIO € für 'find_wiki_serp'.", default=200.0) parser.add_argument("--min_employees", type=int, help="Mindest-MA für 'find_wiki_serp'.", default=500) + # <<< NEU: Argument für den Pfad der Sync-Datei + parser.add_argument("--sync_file", type=str, help="Pfad zur D365 Excel-Exportdatei für den 'sync'-Modus.", default="d365_export.xlsx") + args = parser.parse_args() # --- Modusauswahl (interaktiv, wenn nicht über CLI) --- @@ -111,6 +130,13 @@ def main(): print("\nAbgebrochen.") return + # --- Logging Konfiguration --- + # Definiere hier die Logging-Konstanten, falls sie nicht global sind + LOG_LEVEL = logging.DEBUG if Config.DEBUG else logging.INFO + LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)-25s - %(message)s' + logging.basicConfig(level=LOG_LEVEL, format=LOG_FORMAT) + logger = logging.getLogger(__name__) + # --- Logdatei-Konfiguration abschließen --- log_file_path = create_log_filename(selected_mode) if log_file_path: @@ -120,7 +146,7 @@ def main(): logging.getLogger('').addHandler(file_handler) logger.info(f"===== Skript gestartet: Modus '{selected_mode}' =====") - logger.info(f"Projekt-Version (Config): {Config.VERSION}") # Umbenannt zur Klarheit + logger.info(f"Projekt-Version (Config): {Config.VERSION}") logger.info(f"Logdatei: {log_file_path or 'FEHLER - Keine Logdatei'}") logger.info(f"CLI Argumente: {args}") @@ -130,81 +156,89 @@ def main(): Config.load_api_keys() sheet_handler = GoogleSheetHandler() - wiki_scraper = WikipediaScraper() - data_processor = DataProcessor(sheet_handler=sheet_handler, wiki_scraper=wiki_scraper) - # --- Modul-Versionen loggen (NACH der Initialisierung) --- - modules_to_log = { - "DataProcessor": data_processor, - "GoogleSheetHandler": google_sheet_handler, - "WikipediaScraper": wikipedia_scraper, - "Helpers": helpers - } - log_module_versions(modules_to_log) - # --- Ende Version-Logging --- - - # Expliziter Setup-Aufruf, nachdem alle Konfigurationen geladen sind. - if not data_processor.setup(): - logger.critical("Setup des DataProcessors fehlgeschlagen. Das Skript wird beendet.") - return - - # --- Modus-Dispatching --- - start_time = time.time() - - # Sequentiell & Re-Eval Schritte parsen - steps_to_run_set = set(step.strip().lower() for step in args.steps.split(',') if step.strip() in valid_steps) if args.steps else set(valid_steps) - - if selected_mode == "full_run": - start_row = args.start_sheet_row or sheet_handler.get_start_row_index("Timestamp letzte Pruefung") + sheet_handler._header_rows + 1 - num_to_process = args.limit or (len(sheet_handler.get_all_data_with_headers()) - start_row + 1) - data_processor.process_rows_sequentially( - start_sheet_row=start_row, num_to_process=num_to_process, - process_wiki_steps='wiki' in steps_to_run_set, - process_chatgpt_steps='chat' in steps_to_run_set, - process_website_steps='web' in steps_to_run_set, - process_ml_steps='ml_predict' in steps_to_run_set - ) - elif selected_mode == "reeval": - data_processor.process_reevaluation_rows( - row_limit=args.limit, clear_flag=True, - process_wiki_steps='wiki' in steps_to_run_set, - process_chatgpt_steps='chat' in steps_to_run_set, - process_website_steps='web' in steps_to_run_set, - process_ml_steps='ml_predict' in steps_to_run_set - ) - elif selected_mode == "reclassify_branches": - data_processor.reclassify_all_branches( - start_sheet_row=args.start_sheet_row, - limit=args.limit - ) - elif selected_mode == "alignment": - alignment_demo(sheet_handler) - elif selected_mode == "train_technician_model": - data_processor.train_technician_model() - # KORRIGIERTE EINRÜCKUNG - elif selected_mode == "predict_technicians": - data_processor.process_predict_technicians( - start_sheet_row=args.start_sheet_row, - limit=args.limit - ) - elif hasattr(data_processor, f"process_{selected_mode}"): - # Dynamischer Aufruf für die meisten Batch-Modi - method_to_call = getattr(data_processor, f"process_{selected_mode}") - method_args = {} - if "limit" in method_to_call.__code__.co_varnames: method_args["limit"] = args.limit - if "start_sheet_row" in method_to_call.__code__.co_varnames: method_args["start_sheet_row"] = args.start_sheet_row - if "end_sheet_row" in method_to_call.__code__.co_varnames: method_args["end_sheet_row"] = args.end_sheet_row - if "min_umsatz" in method_to_call.__code__.co_varnames: method_args["min_umsatz"] = args.min_umsatz - if "min_employees" in method_to_call.__code__.co_varnames: method_args["min_employees"] = args.min_employees - method_to_call(**method_args) - elif hasattr(data_processor, f"run_{selected_mode}"): # Für 'run_plausibility_checks_batch' - method_to_call = getattr(data_processor, f"run_{selected_mode}") - method_to_call(start_sheet_row=args.start_sheet_row, end_sheet_row=args.end_sheet_row, limit=args.limit) + # <<< NEU: Früher Ausstieg für den Sync-Modus, da er keine Scraper/Prozessoren braucht + if selected_mode == "sync": + d365_file_path = args.sync_file + if not os.path.exists(d365_file_path): + logger.critical(f"Export-Datei nicht gefunden: {d365_file_path}") + print(f"\n! FEHLER: Die angegebene Sync-Datei wurde nicht gefunden: {d365_file_path}") + else: + sync_manager = SyncManager(sheet_handler, d365_file_path) + sync_manager.run_sync() else: - logger.error(f"Unbekannter Modus '{selected_mode}' im Dispatcher.") + # Bisherige Initialisierung für alle anderen Modi + wiki_scraper = WikipediaScraper() + data_processor = DataProcessor(sheet_handler=sheet_handler, wiki_scraper=wiki_scraper) + + # --- Modul-Versionen loggen --- + modules_to_log = { + "DataProcessor": data_processor, + "GoogleSheetHandler": google_sheet_handler, + "WikipediaScraper": wikipedia_scraper, + "Helpers": helpers + } + log_module_versions(modules_to_log) + + if not data_processor.setup(): + logger.critical("Setup des DataProcessors fehlgeschlagen. Das Skript wird beendet.") + return - duration = time.time() - start_time - logger.info(f"Verarbeitung im Modus '{selected_mode}' abgeschlossen. Dauer: {duration:.2f} Sekunden.") + # --- Modus-Dispatching --- + start_time = time.time() + + steps_to_run_set = set(step.strip().lower() for step in args.steps.split(',') if step.strip() in valid_steps) if args.steps else set(valid_steps) + + if selected_mode == "full_run": + start_row = args.start_sheet_row or sheet_handler.get_start_row_index("Timestamp letzte Pruefung") + sheet_handler._header_rows + 1 + num_to_process = args.limit or (len(sheet_handler.get_all_data_with_headers()) - start_row + 1) + data_processor.process_rows_sequentially( + start_sheet_row=start_row, num_to_process=num_to_process, + process_wiki_steps='wiki' in steps_to_run_set, + process_chatgpt_steps='chat' in steps_to_run_set, + process_website_steps='web' in steps_to_run_set, + process_ml_steps='ml_predict' in steps_to_run_set + ) + elif selected_mode == "reeval": + data_processor.process_reevaluation_rows( + row_limit=args.limit, clear_flag=True, + process_wiki_steps='wiki' in steps_to_run_set, + process_chatgpt_steps='chat' in steps_to_run_set, + process_website_steps='web' in steps_to_run_set, + process_ml_steps='ml_predict' in steps_to_run_set + ) + # ... (alle anderen elif-Blöcke bleiben wie sie sind) ... + elif selected_mode == "reclassify_branches": + data_processor.reclassify_all_branches( + start_sheet_row=args.start_sheet_row, + limit=args.limit + ) + elif selected_mode == "alignment": + alignment_demo(sheet_handler) + elif selected_mode == "train_technician_model": + data_processor.train_technician_model() + elif selected_mode == "predict_technicians": + data_processor.process_predict_technicians( + start_sheet_row=args.start_sheet_row, + limit=args.limit + ) + elif hasattr(data_processor, f"process_{selected_mode}"): + method_to_call = getattr(data_processor, f"process_{selected_mode}") + method_args = {} + if "limit" in method_to_call.__code__.co_varnames: method_args["limit"] = args.limit + if "start_sheet_row" in method_to_call.__code__.co_varnames: method_args["start_sheet_row"] = args.start_sheet_row + if "end_sheet_row" in method_to_call.__code__.co_varnames: method_args["end_sheet_row"] = args.end_sheet_row + if "min_umsatz" in method_to_call.__code__.co_varnames: method_args["min_umsatz"] = args.min_umsatz + if "min_employees" in method_to_call.__code__.co_varnames: method_args["min_employees"] = args.min_employees + method_to_call(**method_args) + elif hasattr(data_processor, f"run_{selected_mode}"): + method_to_call = getattr(data_processor, f"run_{selected_mode}") + method_to_call(start_sheet_row=args.start_sheet_row, end_sheet_row=args.end_sheet_row, limit=args.limit) + else: + logger.error(f"Unbekannter Modus '{selected_mode}' im Dispatcher.") + + duration = time.time() - start_time + logger.info(f"Verarbeitung im Modus '{selected_mode}' abgeschlossen. Dauer: {duration:.2f} Sekunden.") except (KeyboardInterrupt, EOFError): logger.warning("Skript durch Benutzer unterbrochen.") @@ -218,7 +252,4 @@ def main(): logger.info(f"===== Skript beendet =====") logging.shutdown() if 'selected_mode' in locals() and selected_mode != 'exit' and 'log_file_path' in locals() and log_file_path: - print(f"\nVerarbeitung abgeschlossen. Logfile: {log_file_path}") - -if __name__ == '__main__': - main() \ No newline at end of file + print(f"\nVerarbeitung abgeschlossen. Logfile: {log_file_path}") \ No newline at end of file