diff --git a/duplicate_checker.py b/duplicate_checker.py
index 6b275e98..d86874e6 100644
--- a/duplicate_checker.py
+++ b/duplicate_checker.py
@@ -1,3 +1,4 @@
+import os
 import re
 import logging
 import pandas as pd
@@ -14,11 +15,37 @@ WEIGHTS = {
     'name': 0.4,
     'city': 0.1,
 }
+LOG_DIR = '/log'
+LOG_FILENAME = 'duplicate_check.log'
 
 # --- Logging Setup ---
-LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)s - %(message)s'
-logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT)
+try:
+    # exist_ok=True avoids the check-then-create race of a prior exists() test
+    os.makedirs(LOG_DIR, exist_ok=True)
+except OSError as e:
+    print(f"Warnung: Konnte Log-Ordner nicht anlegen: {e}")
+log_path = os.path.join(LOG_DIR, LOG_FILENAME)
+
 logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
+formatter = logging.Formatter('%(asctime)s - %(levelname)-8s - %(name)s - %(message)s')
+
+# Console handler: INFO and above to stderr
+console_handler = logging.StreamHandler()
+console_handler.setLevel(logging.INFO)
+console_handler.setFormatter(formatter)
+logger.addHandler(console_handler)
+
+# File handler: full DEBUG log; failure to open the file is non-fatal
+try:
+    file_handler = logging.FileHandler(log_path, mode='a', encoding='utf-8')
+    file_handler.setLevel(logging.DEBUG)
+    file_handler.setFormatter(formatter)
+    logger.addHandler(file_handler)
+    logger.info(f"Logging auch in Datei: {log_path}")
+except OSError as e:
+    logger.warning(f"Konnte keine Log-Datei schreiben: {e}")
 
 # --- Hilfsfunktionen ---
 def normalize_company_name(name: str) -> str:
@@ -41,7 +68,7 @@ def normalize_domain(url: str) -> str:
 
 def main():
-    logger.info("Starte den Duplikats-Check (v2.0 mit Blocking und Maximum Logging)...")
+    logger.info("Starte den Duplikats-Check (v2.0 mit Logging in /log)...")
     # GoogleSheetHandler initialisieren
     try:
         sheet_handler = GoogleSheetHandler()
@@ -71,7 +98,7 @@ def main():
         df['norm_name'] = df['CRM Name'].fillna('').apply(normalize_company_name)
         df['norm_domain'] = df['CRM Website'].fillna('').apply(normalize_domain)
         df['city'] = df['CRM Ort'].fillna('').apply(lambda x: str(x).casefold().strip())
-        logger.debug(f"{label}-Daten nach Normalisierung. Erste Zeile: {df.iloc[0].to_dict()}")
+        logger.debug(f"{label}-Daten normalisiert. Erste Zeile: {df.iloc[0].to_dict()}")
 
         # Blocking per Domain
         indexer = recordlinkage.Index()
@@ -100,7 +127,7 @@ def main():
         for match_idx, group in features.reset_index().groupby('level_1'):
             logger.info(f"--- Prüfe: Zeile {match_idx} ---")
             df_block = group.sort_values('score', ascending=False)
-            logger.debug(f"    Kandidaten für Zeile {match_idx}:\n{df_block[['level_0','score','domain','name_sim','city']].to_string(index=False)}")
+            logger.debug(f"Kandidaten für Zeile {match_idx}:\n{df_block[['level_0','score','domain','name_sim','city']].to_string(index=False)}")
             top = df_block.iloc[0]
             crm_idx = top['level_0'] if top['score'] >= SCORE_THRESHOLD else None
             if crm_idx is not None: