From 3febe14526c6ac50595c945d92675ba830cd34bb Mon Sep 17 00:00:00 2001 From: Floke Date: Wed, 6 Aug 2025 13:39:30 +0000 Subject: [PATCH] duplicate_checker.py aktualisiert --- duplicate_checker.py | 92 +++++++++++++++++++++++--------------------- 1 file changed, 48 insertions(+), 44 deletions(-) diff --git a/duplicate_checker.py b/duplicate_checker.py index 0a9c1155..5f28fb92 100644 --- a/duplicate_checker.py +++ b/duplicate_checker.py @@ -6,88 +6,93 @@ from thefuzz import fuzz from helpers import normalize_company_name, simple_normalize_url from google_sheet_handler import GoogleSheetHandler -# duplicate_checker.py v2.6 (Original v2.0 Kern + Logging) -# Version: 2025-08-06_17-15 +# duplicate_checker.py v2.7 (Logging-Setup Fix) +# Version: 2025-08-06_17-30 # --- Konfiguration --- CRM_SHEET_NAME = "CRM_Accounts" MATCHING_SHEET_NAME = "Matching_Accounts" SCORE_THRESHOLD = 80 LOG_DIR = "Log" -LOG_FILE = "duplicate_check_v2.6.log" +LOG_FILE = "duplicate_check_v2.7.log" # --- Logging Setup --- if not os.path.exists(LOG_DIR): os.makedirs(LOG_DIR, exist_ok=True) log_path = os.path.join(LOG_DIR, LOG_FILE) -# Global logging config -logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s - %(levelname)-8s - %(message)s", - handlers=[ - logging.StreamHandler(sys.stdout), - logging.FileHandler(log_path, mode='a', encoding='utf-8') - ] -) -logger = logging.getLogger(__name__) +# Clear existing handlers +root_logger = logging.getLogger() +root_logger.setLevel(logging.DEBUG) +for h in list(root_logger.handlers): + root_logger.removeHandler(h) -logger.info(f"Starting duplicate_checker.py v2.6 | Log: {log_path}") +# Formatter +formatter = logging.Formatter("%(asctime)s - %(levelname)-8s - %(message)s") + +# Console Handler - INFO+ +ch = logging.StreamHandler(sys.stdout) +ch.setLevel(logging.INFO) +ch.setFormatter(formatter) +root_logger.addHandler(ch) + +# File Handler - DEBUG+ +fh = logging.FileHandler(log_path, mode='a', encoding='utf-8') +fh.setLevel(logging.DEBUG) +fh.setFormatter(formatter) +root_logger.addHandler(fh) + +logger = logging.getLogger(__name__) +logger.info(f"Logging to console and file: {log_path}") +logger.info("Starting duplicate_checker.py v2.7 | Version: 2025-08-06_17-30") def calculate_similarity(record1, record2): - """Berechnet einen gewichteten Ähnlichkeits-Score (0–190).""" total_score = 0 - # Domain-Exact dom1 = record1.get('normalized_domain', '') dom2 = record2.get('normalized_domain', '') if dom1 and dom1 == dom2: total_score += 100 - # Name-Fuzzy name1 = record1.get('normalized_name', '') name2 = record2.get('normalized_name', '') if name1 and name2: - name_similarity = fuzz.token_set_ratio(name1, name2) - total_score += name_similarity * 0.7 - # Ort+Land exact - if record1.get('CRM Ort') and record1.get('CRM Ort') == record2.get('CRM Ort'): - if record1.get('CRM Land') and record1.get('CRM Land') == record2.get('CRM Land'): - total_score += 20 + total_score += fuzz.token_set_ratio(name1, name2) * 0.7 + if record1.get('CRM Ort') == record2.get('CRM Ort') and record1.get('CRM Land') == record2.get('CRM Land'): + total_score += 20 return round(total_score) def main(): - logger.info("Starte Duplikats-Check v2.6 (Original v2.0 Kern mit Logging)") + logger.info("Starte Duplikats-Check v2.7") try: sheet_handler = GoogleSheetHandler() logger.info("GoogleSheetHandler initialisiert") except Exception as e: - logger.critical(f"FEHLER Init GoogleSheetHandler: {e}") + logger.critical(f"Init GoogleSheetHandler fehlgeschlagen: {e}") sys.exit(1) # Load data - logger.info(f"Lade Master-Daten aus '{CRM_SHEET_NAME}'...") + logger.info(f"Lade CRM-Daten aus '{CRM_SHEET_NAME}'...") crm_df = sheet_handler.get_sheet_as_dataframe(CRM_SHEET_NAME) if crm_df is None or crm_df.empty: - logger.critical(f"Keine Daten in '{CRM_SHEET_NAME}'. Abbruch.") + logger.critical("CRM-Tab leer. Abbruch.") return logger.info(f"{len(crm_df)} CRM-Datensätze geladen") logger.info(f"Lade Matching-Daten aus '{MATCHING_SHEET_NAME}'...") match_df = sheet_handler.get_sheet_as_dataframe(MATCHING_SHEET_NAME) if match_df is None or match_df.empty: - logger.critical(f"Keine Daten in '{MATCHING_SHEET_NAME}'. Abbruch.") + logger.critical("Matching-Tab leer. Abbruch.") return logger.info(f"{len(match_df)} Matching-Datensätze geladen") - # Normalize - logger.info("Normalisiere Daten...") + # Normalize & blocking key for df, label in [(crm_df, 'CRM'), (match_df, 'Matching')]: - df['normalized_name'] = df['CRM Name'].astype(str).apply(normalize_company_name) + df['normalized_name'] = df['CRM Name'].astype(str).apply(normalize_company_name) df['normalized_domain'] = df['CRM Website'].astype(str).apply(simple_normalize_url) - df['CRM Ort'] = df['CRM Ort'].astype(str).str.lower().str.strip() - df['CRM Land'] = df['CRM Land'].astype(str).str.lower().str.strip() - df['block_key'] = df['normalized_name'].apply(lambda x: x.split()[0] if x else None) + df['CRM Ort'] = df['CRM Ort'].astype(str).str.lower().str.strip() + df['CRM Land'] = df['CRM Land'].astype(str).str.lower().str.strip() + df['block_key'] = df['normalized_name'].apply(lambda x: x.split()[0] if x else None) logger.debug(f"{label}-Sample: {df.iloc[0][['normalized_name','normalized_domain','block_key']].to_dict()}") # Build blocking index @@ -97,10 +102,10 @@ def main(): key = row['block_key'] if key: crm_index.setdefault(key, []).append(row) - logger.info(f"Blocking-Index erstellt mit {len(crm_index)} Keys") + logger.info(f"Blocking-Index mit {len(crm_index)} Keys erstellt") # Matching - logger.info("Starte Matching-Prozess...") + logger.info("Starte Matching...") results = [] total = len(match_df) for i, mrow in match_df.iterrows(): @@ -111,7 +116,6 @@ def main(): results.append({'Potenzieller Treffer im CRM': '', 'Ähnlichkeits-Score': 0}) continue scored = [(crow['CRM Name'], calculate_similarity(mrow, crow)) for crow in candidates] - # Log Top-3 only top3 = sorted(scored, key=lambda x: x[1], reverse=True)[:3] logger.debug(f" Top3 Kandidaten: {top3}") best_name, best_score = max(scored, key=lambda x: x[1]) @@ -119,16 +123,16 @@ def main(): results.append({'Potenzieller Treffer im CRM': best_name, 'Ähnlichkeits-Score': best_score}) logger.info(f" --> Match: '{best_name}' Score={best_score}") else: - results.append({'Potenzieller Treffer im CRM': best_name if best_name else '', 'Ähnlichkeits-Score': best_score}) - logger.info(f" --> Kein Match (höchster Score {best_score})") + results.append({'Potenzieller Treffer im CRM': best_name or '', 'Ähnlichkeits-Score': best_score}) + logger.info(f" --> Kein Match (Score {best_score})") # Write back logger.info("Schreibe Ergebnisse zurück ins Sheet...") - result_df = pd.DataFrame(results) - output_df = match_df[['CRM Name','CRM Website','CRM Ort','CRM Land']].copy() - output_df = pd.concat([output_df.reset_index(drop=True), result_df], axis=1) - data_to_write = [output_df.columns.tolist()] + output_df.values.tolist() - success = sheet_handler.clear_and_write_data(MATCHING_SHEET_NAME, data_to_write) + out_df = pd.DataFrame(results) + output = match_df[['CRM Name','CRM Website','CRM Ort','CRM Land']].copy() + output = pd.concat([output.reset_index(drop=True), out_df], axis=1) + data = [output.columns.tolist()] + output.values.tolist() + success = sheet_handler.clear_and_write_data(MATCHING_SHEET_NAME, data) if success: logger.info("Ergebnisse erfolgreich geschrieben") else: