from thefuzz import fuzz
from helpers import normalize_company_name, simple_normalize_url
from google_sheet_handler import GoogleSheetHandler

# duplicate_checker.py v2.8 (match components in the log)
# Version: 2025-08-06_17-50

# --- Configuration ---
CRM_SHEET_NAME = "CRM_Accounts"
MATCHING_SHEET_NAME = "Matching_Accounts"
SCORE_THRESHOLD = 80
LOG_DIR = "Log"
LOG_FILE = "duplicate_check_v2.8.log"

# --- Logging setup ---
# exist_ok=True already tolerates an existing directory, so a separate
# os.path.exists() pre-check is unnecessary.
os.makedirs(LOG_DIR, exist_ok=True)
log_path = os.path.join(LOG_DIR, LOG_FILE)

root = logging.getLogger()
root.setLevel(logging.DEBUG)

# Detach every handler already installed on the root logger and close it,
# so a previously opened log file does not stay open after being replaced.
for h in list(root.handlers):
    root.removeHandler(h)
    h.close()

formatter = logging.Formatter("%(asctime)s - %(levelname)-8s - %(message)s")

# Console handler: INFO and above, written to stdout.
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
root.addHandler(ch)

# File handler: DEBUG and above, appended to the log file as UTF-8.
fh = logging.FileHandler(log_path, mode='a', encoding='utf-8')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
root.addHandler(fh)

logger = logging.getLogger(__name__)
logger.info(f"Logging to console and file: {log_path}")
logger.info("Starting duplicate_checker.py v2.8 | Version: 2025-08-06_17-50")
def calculate_similarity_components(r1, r2):
    """Return the total similarity score of two records plus its components.

    Both records are read only through ``.get(key)``, so any object offering
    that method can be passed.

    Keys read from each record:
        normalized_domain: compared for exact equality.
        normalized_name:   compared with ``fuzz.token_set_ratio``.
        CRM Ort, CRM Land: city and country, compared for exact equality.

    Returns:
        A 4-tuple ``(total, domain_match, name_score, loc_match)`` where
        ``total`` is the rounded weighted sum
        ``domain_match * 100 + name_score * 0.7 + loc_match * 20``,
        ``domain_match`` and ``loc_match`` are 0 or 1, and ``name_score`` is
        the fuzzy ratio rounded to one decimal (0 when either name is blank).
    """
    # Domain: blank domains never count as a match.
    dom1 = r1.get('normalized_domain', '')
    dom2 = r2.get('normalized_domain', '')
    domain_match = 1 if dom1 and dom1 == dom2 else 0

    # Name: only score when both names are present.
    name1 = r1.get('normalized_name', '')
    name2 = r2.get('normalized_name', '')
    name_score = fuzz.token_set_ratio(name1, name2) if name1 and name2 else 0

    # City + country: require a non-blank city, mirroring the guards above.
    # Without it, two records that both lack location data compare equal
    # (None == None, '' == '') and would wrongly earn the location bonus.
    city1 = r1.get('CRM Ort')
    city2 = r2.get('CRM Ort')
    country1 = r1.get('CRM Land')
    country2 = r2.get('CRM Land')
    loc_match = 1 if city1 and city1 == city2 and country1 == country2 else 0

    # Weights: domain 100, name 0.7 per ratio point, location 20.
    total = domain_match * 100 + name_score * 0.7 + loc_match * 20
    return round(total), domain_match, round(name_score, 1), loc_match
candidates = crm_index.get(key, []) logger.info(f"Prüfe {i+1}/{total}: '{mrow['CRM Name']}' -> {len(candidates)} Kandidaten") if not candidates: - results.append({'Potenzieller Treffer im CRM': '', 'Ähnlichkeits-Score': 0}) + results.append({'Potenzieller Treffer im CRM':'', 'Ähnlichkeits-Score':0}) continue - scored = [(crow['CRM Name'], calculate_similarity(mrow, crow)) for crow in candidates] + scored = [] + for crow in candidates: + score, dm, ns, lm = calculate_similarity_components(mrow, crow) + scored.append((crow['CRM Name'], score, dm, ns, lm)) top3 = sorted(scored, key=lambda x: x[1], reverse=True)[:3] - logger.debug(f" Top3 Kandidaten: {top3}") - best_name, best_score = max(scored, key=lambda x: x[1]) + # Log Top3 mit Komponenten + for name, sc, dm, ns, lm in top3: + logger.debug(f" Kandidat: {name}, Score={sc}, Domain={dm}, Name={ns}, Ort={lm}") + best_name, best_score, best_dm, best_ns, best_lm = max(scored, key=lambda x: x[1]) if best_score >= SCORE_THRESHOLD: - results.append({'Potenzieller Treffer im CRM': best_name, 'Ähnlichkeits-Score': best_score}) - logger.info(f" --> Match: '{best_name}' Score={best_score}") + results.append({'Potenzieller Treffer im CRM':best_name, 'Ähnlichkeits-Score':best_score}) + logger.info(f" --> Match: '{best_name}' Score={best_score} (Dom={best_dm}, Name={best_ns}, Ort={best_lm})") else: - results.append({'Potenzieller Treffer im CRM': best_name or '', 'Ähnlichkeits-Score': best_score}) - logger.info(f" --> Kein Match (Score {best_score})") + results.append({'Potenzieller Treffer im CRM':'', 'Ähnlichkeits-Score':best_score}) + logger.info(f" --> Kein Match (Score={best_score}, Dom={best_dm}, Name={best_ns}, Ort={best_lm})") # Write back logger.info("Schreibe Ergebnisse zurück ins Sheet...")