From fa3c561925e9e39a2fbac2b218cc7be418baa5da Mon Sep 17 00:00:00 2001 From: Floke Date: Mon, 4 Aug 2025 06:01:13 +0000 Subject: [PATCH] duplicate_checker.py aktualisiert --- duplicate_checker.py | 115 +++++++++++++++---------------------------- 1 file changed, 41 insertions(+), 74 deletions(-) diff --git a/duplicate_checker.py b/duplicate_checker.py index 3e0de2f7..8f63128d 100644 --- a/duplicate_checker.py +++ b/duplicate_checker.py @@ -1,105 +1,68 @@ -# duplicate_checker.py (v1.1 - Lauf 1 Logik + Match-Grund) +# duplicate_checker.py (v3.0 - Back to Basics: Optimized Brute-Force) import logging import pandas as pd from thefuzz import fuzz from config import Config -from helpers import normalize_company_name, simple_normalize_url, create_log_filename +from helpers import normalize_company_name, simple_normalize_url from google_sheet_handler import GoogleSheetHandler import time # --- Konfiguration --- CRM_SHEET_NAME = "CRM_Accounts" MATCHING_SHEET_NAME = "Matching_Accounts" -SCORE_THRESHOLD = 80 # Treffer unter diesem Wert werden nicht als "potenzieller Treffer" angezeigt - -# --- VOLLSTÄNDIGES LOGGING SETUP --- -LOG_LEVEL = logging.DEBUG if Config.DEBUG else logging.INFO -LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)s - %(message)s' - -root_logger = logging.getLogger() -root_logger.setLevel(LOG_LEVEL) - -# Handler nur hinzufügen, wenn noch keine konfiguriert sind -if not root_logger.handlers: - stream_handler = logging.StreamHandler() - stream_handler.setFormatter(logging.Formatter(LOG_FORMAT)) - root_logger.addHandler(stream_handler) - - log_file_path = create_log_filename("duplicate_check_final") - if log_file_path: - file_handler = logging.FileHandler(log_file_path, mode='a', encoding='utf-8') - file_handler.setFormatter(logging.Formatter(LOG_FORMAT)) - root_logger.addHandler(file_handler) -else: - # Finde den Dateipfad aus dem bereits konfigurierten Handler - log_file_path = None - for handler in root_logger.handlers: - if isinstance(handler, logging.FileHandler): - log_file_path = handler.baseFilename - break +SCORE_THRESHOLD = 85 # Treffer unter diesem Wert werden nicht als "potenzieller Treffer" angezeigt +# WICHTIG: Logging Setup für detaillierte Ausgaben +logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)-8s - %(name)s - %(message)s') logger = logging.getLogger(__name__) -def calculate_similarity_with_details(record1, record2): + +def calculate_similarity_details(record1, record2): """ - Berechnet einen gewichteten Ähnlichkeits-Score und gibt den Score und den Grund zurück. - Dies ist die originale Scoring-Logik von Lauf 1. + Berechnet einen gewichteten Ähnlichkeits-Score und gibt die Details zurück. """ scores = {'name': 0, 'location': 0, 'domain': 0} - # Domain-Match (100 Punkte) - domain1 = record1.get('normalized_domain') - domain2 = record2.get('normalized_domain') - if domain1 and domain1 != 'k.a.' and domain1 == domain2: + # Domain-Match (höchste Priorität, 100 Punkte) + if record1.get('normalized_domain') and record1['normalized_domain'] != 'k.a.' and record1['normalized_domain'] == record2.get('normalized_domain'): scores['domain'] = 100 - # Namensähnlichkeit (70% Gewichtung) - name1 = record1.get('normalized_name') - name2 = record2.get('normalized_name') - if name1 and name2: - name_similarity = fuzz.token_set_ratio(name1, name2) - scores['name'] = round(name_similarity * 0.7) + # Namensähnlichkeit (hohe 85% Gewichtung) + if record1.get('normalized_name') and record2.get('normalized_name'): + # token_set_ratio ist robust gegen zusätzliche Wörter wie "Holding" oder "Gruppe" + scores['name'] = round(fuzz.token_set_ratio(record1['normalized_name'], record2['normalized_name']) * 0.85) # Standort-Bonus (20 Punkte) - ort1 = record1.get('CRM Ort') - ort2 = record2.get('CRM Ort') - land1 = record1.get('CRM Land') - land2 = record2.get('CRM Land') - if ort1 and ort1 == ort2 and land1 and land1 == land2: - scores['location'] = 20 - - total_score = sum(scores.values()) - - reasons = [] - if scores['domain'] > 0: reasons.append(f"Domain({scores['domain']})") - if scores['name'] > 0: reasons.append(f"Name({scores['name']})") - if scores['location'] > 0: reasons.append(f"Ort({scores['location']})") - reason_text = " + ".join(reasons) if reasons else "Keine Übereinstimmung" + if record1.get('CRM Ort') and record1['CRM Ort'] == record2.get('CRM Ort'): + if record1.get('CRM Land') and record1['CRM Land'] == record2.get('CRM Land'): + scores['location'] = 20 + + total_score = sum(scores.values()) + return {'total': total_score, 'details': scores} - return round(total_score), reason_text def main(): start_time = time.time() - logger.info("Starte den Duplikats-Check (v1.1 - Brute-Force mit Match-Grund)...") - logger.info(f"Logdatei: {log_file_path}") + logger.info("Starte den Duplikats-Check (v3.0 - Back to Basics)...") + # ... (Initialisierung und Laden der Daten bleibt gleich) ... try: sheet_handler = GoogleSheetHandler() except Exception as e: logger.critical(f"FEHLER bei Initialisierung: {e}") return - logger.info(f"Lade Master-Daten aus '{CRM_SHEET_NAME}'...") + logging.info(f"Lade Master-Daten aus '{CRM_SHEET_NAME}'...") crm_df = sheet_handler.get_sheet_as_dataframe(CRM_SHEET_NAME) if crm_df is None or crm_df.empty: return - logger.info(f"Lade zu prüfende Daten aus '{MATCHING_SHEET_NAME}'...") + logging.info(f"Lade zu prüfende Daten aus '{MATCHING_SHEET_NAME}'...") matching_df = sheet_handler.get_sheet_as_dataframe(MATCHING_SHEET_NAME) if matching_df is None or matching_df.empty: return original_matching_df = matching_df.copy() - logger.info("Normalisiere Daten für den Vergleich...") + logging.info("Normalisiere Daten für den Vergleich...") for df in [crm_df, matching_df]: df['normalized_name'] = df['CRM Name'].astype(str).apply(normalize_company_name) df['normalized_domain'] = df['CRM Website'].astype(str).apply(simple_normalize_url) @@ -113,26 +76,31 @@ def main(): results = [] for i, match_record in enumerate(matching_records): - best_score = -1 + best_score_info = {'total': -1, 'details': {'name': 0, 'location': 0, 'domain': 0}} best_match_name = "" - best_reason = "" logger.info(f"--- Prüfe {i + 1}/{len(matching_records)}: '{match_record.get('CRM Name', 'N/A')}' ---") - # Brute-Force-Vergleich: Jede Zeile wird mit jeder CRM-Zeile verglichen + # BRUTE-FORCE: Vergleiche mit jedem einzelnen CRM-Eintrag for crm_record in crm_records: - score, reason = calculate_similarity_with_details(match_record, crm_record) - if score > best_score: - best_score = score + score_info = calculate_similarity_details(match_record, crm_record) + + # Logge jeden interessanten Vergleich (Score > 60) + if score_info['total'] > 60: + logger.debug(f" - Kandidat: '{crm_record.get('CRM Name', 'N/A')}' -> Score: {score_info['total']} (Details: {score_info['details']})") + + if score_info['total'] > best_score_info['total']: + best_score_info = score_info best_match_name = crm_record.get('CRM Name', 'N/A') - best_reason = reason - logger.info(f" --> Bester Treffer: '{best_match_name}' mit Score {best_score} (Grund: {best_reason})") + logger.info(f" --> Bester Treffer: '{best_match_name}' mit Score {best_score_info['total']}") results.append({ - 'Potenzieller Treffer im CRM': best_match_name if best_score >= SCORE_THRESHOLD else "", - 'Ähnlichkeits-Score': best_score, - 'Matching-Grund': best_reason + 'Potenzieller Treffer im CRM': best_match_name if best_score_info['total'] >= SCORE_THRESHOLD else "", + 'Score (Gesamt)': best_score_info['total'], + 'Score (Name)': best_score_info['details']['name'], + 'Bonus (Standort)': best_score_info['details']['location'], + 'Bonus (Domain)': best_score_info['details']['domain'] }) logging.info("Matching abgeschlossen. Schreibe Ergebnisse zurück ins Sheet...") @@ -150,7 +118,6 @@ def main(): end_time = time.time() logger.info(f"Gesamtdauer des Duplikats-Checks: {end_time - start_time:.2f} Sekunden.") - logger.info(f"===== Skript beendet =====") if __name__ == "__main__": main() \ No newline at end of file