diff --git a/duplicate_checker.py b/duplicate_checker.py
index 05de2aab..a57a4ad5 100644
--- a/duplicate_checker.py
+++ b/duplicate_checker.py
@@ -1,4 +1,4 @@
-# duplicate_checker.py (v2.5 - Final Hybrid Approach)
+# duplicate_checker.py (v2.2 - Multi-Key Blocking & optimized scoring)
 import logging
 import pandas as pd
@@ -7,19 +7,11 @@
 from config import Config
 from helpers import normalize_company_name, simple_normalize_url
 from google_sheet_handler import GoogleSheetHandler
 from collections import defaultdict
-import time
 
 # --- Configuration ---
 CRM_SHEET_NAME = "CRM_Accounts"
 MATCHING_SHEET_NAME = "Matching_Accounts"
-SCORE_THRESHOLD = 85  # Only show matches that reach or exceed this score
-
-# Extended list of generic words that are ignored for blocking
-BLOCKING_STOP_WORDS = {
-    'gmbh', 'ag', 'co', 'kg', 'se', 'holding', 'gruppe', 'industries', 'systems', 'technik', 'service',
-    'services', 'solutions', 'management', 'international', 'und', 'germany', 'deutschland', 'gbr',
-    'mbh', 'company', 'limited', 'logistics', 'construction', 'products', 'group'
-}
+SCORE_THRESHOLD = 85  # Slightly higher threshold for better precision
 
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -30,9 +22,9 @@ def calculate_similarity_details(record1, record2):
     if record1.get('normalized_domain') and record1['normalized_domain'] != 'k.a.' and record1['normalized_domain'] == record2.get('normalized_domain'):
         scores['domain'] = 100
 
+    # Weight the name more heavily, since the website is often missing
     if record1.get('normalized_name') and record2.get('normalized_name'):
-        # We use token_sort_ratio for a good balance between word order and content
-        scores['name'] = round(fuzz.token_sort_ratio(record1['normalized_name'], record2['normalized_name']) * 0.85)
+        scores['name'] = round(fuzz.token_set_ratio(record1['normalized_name'], record2['normalized_name']) * 0.85)
 
     if record1.get('CRM Ort') and record1['CRM Ort'] == record2.get('CRM Ort'):
         if record1.get('CRM Land') and record1['CRM Land'] == record2.get('CRM Land'):
@@ -42,18 +34,28 @@
     return {'total': total_score, 'details': scores}
 
 def create_blocking_keys(name):
-    """Builds blocking keys from all significant words of a name."""
+    """Builds several blocking keys per name to increase sensitivity."""
     if not name: return []
-    # Filter out stop words and very short words (e.g. '&') from the word list
-    significant_words = {word for word in name.split() if word not in BLOCKING_STOP_WORDS and len(word) > 2}
-    return list(significant_words)
+
+    words = name.split()
+    keys = set()
+
+    # 1. First word
+    if len(words) > 0:
+        keys.add(words[0])
+    # 2. Second word (if present)
+    if len(words) > 1:
+        keys.add(words[1])
+    # 3. First 4 letters of the first word
+    if len(words) > 0 and len(words[0]) >= 4:
+        keys.add(words[0][:4])
+
+    return list(keys)
 
 def main():
-    start_time = time.time()
-    logging.info("Starting the duplicate check (v2.5 - Final Hybrid Approach)...")
+    logging.info("Starting the duplicate check (v2.2 with multi-key blocking)...")
 
-    # ... (initialization and data loading stays the same) ...
     try:
         sheet_handler = GoogleSheetHandler()
     except Exception as e:
@@ -79,8 +81,7 @@ def main():
 
     logging.info("Building an index for the CRM data to speed things up...")
     crm_index = defaultdict(list)
-    crm_records = crm_df.to_dict('records')
-    for record in crm_records:
+    for record in crm_df.to_dict('records'):
         for key in record['block_keys']:
             crm_index[key].append(record)
 
@@ -98,7 +99,9 @@ def main():
             for crm_record in crm_index.get(key, []):
                 candidate_pool[crm_record['CRM Name']] = crm_record
 
-        # Brute-force comparison within the smart block
+        if not candidate_pool:
+            logging.debug(" -> No candidates found in the index.")
+
         for crm_record in candidate_pool.values():
            score_info = calculate_similarity_details(match_record, crm_record)
            if score_info['total'] > best_score_info['total']:
@@ -126,8 +129,5 @@ def main():
     else:
         logging.error("ERROR writing the results to the Google Sheet.")
 
-    end_time = time.time()
-    logging.info(f"Total duration of the duplicate check: {end_time - start_time:.2f} seconds.")
-
 if __name__ == "__main__":
     main()
\ No newline at end of file
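
Reviewer sketch, not part of the patch: a minimal standalone example of how the new multi-key blocking and the token_sort_ratio -> token_set_ratio change behave. It assumes `fuzz` comes from rapidfuzz (or the API-compatible thefuzz), since that import sits outside the shown hunks, and the company names below are invented.

# blocking_scoring_demo.py - illustrative sketch only; mirrors the patched create_blocking_keys()
from rapidfuzz import fuzz  # assumption: the project gets `fuzz` from rapidfuzz or thefuzz

def create_blocking_keys(name):
    """Multi-key blocking as in the patch: first word, second word, 4-letter prefix."""
    if not name:
        return []
    words = name.split()
    keys = set()
    if len(words) > 0:
        keys.add(words[0])        # first word
    if len(words) > 1:
        keys.add(words[1])        # second word, if present
    if len(words) > 0 and len(words[0]) >= 4:
        keys.add(words[0][:4])    # first 4 letters of the first word
    return list(keys)

if __name__ == "__main__":
    # Hypothetical, already-normalized name (normalize_company_name is out of scope here).
    print(sorted(create_blocking_keys("mueller logistik")))
    # -> ['logistik', 'muel', 'mueller']

    # token_set_ratio returns 100 when one token set is a subset of the other,
    # so it tolerates extra words (e.g. legal-form suffixes) better than token_sort_ratio.
    a, b = "mueller logistik", "mueller logistik und spedition"
    print(fuzz.token_sort_ratio(a, b))  # clearly below 100
    print(fuzz.token_set_ratio(a, b))   # 100 (tokens of `a` are a subset of `b`)

The extra keys deliberately widen each candidate pool; per the comments in the patch, the raised SCORE_THRESHOLD of 85 and the subset-tolerant token_set_ratio name score are what keep precision up afterwards.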