diff --git a/duplicate_checker_old.py b/duplicate_checker_old.py
index 555b1d6e..3f223628 100644
--- a/duplicate_checker_old.py
+++ b/duplicate_checker_old.py
@@ -104,6 +104,13 @@ def assess_serp_trust(company_name: str, url: str) -> str:
 
 # --- Similarity ---
 def calculate_similarity(mrec: dict, crec: dict, token_freq: Counter):
+    n1 = mrec.get('normalized_name','')
+    n2 = crec.get('normalized_name','')
+
+    # NEU: Direkte Prämierung für exakten Namens-Match
+    if n1 and n1 == n2:
+        return 300, {'name': 100, 'exact_match': 1}
+
     # Domain (mit Gate)
     dom1 = mrec.get('normalized_domain','')
     dom2 = crec.get('normalized_domain','')
@@ -314,39 +321,61 @@ def main():
     for idx, mrow in match_df.to_dict('index').items():
         processed += 1
         name_disp = mrow.get('CRM Name','')
 
-        # Kandidatenwahl
-        candidates = []
-        used_block = ''
+
+        # --- NEUE KANDIDATEN-SAMMELLOGIK ---
+        candidate_records = {} # Dict, um Duplikate zu vermeiden und Records zu speichern
+        used_blocks = []
+
+        # 1. Priorität: Exakter Namens-Match
+        mrec_norm_name = mrow.get('normalized_name')
+        if mrec_norm_name:
+            exact_matches = crm_df[crm_df['normalized_name'] == mrec_norm_name]
+            if not exact_matches.empty:
+                for _, record in exact_matches.to_dict('index').items():
+                    candidate_records[record['CRM Name']] = record
+                used_blocks.append('exact_name')
+
+        # 2. Domain-Match
         if mrow.get('normalized_domain') and mrow.get('domain_use_flag') == 1:
-            candidates = domain_index.get(mrow['normalized_domain'], [])
-            used_block = f"domain:{mrow['normalized_domain']}"
-        if not candidates:
-            rtok = choose_rarest_token(mrow.get('normalized_name',''), token_freq)
-            if rtok:
-                candidates = token_index.get(rtok, [])
-                used_block = f"token:{rtok}"
-        if not candidates:
-            # Prefilter über gesamte CRM-Liste (strenger + limitierter; erfordert Rarest-Token-Overlap)
+            domain_cands = domain_index.get(mrow['normalized_domain'], [])
+            if domain_cands:
+                for record in domain_cands:
+                    candidate_records[record['CRM Name']] = record
+                used_blocks.append('domain')
+
+        # 3. Rarest-Token-Match
+        rtok = choose_rarest_token(mrow.get('normalized_name',''), token_freq)
+        if rtok:
+            token_cands = token_index.get(rtok, [])
+            if token_cands:
+                for record in token_cands:
+                    candidate_records[record['CRM Name']] = record
+                used_blocks.append('token')
+
+        # 4. Prefilter als Fallback, wenn wenige Kandidaten gefunden wurden
+        if len(candidate_records) < PREFILTER_LIMIT:
             pf = []
             n1 = mrow.get('normalized_name','')
             rtok = choose_rarest_token(n1, token_freq)
             clean1, toks1 = clean_name_for_scoring(n1)
             if clean1:
                 for r in crm_records:
+                    if r['CRM Name'] in candidate_records: continue # Nicht erneut prüfen
                     n2 = r.get('normalized_name','')
                     clean2, toks2 = clean_name_for_scoring(n2)
-                    if not clean2:
-                        continue
-                    if rtok and rtok not in toks2:
+                    if not clean2 or (rtok and rtok not in toks2):
                         continue
                     pr = fuzz.partial_ratio(clean1, clean2)
                     if pr >= PREFILTER_MIN_PARTIAL:
                         pf.append((pr, r))
             pf.sort(key=lambda x: x[0], reverse=True)
-            candidates = [r for _, r in pf[:PREFILTER_LIMIT]]
-            used_block = f"prefilter:{PREFILTER_MIN_PARTIAL}/{len(pf)}"
+            for _, record in pf[:PREFILTER_LIMIT]:
+                candidate_records[record['CRM Name']] = record
+            if pf: used_blocks.append('prefilter')
+
+        candidates = list(candidate_records.values())
+        logger.info(f"Prüfe {processed}/{total}: '{name_disp}' -> {len(candidates)} Kandidaten (Blocks={','.join(used_blocks)})")
-        logger.info(f"Prüfe {processed}/{total}: '{name_disp}' -> {len(candidates)} Kandidaten (Block={used_block})")
         if not candidates:
             results.append({'Match':'', 'Score':0, 'Match_Grund':'keine Kandidaten'})
             continue