feat(duplicate_checker): Verbesserte Kandidatenauswahl und Match-Priorisierung

- Kandidatensuche sammelt Treffer aus allen Quellen (Domain, Token, Name)

- Exakte Namens-Matches werden durch einen hohen Score priorisiert

- Verhindert, dass Domain-Matches exakte Namens-Matches überschatten
This commit is contained in:
Gemini CLI
2025-11-08 11:35:46 +00:00
parent bb0c6db85b
commit 9986722554

View File

@@ -104,6 +104,13 @@ def assess_serp_trust(company_name: str, url: str) -> str:
# --- Similarity ---
def calculate_similarity(mrec: dict, crec: dict, token_freq: Counter):
n1 = mrec.get('normalized_name','')
n2 = crec.get('normalized_name','')
# NEU: Direkte Prämierung für exakten Namens-Match
if n1 and n1 == n2:
return 300, {'name': 100, 'exact_match': 1}
# Domain (mit Gate)
dom1 = mrec.get('normalized_domain','')
dom2 = crec.get('normalized_domain','')
@@ -314,39 +321,61 @@ def main():
for idx, mrow in match_df.to_dict('index').items():
processed += 1
name_disp = mrow.get('CRM Name','')
# Kandidatenwahl
candidates = []
used_block = ''
# --- NEUE KANDIDATEN-SAMMELLOGIK ---
candidate_records = {} # Dict, um Duplikate zu vermeiden und Records zu speichern
used_blocks = []
# 1. Priorität: Exakter Namens-Match
mrec_norm_name = mrow.get('normalized_name')
if mrec_norm_name:
exact_matches = crm_df[crm_df['normalized_name'] == mrec_norm_name]
if not exact_matches.empty:
for _, record in exact_matches.to_dict('index').items():
candidate_records[record['CRM Name']] = record
used_blocks.append('exact_name')
# 2. Domain-Match
if mrow.get('normalized_domain') and mrow.get('domain_use_flag') == 1:
candidates = domain_index.get(mrow['normalized_domain'], [])
used_block = f"domain:{mrow['normalized_domain']}"
if not candidates:
rtok = choose_rarest_token(mrow.get('normalized_name',''), token_freq)
if rtok:
candidates = token_index.get(rtok, [])
used_block = f"token:{rtok}"
if not candidates:
# Prefilter über gesamte CRM-Liste (strenger + limitierter; erfordert Rarest-Token-Overlap)
domain_cands = domain_index.get(mrow['normalized_domain'], [])
if domain_cands:
for record in domain_cands:
candidate_records[record['CRM Name']] = record
used_blocks.append('domain')
# 3. Rarest-Token-Match
rtok = choose_rarest_token(mrow.get('normalized_name',''), token_freq)
if rtok:
token_cands = token_index.get(rtok, [])
if token_cands:
for record in token_cands:
candidate_records[record['CRM Name']] = record
used_blocks.append('token')
# 4. Prefilter als Fallback, wenn wenige Kandidaten gefunden wurden
if len(candidate_records) < PREFILTER_LIMIT:
pf = []
n1 = mrow.get('normalized_name','')
rtok = choose_rarest_token(n1, token_freq)
clean1, toks1 = clean_name_for_scoring(n1)
if clean1:
for r in crm_records:
if r['CRM Name'] in candidate_records: continue # Nicht erneut prüfen
n2 = r.get('normalized_name','')
clean2, toks2 = clean_name_for_scoring(n2)
if not clean2:
continue
if rtok and rtok not in toks2:
if not clean2 or (rtok and rtok not in toks2):
continue
pr = fuzz.partial_ratio(clean1, clean2)
if pr >= PREFILTER_MIN_PARTIAL:
pf.append((pr, r))
pf.sort(key=lambda x: x[0], reverse=True)
candidates = [r for _, r in pf[:PREFILTER_LIMIT]]
used_block = f"prefilter:{PREFILTER_MIN_PARTIAL}/{len(pf)}"
for _, record in pf[:PREFILTER_LIMIT]:
candidate_records[record['CRM Name']] = record
if pf: used_blocks.append('prefilter')
candidates = list(candidate_records.values())
logger.info(f"Prüfe {processed}/{total}: '{name_disp}' -> {len(candidates)} Kandidaten (Blocks={','.join(used_blocks)})")
logger.info(f"Prüfe {processed}/{total}: '{name_disp}' -> {len(candidates)} Kandidaten (Block={used_block})")
if not candidates:
results.append({'Match':'', 'Score':0, 'Match_Grund':'keine Kandidaten'})
continue