feat(duplicate_checker): Verbesserte Kandidatenauswahl und Match-Priorisierung
- Kandidatensuche sammelt Treffer aus allen Quellen (Domain, Token, Name)
- Exakte Namens-Matches werden durch einen hohen Score priorisiert
- Verhindert, dass Domain-Matches exaktere Namens-Matches überschatten
This commit is contained in:
@@ -104,6 +104,13 @@ def assess_serp_trust(company_name: str, url: str) -> str:
|
||||
|
||||
# --- Similarity ---
|
||||
def calculate_similarity(mrec: dict, crec: dict, token_freq: Counter):
|
||||
n1 = mrec.get('normalized_name','')
|
||||
n2 = crec.get('normalized_name','')
|
||||
|
||||
# NEU: Direkte Prämierung für exakten Namens-Match
|
||||
if n1 and n1 == n2:
|
||||
return 300, {'name': 100, 'exact_match': 1}
|
||||
|
||||
# Domain (mit Gate)
|
||||
dom1 = mrec.get('normalized_domain','')
|
||||
dom2 = crec.get('normalized_domain','')
|
||||
@@ -314,39 +321,61 @@ def main():
|
||||
for idx, mrow in match_df.to_dict('index').items():
|
||||
processed += 1
|
||||
name_disp = mrow.get('CRM Name','')
|
||||
# Kandidatenwahl
|
||||
candidates = []
|
||||
used_block = ''
|
||||
|
||||
# --- NEUE KANDIDATEN-SAMMELLOGIK ---
|
||||
candidate_records = {} # Dict, um Duplikate zu vermeiden und Records zu speichern
|
||||
used_blocks = []
|
||||
|
||||
# 1. Priorität: Exakter Namens-Match
|
||||
mrec_norm_name = mrow.get('normalized_name')
|
||||
if mrec_norm_name:
|
||||
exact_matches = crm_df[crm_df['normalized_name'] == mrec_norm_name]
|
||||
if not exact_matches.empty:
|
||||
for _, record in exact_matches.to_dict('index').items():
|
||||
candidate_records[record['CRM Name']] = record
|
||||
used_blocks.append('exact_name')
|
||||
|
||||
# 2. Domain-Match
|
||||
if mrow.get('normalized_domain') and mrow.get('domain_use_flag') == 1:
|
||||
candidates = domain_index.get(mrow['normalized_domain'], [])
|
||||
used_block = f"domain:{mrow['normalized_domain']}"
|
||||
if not candidates:
|
||||
rtok = choose_rarest_token(mrow.get('normalized_name',''), token_freq)
|
||||
if rtok:
|
||||
candidates = token_index.get(rtok, [])
|
||||
used_block = f"token:{rtok}"
|
||||
if not candidates:
|
||||
# Prefilter über gesamte CRM-Liste (strenger + limitierter; erfordert Rarest-Token-Overlap)
|
||||
domain_cands = domain_index.get(mrow['normalized_domain'], [])
|
||||
if domain_cands:
|
||||
for record in domain_cands:
|
||||
candidate_records[record['CRM Name']] = record
|
||||
used_blocks.append('domain')
|
||||
|
||||
# 3. Rarest-Token-Match
|
||||
rtok = choose_rarest_token(mrow.get('normalized_name',''), token_freq)
|
||||
if rtok:
|
||||
token_cands = token_index.get(rtok, [])
|
||||
if token_cands:
|
||||
for record in token_cands:
|
||||
candidate_records[record['CRM Name']] = record
|
||||
used_blocks.append('token')
|
||||
|
||||
# 4. Prefilter als Fallback, wenn wenige Kandidaten gefunden wurden
|
||||
if len(candidate_records) < PREFILTER_LIMIT:
|
||||
pf = []
|
||||
n1 = mrow.get('normalized_name','')
|
||||
rtok = choose_rarest_token(n1, token_freq)
|
||||
clean1, toks1 = clean_name_for_scoring(n1)
|
||||
if clean1:
|
||||
for r in crm_records:
|
||||
if r['CRM Name'] in candidate_records: continue # Nicht erneut prüfen
|
||||
n2 = r.get('normalized_name','')
|
||||
clean2, toks2 = clean_name_for_scoring(n2)
|
||||
if not clean2:
|
||||
continue
|
||||
if rtok and rtok not in toks2:
|
||||
if not clean2 or (rtok and rtok not in toks2):
|
||||
continue
|
||||
pr = fuzz.partial_ratio(clean1, clean2)
|
||||
if pr >= PREFILTER_MIN_PARTIAL:
|
||||
pf.append((pr, r))
|
||||
pf.sort(key=lambda x: x[0], reverse=True)
|
||||
candidates = [r for _, r in pf[:PREFILTER_LIMIT]]
|
||||
used_block = f"prefilter:{PREFILTER_MIN_PARTIAL}/{len(pf)}"
|
||||
for _, record in pf[:PREFILTER_LIMIT]:
|
||||
candidate_records[record['CRM Name']] = record
|
||||
if pf: used_blocks.append('prefilter')
|
||||
|
||||
candidates = list(candidate_records.values())
|
||||
logger.info(f"Prüfe {processed}/{total}: '{name_disp}' -> {len(candidates)} Kandidaten (Blocks={','.join(used_blocks)})")
|
||||
|
||||
logger.info(f"Prüfe {processed}/{total}: '{name_disp}' -> {len(candidates)} Kandidaten (Block={used_block})")
|
||||
if not candidates:
|
||||
results.append({'Match':'', 'Score':0, 'Match_Grund':'keine Kandidaten'})
|
||||
continue
|
||||
|
||||
Reference in New Issue
Block a user