[31f88f42] Keine neuen Commits in dieser Session.

Keine neuen Commits in dieser Session.
This commit is contained in:
2026-03-10 13:54:07 +00:00
parent a3f79db2d2
commit 3fd3c5acfa
8 changed files with 268 additions and 9 deletions

View File

@@ -63,7 +63,8 @@ class Deduplicator:
Optimized for 10k-50k records.
"""
logger.info("Loading reference data for deduplication...")
query = self.db.query(Company.id, Company.name, Company.website, Company.city, Company.country)
# Include crm_id in the query
query = self.db.query(Company.id, Company.name, Company.website, Company.city, Company.country, Company.crm_id)
companies = query.all()
for c in companies:
@@ -72,6 +73,7 @@ class Deduplicator:
record = {
'id': c.id,
'crm_id': c.crm_id,
'name': c.name,
'normalized_name': norm_name,
'normalized_domain': norm_domain,
@@ -81,7 +83,7 @@ class Deduplicator:
self.reference_data.append(record)
# Build Indexes
if norm_domain:
if norm_domain and norm_domain != "k.a.":
self.domain_index.setdefault(norm_domain, []).append(record)
# Token Frequency
@@ -113,7 +115,7 @@ class Deduplicator:
candidates_to_check = {} # Map ID -> Record
# 1. Domain Match (Fastest)
if c_norm_domain and c_norm_domain in self.domain_index:
if c_norm_domain and c_norm_domain != "k.a." and c_norm_domain in self.domain_index:
for r in self.domain_index[c_norm_domain]:
candidates_to_check[r['id']] = r
@@ -123,6 +125,14 @@ class Deduplicator:
for r in self.token_index[rtok]:
candidates_to_check[r['id']] = r
if not candidates_to_check:
# Fallback: if neither the domain index nor the rare-token index produced a candidate, an exact name match may still exist that was never indexed (e.g. all tokens are stop words).
# This is rare but possible. We scan reference_data directly only when the candidate pool is empty and the normalized name is longer than 3 characters.
if len(c_norm_name) > 3:
for r in self.reference_data:
if r['normalized_name'] == c_norm_name:
candidates_to_check[r['id']] = r
if not candidates_to_check:
return []
@@ -135,12 +145,14 @@ class Deduplicator:
)
# Threshold Logic (Weak vs Strong)
# A match is "weak" if there is no domain match AND no location match
is_weak = (details['domain_match'] == 0 and not (details['loc_match']))
threshold = SCORE_THRESHOLD_WEAK if is_weak else SCORE_THRESHOLD
if score >= threshold:
matches.append({
'company_id': db_rec['id'],
'crm_id': db_rec['crm_id'],
'name': db_rec['name'],
'score': score,
'details': details
@@ -155,11 +167,11 @@ class Deduplicator:
# Exact Name Shortcut
if n1 and n1 == n2:
return 100, {'exact': True, 'domain_match': 0, 'loc_match': 0}
return 100, {'exact': True, 'domain_match': 0, 'loc_match': 1 if (cand['c'] and ref['city'] and cand['c'] == ref['city']) else 0, 'name_score': 100, 'penalties': 0}
# Domain
d1, d2 = cand['d'], ref['normalized_domain']
domain_match = 1 if (d1 and d2 and d1 == d2) else 0
domain_match = 1 if (d1 and d2 and d1 != "k.a." and d1 == d2) else 0
# Location
city_match = 1 if (cand['c'] and ref['city'] and cand['c'] == ref['city']) else 0
@@ -176,7 +188,8 @@ class Deduplicator:
ss = fuzz.token_sort_ratio(clean1, clean2)
name_score = max(ts, pr, ss)
else:
name_score = 0
# If cleaning removed everything, fall back to a raw fuzzy ratio on the normalized names
name_score = fuzz.ratio(n1, n2) if (n1 and n2) else 0
# Penalties
penalties = 0
@@ -194,7 +207,7 @@ class Deduplicator:
total = name_score
if loc_match:
total += 10 # Bonus
total += 10 # Bonus for location match
total -= penalties