diff --git a/duplicate_checker.py b/duplicate_checker.py
index d692e22f..555b1d6e 100644
--- a/duplicate_checker.py
+++ b/duplicate_checker.py
@@ -10,21 +10,24 @@ from helpers import normalize_company_name, simple_normalize_url, serp_website_l
 from config import Config
 from google_sheet_handler import GoogleSheetHandler
 
-# duplicate_checker.py v2.14 (Quality-first + SERP only when B/E empty: Domain-Gate, Location-Penalties, Smart Blocking, Serp-Trust, Metrics)
-# Version-Build: dynamic timestamp below
+# duplicate_checker.py v2.15
+# Quality-first ++: Domain-Gate, Location-Penalties, Smart Blocking (IDF-light),
+# Serp-Trust, Weak-Threshold, City-Bias-Guard, tightened Prefilter, Metrics
+# Build timestamp is injected into the logfile name.
 
 # --- Configuration ---
 CRM_SHEET_NAME = "CRM_Accounts"
 MATCHING_SHEET_NAME = "Matching_Accounts"
-SCORE_THRESHOLD = 80        # threshold for auto-match
-MIN_NAME_FOR_DOMAIN = 70    # domain match counts only if name >= 70 OR city matches
+SCORE_THRESHOLD = 80        # default threshold
+SCORE_THRESHOLD_WEAK = 95   # threshold when neither domain nor (city & country) match
+MIN_NAME_FOR_DOMAIN = 70    # domain score only when name >= 70 OR city+country match
 CITY_MISMATCH_PENALTY = 30
 COUNTRY_MISMATCH_PENALTY = 40
-PREFILTER_MIN_PARTIAL = 60  # prefilter over the full CRM list when blocking finds no candidates
-PREFILTER_LIMIT = 50        # max. candidates taken from the prefilter
-LOG_DIR = "Log"
+PREFILTER_MIN_PARTIAL = 70  # (previously 60)
+PREFILTER_LIMIT = 30        # (previously 50)
+LOG_DIR = "Log"
 now = datetime.now().strftime('%Y-%m-%d_%H-%M')
-LOG_FILE = f"{now}_duplicate_check_v2.14.txt"
+LOG_FILE = f"{now}_duplicate_check_v2.15.txt"
 
 # --- Logging Setup ---
 if not os.path.exists(LOG_DIR):
@@ -45,7 +48,7 @@ fh.setFormatter(formatter)
 root.addHandler(fh)
 logger = logging.getLogger(__name__)
 logger.info(f"Logging to console and file: {log_path}")
-logger.info(f"Starting duplicate_checker.py v2.14 | Build: {now}")
+logger.info(f"Starting duplicate_checker.py v2.15 | Build: {now}")
 
 # --- Load SerpAPI key ---
 try:
@@ -57,91 +60,124 @@ except Exception as e:
     logger.warning(f"Fehler beim Laden API-Keys: {e}")
     serp_key = None
 
-STOP_TOKENS = {
-    'gmbh','mbh','ag','kg','ug','ohg','se','co','kg','kgaa','inc','llc','ltd','sarl',
+# --- Stop/city tokens ---
+STOP_TOKENS_BASE = {
+    'gmbh','mbh','ag','kg','ug','ohg','se','co','kgaa','inc','llc','ltd','sarl',
     'holding','gruppe','group','international','solutions','solution','service','services',
     'deutschland','austria','germany','technik','technology','technologies','systems','systeme',
-    'logistik','logistics','industries','industrie','management','consulting','vertrieb','handel'
+    'logistik','logistics','industries','industrie','management','consulting','vertrieb','handel',
+    'company','gesellschaft','mbh&co','mbhco','werke','werk','renkhoff','sonnenschutztechnik'
 }
+CITY_TOKENS = set()  # filled dynamically after data normalization
+
+# --- Utilities ---
+def _tokenize(s: str):
+    if not s:
+        return []
+    return re.split(r"[^a-z0-9]+", str(s).lower())
 
-# --- Utilitys ---
 def split_tokens(name: str):
+    """Tokens for indexing/scoring (base stop tokens + dynamic city tokens)."""
     if not name:
         return []
-    return [t for t in str(name).split() if len(t) >= 3 and t not in STOP_TOKENS]
+    tokens = [t for t in _tokenize(name) if len(t) >= 3]
+    stop_union = STOP_TOKENS_BASE | CITY_TOKENS
+    return [t for t in tokens if t not in stop_union]
+
+def clean_name_for_scoring(norm_name: str):
+    """Strips stop and city tokens. Empty output => no meaningful name comparison."""
+    toks = split_tokens(norm_name)
+    return " ".join(toks), set(toks)
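Note: a minimal, self-contained sketch of how this cleaning pipeline behaves end to end. The STOP and CITIES sets below are stand-ins for illustration; the real code uses STOP_TOKENS_BASE plus the CITY_TOKENS set built later in main().

import re

STOP = {'gmbh', 'holding', 'services'}   # stand-in for STOP_TOKENS_BASE
CITIES = {'hamburg'}                     # stand-in for the dynamic CITY_TOKENS

def tokenize(s):
    # mirrors _tokenize: lowercase, split on anything that is not [a-z0-9]
    return [t for t in re.split(r"[^a-z0-9]+", str(s).lower()) if t]

def clean(name):
    # mirrors split_tokens + clean_name_for_scoring: drop short, stop, and city tokens
    toks = [t for t in tokenize(name) if len(t) >= 3 and t not in STOP | CITIES]
    return " ".join(toks), set(toks)

print(clean("Mustermann Sonnenschutz GmbH & Co. KG Hamburg"))
# ('mustermann sonnenschutz', {'mustermann', 'sonnenschutz'})  -- set order may vary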
 
 def assess_serp_trust(company_name: str, url: str) -> str:
-    """Simple trust level for a researched URL: hoch/mittel/niedrig."""
+    """Trust 'hoch/mittel/niedrig' based on token occurrences in the domain."""
     if not url:
         return 'n/a'
     host = simple_normalize_url(url) or ''
     host = host.replace('www.', '')
-    tokens = [t for t in split_tokens(normalize_company_name(company_name)) if len(t) >= 4]
-    if any(t in host for t in tokens):
+    name_toks = [t for t in split_tokens(normalize_company_name(company_name)) if len(t) >= 3]
+    if any(t in host for t in name_toks if len(t) >= 4):
        return 'hoch'
-    tokens3 = [t for t in split_tokens(normalize_company_name(company_name)) if len(t) == 3]
-    if any(t in host for t in tokens3):
+    if any(t in host for t in name_toks if len(t) == 3):
        return 'mittel'
     return 'niedrig'
 
-# --- Similarity calculation ---
-def calculate_similarity(mrec: dict, crec: dict):
-    # Domain component (with gate)
+# --- Similarity ---
+def calculate_similarity(mrec: dict, crec: dict, token_freq: Counter):
+    # Domain (with gate)
     dom1 = mrec.get('normalized_domain','')
     dom2 = crec.get('normalized_domain','')
-    m_domain_use = mrec.get('domain_use_flag', 0)  # 1 only if original URL or SERP trust 'hoch'
+    m_domain_use = mrec.get('domain_use_flag', 0)
     domain_flag_raw = 1 if (m_domain_use == 1 and dom1 and dom1 == dom2) else 0
 
-    # Location
+    # Location flags
     city_match = 1 if (mrec.get('CRM Ort') and crec.get('CRM Ort') and mrec.get('CRM Ort') == crec.get('CRM Ort')) else 0
     country_match = 1 if (mrec.get('CRM Land') and crec.get('CRM Land') and mrec.get('CRM Land') == crec.get('CRM Land')) else 0
 
-    # Name
-    n1, n2 = mrec.get('normalized_name',''), crec.get('normalized_name','')
-    if n1 and n2:
-        ts = fuzz.token_set_ratio(n1,n2)
-        pr = fuzz.partial_ratio(n1,n2)
-        ss = fuzz.token_sort_ratio(n1,n2)
-        name_score = max(ts,pr,ss)
+    # Name (meaningful tokens only)
+    n1 = mrec.get('normalized_name','')
+    n2 = crec.get('normalized_name','')
+    clean1, toks1 = clean_name_for_scoring(n1)
+    clean2, toks2 = clean_name_for_scoring(n2)
+
+    # Overlaps
+    overlap_clean = toks1 & toks2
+    # City-only overlap check: nothing left after cleaning, but the raw overlap may
+    # still contain city tokens; in that case the name score is capped below.
+    raw_overlap = set(_tokenize(n1)) & set(_tokenize(n2))
+    city_only_overlap = (not overlap_clean) and any(t in CITY_TOKENS for t in raw_overlap)
+
+    # Name score
+    if clean1 and clean2:
+        ts = fuzz.token_set_ratio(clean1, clean2)
+        pr = fuzz.partial_ratio(clean1, clean2)
+        ss = fuzz.token_sort_ratio(clean1, clean2)
+        name_score = max(ts, pr, ss)
     else:
        name_score = 0
 
-    # Domain gate: the domain only counts if name >= MIN_NAME_FOR_DOMAIN OR city+country match
+    if city_only_overlap and name_score > 70:
+        name_score = 70  # cap
+
+    # Rare-token overlap (IDF-light): use the rarest token from mrec
+    rtoks_sorted = sorted(toks1, key=lambda t: (token_freq.get(t, 10**9), -len(t)))
+    rare_token = rtoks_sorted[0] if rtoks_sorted else None
+    rare_overlap = 1 if (rare_token and rare_token in toks2) else 0
+
+    # Domain gate
     domain_gate_ok = (name_score >= MIN_NAME_FOR_DOMAIN) or (city_match and country_match)
-    domain_flag = 1 if (domain_flag_raw and domain_gate_ok) else 0
+    domain_used = 1 if (domain_flag_raw and domain_gate_ok) else 0
 
     # Base score
-    total = domain_flag*100 + name_score*1.0 + (1 if (city_match and country_match) else 0)*20
+    total = domain_used*100 + name_score*1.0 + (1 if (city_match and country_match) else 0)*20
 
-    # Penalties on mismatch (only applied when the fields are filled and there is no full location match)
+    # Penalties
     penalties = 0
     if mrec.get('CRM Land') and crec.get('CRM Land') and not country_match:
         penalties += COUNTRY_MISMATCH_PENALTY
     if mrec.get('CRM Ort') and crec.get('CRM Ort') and not city_match:
         penalties += CITY_MISMATCH_PENALTY
-
     total -= penalties
 
-    # Bonus for pure name matches (no domain, no location) when strong
-    bonus_flag = 1 if (domain_flag == 0 and not (city_match and country_match) and name_score >= 85) else 0
-    if bonus_flag:
+    # Bonus for strong name-only cases
+    name_bonus = 1 if (domain_used == 0 and not (city_match and country_match) and name_score >= 85 and rare_overlap == 1) else 0
+    if name_bonus:
         total += 20
 
-    return (
-        round(total),
-        {
-            'domain_raw': domain_flag_raw,
-            'domain_used': domain_flag,
-            'domain_gate_ok': int(domain_gate_ok),
-            'name': round(name_score,1),
-            'city_match': city_match,
-            'country_match': country_match,
-            'penalties': penalties,
-            'name_bonus': bonus_flag
-        }
-    )
+    comp = {
+        'domain_raw': domain_flag_raw,
+        'domain_used': domain_used,
+        'domain_gate_ok': int(domain_gate_ok),
+        'name': round(name_score,1),
+        'city_match': city_match,
+        'country_match': country_match,
+        'penalties': penalties,
+        'name_bonus': name_bonus,
+        'rare_overlap': rare_overlap,
+        'city_only_overlap': int(city_only_overlap)
+    }
+    return round(total), comp
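Note: the score arithmetic above, condensed into one worked example with hypothetical component values (the fuzz ratios are 0-100; thresholds and penalties as configured at the top of the file):

# total = 100*domain_used + name_score + 20*(city AND country) - penalties (+ 20 name bonus)
# Hypothetical pair: strong name, unusable domain, city matches but country differs.
domain_used, name_score = 0, 88
city_match, country_match = 1, 0
rare_overlap = 1
penalties = 40   # COUNTRY_MISMATCH_PENALTY; both country fields filled, values differ

total = domain_used * 100 + name_score + (20 if city_match and country_match else 0) - penalties
name_bonus = int(domain_used == 0 and not (city_match and country_match)
                 and name_score >= 85 and rare_overlap == 1)
total += 20 * name_bonus
print(total)  # 88 - 40 + 20 = 68: below SCORE_THRESHOLD (80), so no match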
 
-# --- Prepare blocking ---
+# --- Indexes ---
 def build_indexes(crm_df: pd.DataFrame):
     records = list(crm_df.to_dict('records'))
     # Domain index
@@ -150,31 +186,31 @@ def build_indexes(crm_df: pd.DataFrame):
         d = r.get('normalized_domain')
         if d:
             domain_index.setdefault(d, []).append(r)
-    # Token frequencies
+    # Token frequencies (on cleaned tokens)
     token_freq = Counter()
     for r in records:
-        for t in set(split_tokens(r.get('normalized_name',''))):
+        _, toks = clean_name_for_scoring(r.get('normalized_name',''))
+        for t in set(toks):
             token_freq[t] += 1
-    # Token index (meaningful tokens only)
+    # Token index
     token_index = {}
     for r in records:
-        toks = [t for t in set(split_tokens(r.get('normalized_name',''))) if token_freq[t] > 0]
-        for t in toks:
+        _, toks = clean_name_for_scoring(r.get('normalized_name',''))
+        for t in set(toks):
             token_index.setdefault(t, []).append(r)
     return records, domain_index, token_freq, token_index
 
 def choose_rarest_token(norm_name: str, token_freq: Counter):
-    toks = [t for t in split_tokens(norm_name) if len(t) >= 4 and token_freq.get(t, 0) > 0]
+    _, toks = clean_name_for_scoring(norm_name)
     if not toks:
         return None
-    # Rarest (lowest frequency), tie-break: longest token
-    toks.sort(key=lambda x: (token_freq.get(x, 0), -len(x)))
-    return toks[0]
+    lst = sorted(toks, key=lambda x: (token_freq.get(x, 10**9), -len(x)))
+    return lst[0]
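Note: blocking-key selection in isolation. The rarest cleaned token (lowest CRM corpus frequency, longer token wins ties, unknown tokens sort last via the 10**9 default) becomes the index key. A toy run with a hypothetical frequency table:

from collections import Counter

token_freq = Counter({'maschinenbau': 412, 'mueller': 310, 'pumpen': 57, 'krautzberger': 2})

def rarest(tokens, freq):
    # same sort key as choose_rarest_token above
    ranked = sorted(tokens, key=lambda t: (freq.get(t, 10**9), -len(t)))
    return ranked[0] if ranked else None

print(rarest({'mueller', 'pumpen'}, token_freq))       # pumpen       (57 < 310)
print(rarest({'krautzberger', 'pumpen'}, token_freq))  # krautzberger (2 < 57)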
 
 # --- Main function ---
 def main():
-    logger.info("Starte Duplikats-Check v2.14 (Quality-first)")
+    logger.info("Starte Duplikats-Check v2.15 (Quality-first++)")
     try:
         sheet = GoogleSheetHandler()
         logger.info("GoogleSheetHandler initialisiert")
@@ -190,12 +226,10 @@ def main():
         logger.critical("Leere Daten in einem der Sheets. Abbruch.")
         return
 
-    # SerpAPI only for matching (missing URLs in B/E) → store in 'Gefundene Website'
+    # SerpAPI only for matching (B and E both empty)
     if serp_key:
-        # Make sure column E exists
         if 'Gefundene Website' not in match_df.columns:
             match_df['Gefundene Website'] = ''
-        # B/E both empty? Only then search. Skip everything else.
         b_empty = match_df['CRM Website'].fillna('').astype(str).str.strip().str.lower().isin(['','k.a.','k.a','n/a','na'])
         e_empty = match_df['Gefundene Website'].fillna('').astype(str).str.strip().str.lower().isin(['','k.a.','k.a','n/a','na'])
         empty_mask = b_empty & e_empty
@@ -231,9 +265,9 @@ def main():
     crm_df['CRM Ort'] = crm_df['CRM Ort'].astype(str).str.lower().str.strip()
     crm_df['CRM Land'] = crm_df['CRM Land'].astype(str).str.lower().str.strip()
     crm_df['block_key'] = crm_df['normalized_name'].apply(lambda x: x.split()[0] if x else None)
-    crm_df['domain_use_flag'] = 1  # the CRM domain is always considered trustworthy
+    crm_df['domain_use_flag'] = 1  # the CRM domain is considered trustworthy
 
-    # Normalize matching data (effective website: original or found one, but use the domain only when trust = 'hoch')
+    # Normalize matching data
     match_df['Gefundene Website'] = match_df.get('Gefundene Website', pd.Series(index=match_df.index, dtype=object))
     match_df['Serp Vertrauen'] = match_df.get('Serp Vertrauen', pd.Series(index=match_df.index, dtype=object))
     match_df['Effektive Website'] = match_df['CRM Website'].fillna('').astype(str).str.strip()
@@ -254,11 +288,19 @@ def main():
         return 1 if trust == 'hoch' else 0
     match_df['domain_use_flag'] = match_df.apply(_domain_use, axis=1)
 
-    # Debug sample
-    logger.debug(f"CRM-Sample: {crm_df.iloc[0][['normalized_name','normalized_domain','block_key']].to_dict()}")
-    logger.debug(f"Matching-Sample: {match_df.iloc[0][['normalized_name','normalized_domain','block_key','Effektive Website','Gefundene Website','Serp Vertrauen','domain_use_flag']].to_dict()}")
+    # Build city tokens dynamically (after Ort normalization)
+    def build_city_tokens(crm_df, match_df):
+        cities = set()
+        for s in pd.concat([crm_df['CRM Ort'], match_df['CRM Ort']], ignore_index=True).dropna().unique():
+            for t in _tokenize(s):
+                if len(t) >= 3:
+                    cities.add(t)
+        return cities
+    global CITY_TOKENS
+    CITY_TOKENS = build_city_tokens(crm_df, match_df)
+    logger.info(f"City tokens gesammelt: {len(CITY_TOKENS)}")
 
-    # Blocking indexes
+    # Blocking indexes (after CITY_TOKENS has been set)
     crm_records, domain_index, token_freq, token_index = build_indexes(crm_df)
     logger.info(f"Blocking: Domains={len(domain_index)} | TokenKeys={len(token_index)}")
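Note: CITY_TOKENS is module-global state, rebuilt from both Ort columns before the indexes are built; a sketch of just the build step, with two illustrative columns:

import re
import pandas as pd

def tokenize(s):
    return [t for t in re.split(r"[^a-z0-9]+", str(s).lower()) if t]

crm_ort = pd.Series(['frankfurt am main', 'wien'])    # hypothetical CRM Ort values
match_ort = pd.Series(['frankfurt', None])            # hypothetical Matching Ort values

cities = set()
for s in pd.concat([crm_ort, match_ort], ignore_index=True).dropna().unique():
    cities.update(t for t in tokenize(s) if len(t) >= 3)

print(sorted(cities))  # ['frankfurt', 'main', 'wien']

Multi-word city names also contribute tokens such as 'main' here, which are then stripped from company names as well; the city-only-overlap cap in calculate_similarity limits pairs whose only shared raw tokens are city tokens to a name score of 70.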
 
@@ -269,10 +311,9 @@ def main():
     logger.info("Starte Matching-Prozess…")
     processed = 0
 
-    # iterate safely with index
     for idx, mrow in match_df.to_dict('index').items():
-        name_disp = mrow.get('CRM Name','')
         processed += 1
+        name_disp = mrow.get('CRM Name','')
         # Candidate selection
         candidates = []
         used_block = ''
@@ -285,16 +326,22 @@ def main():
             candidates = token_index.get(rtok, [])
             used_block = f"token:{rtok}"
         if not candidates:
-            # Prefilter over the full CRM list
+            # Prefilter over the full CRM list (stricter + smaller limit; requires rarest-token overlap)
             pf = []
             n1 = mrow.get('normalized_name','')
-            for r in crm_records:
-                n2 = r.get('normalized_name','')
-                if not n1 or not n2:
-                    continue
-                pr = fuzz.partial_ratio(n1, n2)
-                if pr >= PREFILTER_MIN_PARTIAL:
-                    pf.append((pr, r))
+            rtok = choose_rarest_token(n1, token_freq)
+            clean1, toks1 = clean_name_for_scoring(n1)
+            if clean1:
+                for r in crm_records:
+                    n2 = r.get('normalized_name','')
+                    clean2, toks2 = clean_name_for_scoring(n2)
+                    if not clean2:
+                        continue
+                    if rtok and rtok not in toks2:
+                        continue
+                    pr = fuzz.partial_ratio(clean1, clean2)
+                    if pr >= PREFILTER_MIN_PARTIAL:
+                        pf.append((pr, r))
             pf.sort(key=lambda x: x[0], reverse=True)
             candidates = [r for _, r in pf[:PREFILTER_LIMIT]]
             used_block = f"prefilter:{PREFILTER_MIN_PARTIAL}/{len(pf)}"
@@ -306,7 +353,7 @@ def main():
 
         scored = []
         for cr in candidates:
-            score, comp = calculate_similarity(mrow, cr)
+            score, comp = calculate_similarity(mrow, cr, token_freq)
             scored.append((cr.get('CRM Name',''), score, comp))
         scored.sort(key=lambda x: x[1], reverse=True)
@@ -316,8 +363,12 @@ def main():
 
         best_name, best_score, best_comp = scored[0]
 
-        # Metrics
-        if best_score >= SCORE_THRESHOLD:
+        # Acceptance logic (weak threshold + guard)
+        weak = (best_comp.get('domain_used') == 0 and not (best_comp.get('city_match') and best_comp.get('country_match')))
+        applied_threshold = SCORE_THRESHOLD_WEAK if weak else SCORE_THRESHOLD
+        weak_guard_fail = (weak and best_comp.get('rare_overlap') == 0)
+
+        if not weak_guard_fail and best_score >= applied_threshold:
             results.append({'Match': best_name, 'Score': best_score, 'Match_Grund': str(best_comp)})
             metrics['matches_total'] += 1
             if best_comp.get('domain_used') == 1:
@@ -326,12 +377,13 @@ def main():
                 metrics['matches_with_loc'] += 1
             if best_comp.get('domain_used') == 0 and best_comp.get('name') >= 85 and not (best_comp.get('city_match') and best_comp.get('country_match')):
                 metrics['matches_name_only'] += 1
-            logger.info(f" --> Match: '{best_name}' ({best_score}) {best_comp}")
+            logger.info(f" --> Match: '{best_name}' ({best_score}) {best_comp} | TH={applied_threshold}{' weak' if weak else ''}")
         else:
-            results.append({'Match':'', 'Score': best_score, 'Match_Grund': str(best_comp)})
-            logger.info(f" --> Kein Match (Score={best_score}) {best_comp}")
+            reason = 'weak_guard_no_rare' if weak_guard_fail else 'below_threshold'
+            results.append({'Match':'', 'Score': best_score, 'Match_Grund': f"{best_comp} | {reason} TH={applied_threshold}"})
+            logger.info(f" --> Kein Match (Score={best_score}) {best_comp} | {reason} TH={applied_threshold}")
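Note: the acceptance decision in isolation. Candidates without a usable domain and without a full city+country match are "weak": they face SCORE_THRESHOLD_WEAK (95) and are rejected outright when they do not even share the match record's rarest token. A sketch with hypothetical component dicts:

SCORE_THRESHOLD = 80
SCORE_THRESHOLD_WEAK = 95

def accept(score, comp):
    # comp mirrors the component dict returned by calculate_similarity
    weak = comp['domain_used'] == 0 and not (comp['city_match'] and comp['country_match'])
    if weak and comp['rare_overlap'] == 0:
        return False, 'weak_guard_no_rare'
    threshold = SCORE_THRESHOLD_WEAK if weak else SCORE_THRESHOLD
    return score >= threshold, f'TH={threshold}'

print(accept(90, {'domain_used': 1, 'city_match': 0, 'country_match': 0, 'rare_overlap': 0}))
# (True, 'TH=80')  -- domain-backed: the normal threshold applies
print(accept(90, {'domain_used': 0, 'city_match': 1, 'country_match': 0, 'rare_overlap': 1}))
# (False, 'TH=95') -- weak: a city match alone does not lower the bar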
 
-    # Write results back (SAFE: all original columns + new ones, without internal fields)
+    # Write results back (SAFE)
     logger.info("Schreibe Ergebnisse ins Sheet (SAFE in-place, keine Spaltenverluste)…")
     res_df = pd.DataFrame(results, index=match_df.index)
     write_df = match_df.copy()
@@ -344,7 +396,6 @@ def main():
         if c in write_df.columns:
             write_df.drop(columns=[c], inplace=True)
 
-    # Backup
     backup_path = os.path.join(LOG_DIR, f"{now}_backup_{MATCHING_SHEET_NAME}.csv")
     try:
         write_df.to_csv(backup_path, index=False, encoding='utf-8')
@@ -359,12 +410,12 @@ def main():
     else:
         logger.error("Fehler beim Schreiben ins Google Sheet")
 
-    # Final metrics
+    # Summary
     serp_counts = Counter((str(x).lower() for x in write_df.get('Serp Vertrauen', [])))
     logger.info("===== Summary =====")
     logger.info(f"Matches total: {metrics['matches_total']} | mit Domain: {metrics['matches_domain']} | mit Ort: {metrics['matches_with_loc']} | nur Name: {metrics['matches_name_only']}")
     logger.info(f"Serp Vertrauen: {dict(serp_counts)}")
-    logger.info(f"Config: TH={SCORE_THRESHOLD}, MIN_NAME_FOR_DOMAIN={MIN_NAME_FOR_DOMAIN}, Penalties(city={CITY_MISMATCH_PENALTY},country={COUNTRY_MISMATCH_PENALTY}), Prefilter(partial>={PREFILTER_MIN_PARTIAL}, limit={PREFILTER_LIMIT})")
+    logger.info(f"Config: TH={SCORE_THRESHOLD}, TH_WEAK={SCORE_THRESHOLD_WEAK}, MIN_NAME_FOR_DOMAIN={MIN_NAME_FOR_DOMAIN}, Penalties(city={CITY_MISMATCH_PENALTY},country={COUNTRY_MISMATCH_PENALTY}), Prefilter(partial>={PREFILTER_MIN_PARTIAL}, limit={PREFILTER_LIMIT})")
 
 if __name__=='__main__':
     main()