Files
Brancheneinstufung2/duplicate_checker.py
Floke 2a7d546713 feat(duplicate-checker): quality-first Matching (Domain-Gate, Location-Penalties, Smart Blocking)
- Domain-Gate: Domain-Score (100) nur, wenn Name >= MIN_NAME_FOR_DOMAIN (default 70) ODER Ort+Land exakt matchen
- Location-Penalties: City-Mismatch -30, Country-Mismatch -40 (wenn Felder befüllt)
- Smart Blocking: Domain-Index -> seltenster Name-Token (Stopwörter gefiltert) -> Prefilter (partial_ratio >= 60, Top 50)
- Name-Score: max(token_set_ratio, partial_ratio, token_sort_ratio) + Name-only Bonus (+20) bei starken Namen
- SerpAPI nur für Matching-Accounts: schreibt "Gefundene Website"; Domain wird NUR bei Vertrauen=hoch genutzt
- Serp-Trust: hoch/mittel/niedrig (Token-Check gegen Domain)
- Transparenz: neue Spalten "Match", "Score", "Match_Grund", "Gefundene Website", "Serp Vertrauen"
- Safe Writeback: Originalspalten bleiben erhalten; interne Felder werden vor Write entfernt
- Logs: Log/{$timestamp}_duplicate_check_v2.13.txt, Summary-Metriken am Ende
- Backup: Log/{$timestamp}_backup_Matching_Accounts.csv

BREAKING CHANGES: none
2025-08-08 06:28:35 +00:00

364 lines
16 KiB
Python

import os
import sys
import re
import logging
import pandas as pd
from datetime import datetime
from collections import Counter
from thefuzz import fuzz
from helpers import normalize_company_name, simple_normalize_url, serp_website_lookup
from config import Config
from google_sheet_handler import GoogleSheetHandler
# duplicate_checker.py v2.13 (Quality-first: Domain-Gate, Location-Penalties, Smart Blocking, Serp-Trust, Metrics)
# Version-Build: dynamic timestamp below
# --- Konfiguration ---
CRM_SHEET_NAME = "CRM_Accounts"
MATCHING_SHEET_NAME = "Matching_Accounts"
SCORE_THRESHOLD = 80 # Schwellwert fürs Auto-Match
MIN_NAME_FOR_DOMAIN = 70 # Domain-Match gilt nur, wenn Name >= 70 ODER Ort matcht
CITY_MISMATCH_PENALTY = 30
COUNTRY_MISMATCH_PENALTY = 40
PREFILTER_MIN_PARTIAL = 60 # Vorfilter über gesamte CRM-Liste bei fehlenden Kandidaten
PREFILTER_LIMIT = 50 # Max. Kandidaten aus Vorfilter
LOG_DIR = "Log"
now = datetime.now().strftime('%Y-%m-%d_%H-%M')
LOG_FILE = f"{now}_duplicate_check_v2.13.txt"
# --- Logging Setup ---
if not os.path.exists(LOG_DIR):
os.makedirs(LOG_DIR, exist_ok=True)
log_path = os.path.join(LOG_DIR, LOG_FILE)
root = logging.getLogger()
root.setLevel(logging.DEBUG)
for h in list(root.handlers):
root.removeHandler(h)
formatter = logging.Formatter("%(asctime)s - %(levelname)-8s - %(message)s")
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
root.addHandler(ch)
fh = logging.FileHandler(log_path, mode='a', encoding='utf-8')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
root.addHandler(fh)
logger = logging.getLogger(__name__)
logger.info(f"Logging to console and file: {log_path}")
logger.info(f"Starting duplicate_checker.py v2.13 | Build: {now}")
# --- SerpAPI Key laden ---
try:
Config.load_api_keys()
serp_key = Config.API_KEYS.get('serpapi')
if not serp_key:
logger.warning("SerpAPI Key nicht gefunden; Serp-Fallback deaktiviert.")
except Exception as e:
logger.warning(f"Fehler beim Laden API-Keys: {e}")
serp_key = None
STOP_TOKENS = {
'gmbh','mbh','ag','kg','ug','ohg','se','co','kg','kgaa','inc','llc','ltd','sarl',
'holding','gruppe','group','international','solutions','solution','service','services',
'deutschland','austria','germany','technik','technology','technologies','systems','systeme',
'logistik','logistics','industries','industrie','management','consulting','vertrieb','handel'
}
# --- Utilitys ---
def split_tokens(name: str):
if not name:
return []
return [t for t in str(name).split() if len(t) >= 3 and t not in STOP_TOKENS]
def assess_serp_trust(company_name: str, url: str) -> str:
"""Einfache Vertrauensstufe für recherchierte URL: hoch/mittel/niedrig."""
if not url:
return 'n/a'
host = simple_normalize_url(url) or ''
host = host.replace('www.', '')
tokens = [t for t in split_tokens(normalize_company_name(company_name)) if len(t) >= 4]
if any(t in host for t in tokens):
return 'hoch'
tokens3 = [t for t in split_tokens(normalize_company_name(company_name)) if len(t) == 3]
if any(t in host for t in tokens3):
return 'mittel'
return 'niedrig'
# --- Ähnlichkeitsberechnung ---
def calculate_similarity(mrec: dict, crec: dict):
# Domain-Komponente (mit Gate)
dom1 = mrec.get('normalized_domain','')
dom2 = crec.get('normalized_domain','')
m_domain_use = mrec.get('domain_use_flag', 0) # 1 nur wenn original URL oder Serp-Vertrauen hoch
domain_flag_raw = 1 if (m_domain_use == 1 and dom1 and dom1 == dom2) else 0
# Location
city_match = 1 if (mrec.get('CRM Ort') and crec.get('CRM Ort') and mrec.get('CRM Ort') == crec.get('CRM Ort')) else 0
country_match = 1 if (mrec.get('CRM Land') and crec.get('CRM Land') and mrec.get('CRM Land') == crec.get('CRM Land')) else 0
# Name
n1, n2 = mrec.get('normalized_name',''), crec.get('normalized_name','')
if n1 and n2:
ts = fuzz.token_set_ratio(n1,n2)
pr = fuzz.partial_ratio(n1,n2)
ss = fuzz.token_sort_ratio(n1,n2)
name_score = max(ts,pr,ss)
else:
name_score = 0
# Domain-Gate: Domain zählt nur, wenn Name >= MIN_NAME_FOR_DOMAIN ODER Ort+Land passt
domain_gate_ok = (name_score >= MIN_NAME_FOR_DOMAIN) or (city_match and country_match)
domain_flag = 1 if (domain_flag_raw and domain_gate_ok) else 0
# Basisscore
total = domain_flag*100 + name_score*1.0 + (1 if (city_match and country_match) else 0)*20
# Penalties bei Mismatch (nur anwenden, wenn entsprechende Felder befüllt und kein voller Location-Match)
penalties = 0
if mrec.get('CRM Land') and crec.get('CRM Land') and not country_match:
penalties += COUNTRY_MISMATCH_PENALTY
if mrec.get('CRM Ort') and crec.get('CRM Ort') and not city_match:
penalties += CITY_MISMATCH_PENALTY
total -= penalties
# Bonus für reine Name-Matches (keine Domain, kein Ort) wenn stark
bonus_flag = 1 if (domain_flag == 0 and not (city_match and country_match) and name_score >= 85) else 0
if bonus_flag:
total += 20
return (
round(total),
{
'domain_raw': domain_flag_raw,
'domain_used': domain_flag,
'domain_gate_ok': int(domain_gate_ok),
'name': round(name_score,1),
'city_match': city_match,
'country_match': country_match,
'penalties': penalties,
'name_bonus': bonus_flag
}
)
# --- Blocking vorbereiten ---
def build_indexes(crm_df: pd.DataFrame):
records = list(crm_df.to_dict('records'))
# Domain-Index
domain_index = {}
for r in records:
d = r.get('normalized_domain')
if d:
domain_index.setdefault(d, []).append(r)
# Token-Frequenzen
token_freq = Counter()
for r in records:
for t in set(split_tokens(r.get('normalized_name',''))):
token_freq[t] += 1
# Token-Index (nur sinnvolle Tokens)
token_index = {}
for r in records:
toks = [t for t in set(split_tokens(r.get('normalized_name',''))) if token_freq[t] > 0]
for t in toks:
token_index.setdefault(t, []).append(r)
return records, domain_index, token_freq, token_index
def choose_rarest_token(norm_name: str, token_freq: Counter):
toks = [t for t in split_tokens(norm_name) if len(t) >= 4 and token_freq.get(t, 0) > 0]
if not toks:
return None
# Rarest (kleinste Frequenz), zweitkriterium längster Token
toks.sort(key=lambda x: (token_freq.get(x, 0), -len(x)))
return toks[0]
# --- Hauptfunktion ---
def main():
logger.info("Starte Duplikats-Check v2.13 (Quality-first)")
try:
sheet = GoogleSheetHandler()
logger.info("GoogleSheetHandler initialisiert")
except Exception as e:
logger.critical(f"Init GoogleSheetHandler fehlgeschlagen: {e}")
sys.exit(1)
# Daten laden
crm_df = sheet.get_sheet_as_dataframe(CRM_SHEET_NAME)
match_df = sheet.get_sheet_as_dataframe(MATCHING_SHEET_NAME)
logger.info(f"{0 if crm_df is None else len(crm_df)} CRM-Datensätze | {0 if match_df is None else len(match_df)} Matching-Datensätze")
if crm_df is None or crm_df.empty or match_df is None or match_df.empty:
logger.critical("Leere Daten in einem der Sheets. Abbruch.")
return
# SerpAPI nur für Matching (fehlende URLs) → in 'Gefundene Website' speichern
if serp_key:
empty_mask = match_df['CRM Website'].fillna('').astype(str).str.strip() == ''
empty_count = int(empty_mask.sum())
if empty_count > 0:
logger.info(f"Serp-Fallback für Matching: {empty_count} Firmen ohne URL")
found_cnt = 0
trust_stats = Counter()
for idx, row in match_df[empty_mask].iterrows():
company = row['CRM Name']
try:
url = serp_website_lookup(company)
if url and 'k.A.' not in url:
if not str(url).startswith(('http://','https://')):
url = 'https://' + str(url).lstrip()
trust = assess_serp_trust(company, url)
match_df.at[idx, 'Gefundene Website'] = url
match_df.at[idx, 'Serp Vertrauen'] = trust
trust_stats[trust] += 1
logger.info(f" ✓ URL gefunden: '{company}' -> {url} (Vertrauen: {trust})")
found_cnt += 1
else:
logger.debug(f" ✗ Keine eindeutige URL: '{company}' -> {url}")
except Exception as e:
logger.warning(f" ! Serp-Fehler für '{company}': {e}")
logger.info(f"Serp-Fallback beendet: {found_cnt}/{empty_count} URLs ergänzt | Trust: {dict(trust_stats)}")
else:
logger.info("Serp-Fallback übersprungen: keine fehlenden Matching-URLs")
# Normalisierung CRM
crm_df['normalized_name'] = crm_df['CRM Name'].astype(str).apply(normalize_company_name)
crm_df['normalized_domain'] = crm_df['CRM Website'].astype(str).apply(simple_normalize_url)
crm_df['CRM Ort'] = crm_df['CRM Ort'].astype(str).str.lower().str.strip()
crm_df['CRM Land'] = crm_df['CRM Land'].astype(str).str.lower().str.strip()
crm_df['block_key'] = crm_df['normalized_name'].apply(lambda x: x.split()[0] if x else None)
crm_df['domain_use_flag'] = 1 # CRM-Domain gilt immer als vertrauenswürdig
# Normalisierung Matching (Effektive Website: Original oder Gefundene, aber Domain nur nutzen bei Vertrauen=hoch)
match_df['Gefundene Website'] = match_df.get('Gefundene Website', pd.Series(index=match_df.index, dtype=object))
match_df['Serp Vertrauen'] = match_df.get('Serp Vertrauen', pd.Series(index=match_df.index, dtype=object))
match_df['Effektive Website'] = match_df['CRM Website'].fillna('').astype(str).str.strip()
mask_eff = match_df['Effektive Website'] == ''
match_df.loc[mask_eff, 'Effektive Website'] = match_df['Gefundene Website'].fillna('').astype(str).str.strip()
match_df['normalized_name'] = match_df['CRM Name'].astype(str).apply(normalize_company_name)
match_df['normalized_domain'] = match_df['Effektive Website'].astype(str).apply(simple_normalize_url)
match_df['CRM Ort'] = match_df['CRM Ort'].astype(str).str.lower().str.strip()
match_df['CRM Land'] = match_df['CRM Land'].astype(str).str.lower().str.strip()
match_df['block_key'] = match_df['normalized_name'].apply(lambda x: x.split()[0] if x else None)
# Domain-Vertrauen/Use-Flag
def _domain_use(row):
if str(row.get('CRM Website','')).strip():
return 1
trust = str(row.get('Serp Vertrauen','')).lower()
return 1 if trust == 'hoch' else 0
match_df['domain_use_flag'] = match_df.apply(_domain_use, axis=1)
# Debug-Sample
logger.debug(f"CRM-Sample: {crm_df.iloc[0][['normalized_name','normalized_domain','block_key']].to_dict()}")
logger.debug(f"Matching-Sample: {match_df.iloc[0][['normalized_name','normalized_domain','block_key','Effektive Website','Gefundene Website','Serp Vertrauen','domain_use_flag']].to_dict()}")
# Blocking-Indizes
crm_records, domain_index, token_freq, token_index = build_indexes(crm_df)
logger.info(f"Blocking: Domains={len(domain_index)} | TokenKeys={len(token_index)}")
# Matching
results = []
metrics = Counter()
total = len(match_df)
logger.info("Starte Matching-Prozess…")
for i, mrow in match_df.to_dict('records'):
pass
# iterate safely with index
for idx, mrow in match_df.to_dict('index').items():
name_disp = mrow.get('CRM Name','')
# Kandidatenwahl
candidates = []
used_block = ''
if mrow.get('normalized_domain') and mrow.get('domain_use_flag') == 1:
candidates = domain_index.get(mrow['normalized_domain'], [])
used_block = f"domain:{mrow['normalized_domain']}"
if not candidates:
rtok = choose_rarest_token(mrow.get('normalized_name',''), token_freq)
if rtok:
candidates = token_index.get(rtok, [])
used_block = f"token:{rtok}"
if not candidates:
# Prefilter über gesamte CRM-Liste
pf = []
n1 = mrow.get('normalized_name','')
for r in crm_records:
n2 = r.get('normalized_name','')
if not n1 or not n2:
continue
pr = fuzz.partial_ratio(n1, n2)
if pr >= PREFILTER_MIN_PARTIAL:
pf.append((pr, r))
pf.sort(key=lambda x: x[0], reverse=True)
candidates = [r for _, r in pf[:PREFILTER_LIMIT]]
used_block = f"prefilter:{PREFILTER_MIN_PARTIAL}/{len(pf)}"
logger.info(f"Prüfe {idx+1}/{total}: '{name_disp}' -> {len(candidates)} Kandidaten (Block={used_block})")
if not candidates:
results.append({'Match':'', 'Score':0, 'Match_Grund':'keine Kandidaten'})
continue
scored = []
for cr in candidates:
score, comp = calculate_similarity(mrow, cr)
scored.append((cr.get('CRM Name',''), score, comp))
scored.sort(key=lambda x: x[1], reverse=True)
# Log Top5
for cand_name, sc, comp in scored[:5]:
logger.debug(f" Kandidat: {cand_name} | Score={sc} | Comp={comp}")
best_name, best_score, best_comp = scored[0]
# Metriken
if best_score >= SCORE_THRESHOLD:
results.append({'Match': best_name, 'Score': best_score, 'Match_Grund': str(best_comp)})
metrics['matches_total'] += 1
if best_comp.get('domain_used') == 1:
metrics['matches_domain'] += 1
if best_comp.get('city_match') and best_comp.get('country_match'):
metrics['matches_with_loc'] += 1
if best_comp.get('domain_used') == 0 and best_comp.get('name') >= 85 and not (best_comp.get('city_match') and best_comp.get('country_match')):
metrics['matches_name_only'] += 1
logger.info(f" --> Match: '{best_name}' ({best_score}) {best_comp}")
else:
results.append({'Match':'', 'Score': best_score, 'Match_Grund': str(best_comp)})
logger.info(f" --> Kein Match (Score={best_score}) {best_comp}")
# Ergebnisse zurückschreiben (SAFE: alle Originalspalten + neue, ohne interne Felder)
logger.info("Schreibe Ergebnisse ins Sheet (SAFE in-place, keine Spaltenverluste)…")
res_df = pd.DataFrame(results, index=match_df.index)
write_df = match_df.copy()
write_df['Match'] = res_df['Match']
write_df['Score'] = res_df['Score']
write_df['Match_Grund'] = res_df['Match_Grund']
drop_cols = ['normalized_name','normalized_domain','block_key','Effektive Website','domain_use_flag']
for c in drop_cols:
if c in write_df.columns:
write_df.drop(columns=[c], inplace=True)
# Backup
backup_path = os.path.join(LOG_DIR, f"{now}_backup_{MATCHING_SHEET_NAME}.csv")
try:
write_df.to_csv(backup_path, index=False, encoding='utf-8')
logger.info(f"Lokales Backup geschrieben: {backup_path}")
except Exception as e:
logger.warning(f"Backup fehlgeschlagen: {e}")
data = [write_df.columns.tolist()] + write_df.fillna('').values.tolist()
ok = sheet.clear_and_write_data(MATCHING_SHEET_NAME, data)
if ok:
logger.info("Ergebnisse erfolgreich geschrieben")
else:
logger.error("Fehler beim Schreiben ins Google Sheet")
# Abschluss-Metriken
serp_counts = Counter((str(x).lower() for x in write_df.get('Serp Vertrauen', [])))
logger.info("===== Summary =====")
logger.info(f"Matches total: {metrics['matches_total']} | mit Domain: {metrics['matches_domain']} | mit Ort: {metrics['matches_with_loc']} | nur Name: {metrics['matches_name_only']}")
logger.info(f"Serp Vertrauen: {dict(serp_counts)}")
logger.info(f"Config: TH={SCORE_THRESHOLD}, MIN_NAME_FOR_DOMAIN={MIN_NAME_FOR_DOMAIN}, Penalties(city={CITY_MISMATCH_PENALTY},country={COUNTRY_MISMATCH_PENALTY}), Prefilter(partial>={PREFILTER_MIN_PARTIAL}, limit={PREFILTER_LIMIT})")
if __name__=='__main__':
main()