duplicate_checker.py aktualisiert

This commit is contained in:
2025-08-06 13:19:44 +00:00
parent e43efa44cb
commit 558b75f325

View File

@@ -2,60 +2,69 @@ import os
import sys import sys
import logging import logging
import pandas as pd import pandas as pd
import tldextract
from datetime import datetime from datetime import datetime
from thefuzz import fuzz from thefuzz import fuzz
from helpers import normalize_company_name, simple_normalize_url from helpers import normalize_company_name, simple_normalize_url
from google_sheet_handler import GoogleSheetHandler from google_sheet_handler import GoogleSheetHandler
# duplicate_checker.py v2.4 (root-domain match via tldextract)
# Version: 2025-08-06_16-30

# --- Configuration ---
CRM_SHEET_NAME = "CRM_Accounts"
MATCHING_SHEET_NAME = "Matching_Accounts"
# Minimum score for an automatic match. calculate_similarity awards at most
# 190 points (100 domain + 70 name + 20 location).
SCORE_THRESHOLD = 80
LOG_DIR = "Log"

# --- Logging setup with the date in the file name ---
# exist_ok=True already covers the "directory exists" case, so no separate
# os.path.exists() check is needed (and none can race with another process).
os.makedirs(LOG_DIR, exist_ok=True)
now = datetime.now().strftime('%Y-%m-%d_%H-%M')
log_path = os.path.join(LOG_DIR, f"{now}_Duplicate_v2.4.log")

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Console handler (INFO and above)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(logging.Formatter("%(asctime)s - %(levelname)-8s - %(message)s"))
logger.addHandler(ch)

# File handler (DEBUG and above), appended to the per-run log file
fh = logging.FileHandler(log_path, mode='a', encoding='utf-8')
fh.setLevel(logging.DEBUG)
fh.setFormatter(logging.Formatter("%(asctime)s - %(levelname)-8s - %(name)s - %(message)s"))
logger.addHandler(fh)

logger.info("Logging in Datei: %s", log_path)
logger.info("Version: duplicate_checker.py v2.4 (root-domain match via tldextract) | Build: 2025-08-06_16-30")
def _root_domain(url):
    """Return the lower-cased registrable domain label of *url*, or ''.

    Non-string values (e.g. None or a float NaN from an empty cell) and blank
    strings yield '' instead of being handed to tldextract.
    """
    if not isinstance(url, str):
        return ''
    url = url.strip()
    if not url:
        return ''
    # Lower-case so that hosts differing only in letter case compare equal.
    return tldextract.extract(url).domain.lower()


def _has_text(value):
    """Return True if *value* is a string with non-whitespace content."""
    return isinstance(value, str) and bool(value.strip())


def calculate_similarity(record1, record2):
    """Compute the v2.0 duplicate score for two account records.

    Scoring (maximum 190):
      * +100 if both records have a website and their root domains match
        (the suffix is ignored, so ``example.com`` equals ``example.de``).
      * + ``fuzz.token_set_ratio`` of the normalized names times 0.7.
      * +20 if city and country are equal and a city is actually present.

    Args:
        record1: Mapping-like record (anything with ``.get``) providing
            'CRM Website', 'normalized_name', 'CRM Ort' and 'CRM Land'.
        record2: Second record with the same keys.

    Returns:
        The total score rounded to an int.
    """
    total_score = 0

    # Root-domain match only; an empty domain never counts as a match.
    dom1 = _root_domain(record1.get('CRM Website', ''))
    dom2 = _root_domain(record2.get('CRM Website', ''))
    if dom1 and dom1 == dom2:
        total_score += 100

    # Fuzzy name similarity, weighted with 0.7.
    name1 = record1.get('normalized_name', '')
    name2 = record2.get('normalized_name', '')
    if name1 and name2:
        total_score += fuzz.token_set_ratio(name1, name2) * 0.7

    # Exact city + country match. Two records that both lack a city must not
    # earn the bonus: missing data is not evidence of the same location.
    # NOTE(review): a literal 'nan' string produced by astype(str) upstream
    # still counts as a city here — confirm whether that can occur.
    city1 = record1.get('CRM Ort')
    if (
        _has_text(city1)
        and city1 == record2.get('CRM Ort')
        and record1.get('CRM Land') == record2.get('CRM Land')
    ):
        total_score += 20

    return round(total_score)
def main(): def main():
logger.info("Starte Duplikats-Check v2.3 (v2.0-Scoring mit Logging)") logger.info("Starte Duplikats-Check v2.4 (root-domain match)")
try: try:
sheet = GoogleSheetHandler() sheet = GoogleSheetHandler()
logger.info("GoogleSheetHandler initialisiert") logger.info("GoogleSheetHandler initialisiert")
@@ -72,15 +81,15 @@ def main():
logger.info(f"{len(crm_df)} CRM- und {len(match_df)} Matching-Zeilen geladen") logger.info(f"{len(crm_df)} CRM- und {len(match_df)} Matching-Zeilen geladen")
# Normalisierung & Blocking-Key # Normalisierung & Blocking-Key
for df, label in [(crm_df,'CRM'), (match_df,'Matching')]: for df, label in [(crm_df, 'CRM'), (match_df, 'Matching')]:
df['normalized_name'] = df['CRM Name'].astype(str).apply(normalize_company_name) df['normalized_name'] = df['CRM Name'].astype(str).apply(normalize_company_name)
df['normalized_domain'] = df['CRM Website'].astype(str).apply(simple_normalize_url) df['normalized_domain'] = df['CRM Website'].astype(str).apply(simple_normalize_url)
df['CRM Ort'] = df['CRM Ort'].astype(str).str.lower().str.strip() df['CRM Ort'] = df['CRM Ort'].astype(str).str.lower().str.strip()
df['CRM Land'] = df['CRM Land'].astype(str).str.lower().str.strip() df['CRM Land'] = df['CRM Land'].astype(str).str.lower().str.strip()
df['block_key'] = df['normalized_name'].apply(lambda x: x.split()[0] if x else None) df['block_key'] = df['normalized_name'].apply(lambda x: x.split()[0] if x else None)
logger.debug(f"{label}-Sample: {df.iloc[0][['normalized_name','normalized_domain','block_key']].to_dict()}") logger.debug(f"{label}-Sample: {df.iloc[0][['normalized_name','normalized_domain','block_key']].to_dict()}")
# Build Blocking-Index # Blocking-Index bauen
crm_index = {} crm_index = {}
for idx, row in crm_df.iterrows(): for idx, row in crm_df.iterrows():
key = row['block_key'] key = row['block_key']
@@ -88,32 +97,31 @@ def main():
crm_index.setdefault(key, []).append(row) crm_index.setdefault(key, []).append(row)
logger.info(f"Blocking-Index mit {len(crm_index)} Keys erstellt") logger.info(f"Blocking-Index mit {len(crm_index)} Keys erstellt")
# Matching mit relevanten Kandidaten im Log # Matching mit Top-3-Log
results = [] results = []
total = len(match_df) total = len(match_df)
for i, mrow in match_df.iterrows(): for i, mrow in match_df.iterrows():
key = mrow['block_key'] key = mrow['block_key']
candidates = crm_index.get(key, []) cands = crm_index.get(key, [])
logger.info(f"Prüfe {i+1}/{total}: '{mrow['CRM Name']}' (Key='{key}') -> {len(candidates)} Kandidaten") logger.info(f"Prüfe {i+1}/{total}: '{mrow['CRM Name']}' (Key='{key}') -> {len(cands)} Kandidaten")
if not candidates: if not cands:
results.append({'Potenzieller Treffer im CRM':'','Ähnlichkeits-Score':0}) results.append({'Potenzieller Treffer im CRM': '', 'Ähnlichkeits-Score': 0})
continue continue
# Score for each candidate # Score für Kandidaten
scored = [(crow['CRM Name'], calculate_similarity(mrow,crow)) for crow in candidates] scored = [(crow['CRM Name'], calculate_similarity(mrow, crow)) for crow in cands]
# Top 3 candidates logged
top3 = sorted(scored, key=lambda x: x[1], reverse=True)[:3] top3 = sorted(scored, key=lambda x: x[1], reverse=True)[:3]
logger.debug(f" Top3 Kandidaten: {top3}") logger.debug(f" Top3 Kandidaten: {top3}")
best_name, best_score = max(scored, key=lambda x: x[1]) best_name, best_score = max(scored, key=lambda x: x[1])
if best_score >= SCORE_THRESHOLD: if best_score >= SCORE_THRESHOLD:
results.append({'Potenzieller Treffer im CRM':best_name,'Ähnlichkeits-Score':best_score}) results.append({'Potenzieller Treffer im CRM': best_name, 'Ähnlichkeits-Score': best_score})
logger.info(f" --> Match: '{best_name}' mit Score {best_score}") logger.info(f" --> Match: '{best_name}' mit Score {best_score}")
else: else:
results.append({'Potenzieller Treffer im CRM':'','Ähnlichkeits-Score':best_score}) results.append({'Potenzieller Treffer im CRM': '', 'Ähnlichkeits-Score': best_score})
logger.info(f" --> Kein Match (höchster Score {best_score})") logger.info(f" --> Kein Match (höchster Score {best_score})")
# Write results back # Ergebnisse zurückschreiben
out = pd.DataFrame(results) out_df = pd.DataFrame(results)
output = pd.concat([match_df[['CRM Name','CRM Website','CRM Ort','CRM Land']].reset_index(drop=True), out], axis=1) output = pd.concat([match_df[['CRM Name','CRM Website','CRM Ort','CRM Land']].reset_index(drop=True), out_df], axis=1)
data = [output.columns.tolist()] + output.values.tolist() data = [output.columns.tolist()] + output.values.tolist()
ok = sheet.clear_and_write_data(MATCHING_SHEET_NAME, data) ok = sheet.clear_and_write_data(MATCHING_SHEET_NAME, data)
if ok: if ok:
@@ -121,5 +129,5 @@ def main():
else: else:
logger.error("Fehler beim Schreiben ins Google Sheet") logger.error("Fehler beim Schreiben ins Google Sheet")
if __name__=='__main__': if __name__ == '__main__':
main() main()