duplicate_checker.py aktualisiert

This commit is contained in:
2025-08-06 11:37:35 +00:00
parent 786086a6e9
commit c216b24024

View File

@@ -3,7 +3,6 @@ import sys
import logging import logging
import pandas as pd import pandas as pd
from datetime import datetime from datetime import datetime
import tldextract
from thefuzz import fuzz from thefuzz import fuzz
from helpers import normalize_company_name, simple_normalize_url from helpers import normalize_company_name, simple_normalize_url
from google_sheet_handler import GoogleSheetHandler from google_sheet_handler import GoogleSheetHandler
@@ -11,14 +10,14 @@ from google_sheet_handler import GoogleSheetHandler
# --- Konfiguration --- # --- Konfiguration ---
CRM_SHEET_NAME = "CRM_Accounts" CRM_SHEET_NAME = "CRM_Accounts"
MATCHING_SHEET_NAME = "Matching_Accounts" MATCHING_SHEET_NAME = "Matching_Accounts"
SCORE_THRESHOLD = 80 # ab hier automatisches Match SCORE_THRESHOLD = 80 # Score ab hier gilt als Match
LOG_DIR = "Log" LOG_DIR = "Log"
# --- Logging Setup mit Datum im Dateinamen --- # --- Logging Setup mit Datum im Dateinamen ---
if not os.path.exists(LOG_DIR): if not os.path.exists(LOG_DIR):
os.makedirs(LOG_DIR, exist_ok=True) os.makedirs(LOG_DIR, exist_ok=True)
now = datetime.now().strftime('%Y-%m-%d_%H-%M') now = datetime.now().strftime('%Y-%m-%d_%H-%M')
log_path = os.path.join(LOG_DIR, f"{now}_Duplicate.txt") log_path = os.path.join(LOG_DIR, f"{now}_Duplicate.log")
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG) logger.setLevel(logging.DEBUG)
@@ -39,28 +38,27 @@ logger.info(f"Logging in Datei: {log_path}")
def calculate_similarity(record1, record2): def calculate_similarity(record1, record2):
"""Berechnet gewichteten Ähnlichkeits-Score (0–190) zwischen zwei Datensätzen.""" """Berechnet gewichteten Ähnlichkeits-Score zwischen zwei Datensätzen."""
total = 0 total = 0
# Domain-Check über registered domain # Domain exact match über normalisierte Domain
url1 = record1.get('CRM Website','') dom1 = record1.get('normalized_domain', '')
url2 = record2.get('CRM Website','') dom2 = record2.get('normalized_domain', '')
dom1 = tldextract.extract(url1).registered_domain or ''
dom2 = tldextract.extract(url2).registered_domain or ''
if dom1 and dom1 == dom2: if dom1 and dom1 == dom2:
total += 100 total += 100
# Name-Fuzzy # Name fuzzy (Token-Set Ratio)
name1 = record1['normalized_name'] name1 = record1.get('normalized_name', '')
name2 = record2['normalized_name'] name2 = record2.get('normalized_name', '')
if name1 and name2: if name1 and name2:
total += fuzz.token_set_ratio(name1, name2) * 0.7 name_score = fuzz.token_set_ratio(name1, name2)
# Ort+Land exakt total += name_score * 0.7
if record1['CRM Ort'] == record2['CRM Ort'] and record1['CRM Land'] == record2['CRM Land']: # Ort+Land exact
if record1.get('CRM Ort') == record2.get('CRM Ort') and record1.get('CRM Land') == record2.get('CRM Land'):
total += 20 total += 20
return round(total) return round(total)
def main(): def main():
logger.info("Starte Duplikats-Check (v2.0) mit Datum im Lognamen und verbessertem Domain-Match") logger.info("Starte Duplikats-Check (v2.0 mit Kern-Syntax nach Entwurf)")
try: try:
sheet = GoogleSheetHandler() sheet = GoogleSheetHandler()
logger.info("GoogleSheetHandler initialisiert") logger.info("GoogleSheetHandler initialisiert")
@@ -83,7 +81,7 @@ def main():
df['CRM Ort'] = df['CRM Ort'].astype(str).str.lower().str.strip() df['CRM Ort'] = df['CRM Ort'].astype(str).str.lower().str.strip()
df['CRM Land'] = df['CRM Land'].astype(str).str.lower().str.strip() df['CRM Land'] = df['CRM Land'].astype(str).str.lower().str.strip()
df['block_key'] = df['normalized_name'].apply(lambda x: x.split()[0] if x else None) df['block_key'] = df['normalized_name'].apply(lambda x: x.split()[0] if x else None)
logger.debug(f"{label}-Normierung Beispiel: {df.iloc[0][['normalized_name','normalized_domain','block_key']].to_dict()}") logger.debug(f"{label}-Beispiel nach Normalisierung: {df.iloc[0][['normalized_name','normalized_domain','block_key']].to_dict()}")
# Blocking-Index # Blocking-Index
crm_index = {} crm_index = {}
@@ -93,23 +91,22 @@ def main():
crm_index.setdefault(key, []).append(row) crm_index.setdefault(key, []).append(row)
logger.info(f"Blocking-Index erstellt: {len(crm_index)} Keys") logger.info(f"Blocking-Index erstellt: {len(crm_index)} Keys")
# Matching # Matching mit Log relevanter Kandidaten
results = [] results = []
total = len(match_df) total = len(match_df)
for i, mrow in match_df.iterrows(): for i, mrow in match_df.iterrows():
key = mrow['block_key'] key = mrow['block_key']
cands = crm_index.get(key, []) candidates = crm_index.get(key, [])
logger.info(f"Prüfe {i+1}/{total}: '{mrow['CRM Name']}' (Key='{key}') -> {len(cands)} Kandidaten") logger.info(f"Prüfe {i+1}/{total}: '{mrow['CRM Name']}' (Key='{key}') -> {len(candidates)} Kandidaten")
if not cands: if not candidates:
results.append({'Match': '', 'Score': 0}) results.append({'Match': '', 'Score': 0})
continue continue
scored = [] # Scores sammeln
for crow in cands: scored = [(crow['CRM Name'], calculate_similarity(mrow, crow)) for crow in candidates]
score = calculate_similarity(mrow, crow) # Top 3 relevante Kandidaten loggen
scored.append((crow['CRM Name'], score)) top3 = sorted(scored, key=lambda x: x[1], reverse=True)[:3]
# Log relevante Kandidaten mit Score>=SCORE_THRESHOLD-20 logger.debug(f" Top 3 Kandidaten: {top3}")
relevant = [(n,s) for n,s in scored if s >= SCORE_THRESHOLD-20] # Besten Treffer wählen
logger.debug(f" Relevante Kandidaten (>= {SCORE_THRESHOLD-20}): {relevant}")
best_name, best_score = max(scored, key=lambda x: x[1]) best_name, best_score = max(scored, key=lambda x: x[1])
if best_score >= SCORE_THRESHOLD: if best_score >= SCORE_THRESHOLD:
results.append({'Match': best_name, 'Score': best_score}) results.append({'Match': best_name, 'Score': best_score})
@@ -118,7 +115,7 @@ def main():
results.append({'Match': '', 'Score': best_score}) results.append({'Match': '', 'Score': best_score})
logger.info(f" --> Kein Match (höchster Score {best_score})") logger.info(f" --> Kein Match (höchster Score {best_score})")
# Ergebnis zurück in Sheet # Ergebnisse zurück ins Sheet
out = pd.DataFrame(results) out = pd.DataFrame(results)
output = pd.concat([match_df[['CRM Name','CRM Website','CRM Ort','CRM Land']].reset_index(drop=True), out], axis=1) output = pd.concat([match_df[['CRM Name','CRM Website','CRM Ort','CRM Land']].reset_index(drop=True), out], axis=1)
data = [output.columns.tolist()] + output.values.tolist() data = [output.columns.tolist()] + output.values.tolist()