duplicate_checker.py aktualisiert

This commit is contained in:
2025-08-06 10:32:38 +00:00
parent 722fd8fb9a
commit 1d8a067746

View File

@@ -1,6 +1,9 @@
import os import os
import sys
import logging import logging
import pandas as pd import pandas as pd
from datetime import datetime
import tldextract
from thefuzz import fuzz from thefuzz import fuzz
from helpers import normalize_company_name, simple_normalize_url from helpers import normalize_company_name, simple_normalize_url
from google_sheet_handler import GoogleSheetHandler from google_sheet_handler import GoogleSheetHandler
@@ -8,25 +11,25 @@ from google_sheet_handler import GoogleSheetHandler
# --- Konfiguration --- # --- Konfiguration ---
CRM_SHEET_NAME = "CRM_Accounts" CRM_SHEET_NAME = "CRM_Accounts"
MATCHING_SHEET_NAME = "Matching_Accounts" MATCHING_SHEET_NAME = "Matching_Accounts"
SCORE_THRESHOLD = 80 # Score ab hier gilt als Match SCORE_THRESHOLD = 80 # ab hier automatisches Match
LOG_DIR = "Log" LOG_DIR = "Log"
LOG_FILE = "duplicate_check.log"
# --- Logging Setup --- # --- Logging Setup mit Datum im Dateinamen ---
if not os.path.exists(LOG_DIR): if not os.path.exists(LOG_DIR):
os.makedirs(LOG_DIR, exist_ok=True) os.makedirs(LOG_DIR, exist_ok=True)
log_path = os.path.join(LOG_DIR, LOG_FILE) now = datetime.now().strftime('%Y-%m-%d_%H-%M')
log_path = os.path.join(LOG_DIR, f"{now}_Duplicate.txt")
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG) logger.setLevel(logging.DEBUG)
# Console Handler: INFO+ # Console-Handler (INFO+)
ch = logging.StreamHandler() ch = logging.StreamHandler()
ch.setLevel(logging.INFO) ch.setLevel(logging.INFO)
ch.setFormatter(logging.Formatter("%(asctime)s - %(levelname)-8s - %(message)s")) ch.setFormatter(logging.Formatter("%(asctime)s - %(levelname)-8s - %(message)s"))
logger.addHandler(ch) logger.addHandler(ch)
# File Handler: DEBUG+ # File-Handler (DEBUG+)
fh = logging.FileHandler(log_path, mode='a', encoding='utf-8') fh = logging.FileHandler(log_path, mode='a', encoding='utf-8')
fh.setLevel(logging.DEBUG) fh.setLevel(logging.DEBUG)
fh.setFormatter(logging.Formatter("%(asctime)s - %(levelname)-8s - %(name)s - %(message)s")) fh.setFormatter(logging.Formatter("%(asctime)s - %(levelname)-8s - %(name)s - %(message)s"))
@@ -36,93 +39,90 @@ logger.info(f"Logging in Datei: {log_path}")
def calculate_similarity(record1, record2): def calculate_similarity(record1, record2):
"""Berechnet gewichteten Ähnlichkeits-Score zwischen zwei Datensätzen.""" """Berechnet gewichteten Ähnlichkeits-Score (0–190) zwischen zwei Datensätzen."""
total_score = 0 total = 0
# Domain exact match # Domain-Check über registered domain
if record1['normalized_domain'] and record1['normalized_domain'] == record2['normalized_domain']: url1 = record1.get('CRM Website','')
total_score += 100 url2 = record2.get('CRM Website','')
# Name fuzzy dom1 = tldextract.extract(url1).registered_domain or ''
name_similarity = fuzz.token_set_ratio(record1['normalized_name'], record2['normalized_name']) dom2 = tldextract.extract(url2).registered_domain or ''
total_score += name_similarity * 0.7 if dom1 and dom1 == dom2:
# Ort+Land exact total += 100
# Name-Fuzzy
name1 = record1['normalized_name']
name2 = record2['normalized_name']
if name1 and name2:
total += fuzz.token_set_ratio(name1, name2) * 0.7
# Ort+Land exakt
if record1['CRM Ort'] == record2['CRM Ort'] and record1['CRM Land'] == record2['CRM Land']: if record1['CRM Ort'] == record2['CRM Ort'] and record1['CRM Land'] == record2['CRM Land']:
total_score += 20 total += 20
return round(total_score) return round(total)
def main(): def main():
logger.info("Starte Duplikats-Check (v2.0 - mit Blocking & relevantem Kandidaten-Log)") logger.info("Starte Duplikats-Check (v2.0) mit Datum im Lognamen und verbessertem Domain-Match")
try: try:
sheet_handler = GoogleSheetHandler() sheet = GoogleSheetHandler()
logger.info("GoogleSheetHandler initialisiert") logger.info("GoogleSheetHandler initialisiert")
except Exception as e: except Exception as e:
logger.critical(f"FEHLER Init GoogleSheetHandler: {e}") logger.critical(f"FEHLER beim Init GoogleSheetHandler: {e}")
return sys.exit(1)
# Daten laden # Daten einlesen
crm_df = sheet_handler.get_sheet_as_dataframe(CRM_SHEET_NAME) crm_df = sheet.get_sheet_as_dataframe(CRM_SHEET_NAME)
match_df = sheet_handler.get_sheet_as_dataframe(MATCHING_SHEET_NAME) match_df = sheet.get_sheet_as_dataframe(MATCHING_SHEET_NAME)
if crm_df is None or crm_df.empty: if crm_df is None or crm_df.empty or match_df is None or match_df.empty:
logger.critical(f"Keine Daten in '{CRM_SHEET_NAME}'") logger.critical("CRM- oder Matching-Daten fehlen. Abbruch.")
return return
if match_df is None or match_df.empty: logger.info(f"{len(crm_df)} CRM-Datensätze, {len(match_df)} Matching-Datensätze geladen")
logger.critical(f"Keine Daten in '{MATCHING_SHEET_NAME}'")
return
logger.info(f"{len(crm_df)} CRM-Zeilen, {len(match_df)} Matching-Zeilen geladen")
# Normalisierung # Normalisierung und Blocking-Key
for df, label in [(crm_df, 'CRM'), (match_df, 'Matching')]: for df, label in [(crm_df, 'CRM'), (match_df, 'Matching')]:
df['normalized_name'] = df['CRM Name'].astype(str).apply(normalize_company_name) df['normalized_name'] = df['CRM Name'].astype(str).apply(normalize_company_name)
df['normalized_domain'] = df['CRM Website'].astype(str).apply(simple_normalize_url) df['normalized_domain'] = df['CRM Website'].astype(str).apply(simple_normalize_url)
df['CRM Ort'] = df['CRM Ort'].astype(str).str.lower().str.strip() df['CRM Ort'] = df['CRM Ort'].astype(str).str.lower().str.strip()
df['CRM Land'] = df['CRM Land'].astype(str).str.lower().str.strip() df['CRM Land'] = df['CRM Land'].astype(str).str.lower().str.strip()
# Blocking Key
df['block_key'] = df['normalized_name'].apply(lambda x: x.split()[0] if x else None) df['block_key'] = df['normalized_name'].apply(lambda x: x.split()[0] if x else None)
logger.debug(f"{label}-Sample nach Norm: {df.iloc[0][['normalized_name','normalized_domain','block_key']].to_dict()}") logger.debug(f"{label}-Normierung Beispiel: {df.iloc[0][['normalized_name','normalized_domain','block_key']].to_dict()}")
# Blocking Index erstellen # Blocking-Index
crm_index = {} crm_index = {}
for idx, row in crm_df.iterrows(): for idx, row in crm_df.iterrows():
key = row['block_key'] key = row['block_key']
if not key: continue if key:
crm_index.setdefault(key, []).append(row) crm_index.setdefault(key, []).append(row)
logger.info(f"Blocking-Index erstellt: {len(crm_index)} Keys") logger.info(f"Blocking-Index erstellt: {len(crm_index)} Keys")
# Matching # Matching
results = [] results = []
total = len(match_df) total = len(match_df)
for i, match_row in match_df.iterrows(): for i, mrow in match_df.iterrows():
key = match_row['block_key'] key = mrow['block_key']
candidates = crm_index.get(key, []) cands = crm_index.get(key, [])
logger.info(f"Prüfe {i+1}/{total}: {match_row['CRM Name']} (Key='{key}') -> {len(candidates)} Kandidaten") logger.info(f"Prüfe {i+1}/{total}: '{mrow['CRM Name']}' (Key='{key}') -> {len(cands)} Kandidaten")
if not cands:
if not candidates: results.append({'Match': '', 'Score': 0})
results.append({'Potenzieller Treffer im CRM': '', 'Ähnlichkeits-Score': 0})
continue continue
# Scores für Kandidaten sammeln
scored = [] scored = []
for crm_row in candidates: for crow in cands:
score = calculate_similarity(match_row, crm_row) score = calculate_similarity(mrow, crow)
scored.append((crm_row['CRM Name'], score)) scored.append((crow['CRM Name'], score))
# Top 3 loggen # Log relevante Kandidaten mit Score>=SCORE_THRESHOLD-20
top3 = sorted(scored, key=lambda x: x[1], reverse=True)[:3] relevant = [(n,s) for n,s in scored if s >= SCORE_THRESHOLD-20]
logger.debug(f" Top 3 Kandidaten: {top3}") logger.debug(f" Relevante Kandidaten (>= {SCORE_THRESHOLD-20}): {relevant}")
# Besten Treffer wählen
best_name, best_score = max(scored, key=lambda x: x[1]) best_name, best_score = max(scored, key=lambda x: x[1])
if best_score >= SCORE_THRESHOLD: if best_score >= SCORE_THRESHOLD:
results.append({'Potenzieller Treffer im CRM': best_name, 'Ähnlichkeits-Score': best_score}) results.append({'Match': best_name, 'Score': best_score})
logger.info(f" --> Match: '{best_name}' mit Score {best_score}") logger.info(f" --> Match: '{best_name}' mit Score {best_score}")
else: else:
results.append({'Potenzieller Treffer im CRM': '', 'Ähnlichkeits-Score': best_score}) results.append({'Match': '', 'Score': best_score})
logger.info(f" --> Kein Match (höchster Score {best_score})") logger.info(f" --> Kein Match (höchster Score {best_score})")
# Ergebnisse zurückschreiben # Ergebnis zurück in Sheet
out_df = pd.DataFrame(results) out = pd.DataFrame(results)
output = pd.concat([match_df[['CRM Name','CRM Website','CRM Ort','CRM Land']].reset_index(drop=True), out_df], axis=1) output = pd.concat([match_df[['CRM Name','CRM Website','CRM Ort','CRM Land']].reset_index(drop=True), out], axis=1)
data = [output.columns.tolist()] + output.values.tolist() data = [output.columns.tolist()] + output.values.tolist()
ok = sheet_handler.clear_and_write_data(MATCHING_SHEET_NAME, data) ok = sheet.clear_and_write_data(MATCHING_SHEET_NAME, data)
if ok: if ok:
logger.info("Ergebnisse erfolgreich geschrieben") logger.info("Ergebnisse erfolgreich geschrieben")
else: else: