Files
Brancheneinstufung2/duplicate_checker.py

126 lines
5.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import sys
import logging
import pandas as pd
from datetime import datetime
from thefuzz import fuzz
from helpers import normalize_company_name, simple_normalize_url
from google_sheet_handler import GoogleSheetHandler
# duplicate_checker.py v2.3 (Rückkehr zum v2.0-Scoring, erweitert mit Logging)
# Version: 2025-08-06_16-00
# --- Configuration ---
CRM_SHEET_NAME = "CRM_Accounts"
MATCHING_SHEET_NAME = "Matching_Accounts"
# v2.0 threshold on the 0-190 scale (domain 100 + name up to 70 + location 20).
SCORE_THRESHOLD = 80
LOG_DIR = "Log"

# --- Logging setup with a timestamp in the log file name ---
# exist_ok=True already tolerates an existing directory, so a separate
# os.path.exists() pre-check is unnecessary (and would only add a
# check-then-create race).
os.makedirs(LOG_DIR, exist_ok=True)
now = datetime.now().strftime('%Y-%m-%d_%H-%M')
log_path = os.path.join(LOG_DIR, f"{now}_Duplicate_v2.3.log")

logger = logging.getLogger(__name__)
# The logger itself passes everything; each handler filters by its own level.
logger.setLevel(logging.DEBUG)

# Console handler (INFO and above)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(logging.Formatter("%(asctime)s - %(levelname)-8s - %(message)s"))
logger.addHandler(ch)

# File handler (DEBUG and above), appending to the timestamped log file
fh = logging.FileHandler(log_path, mode='a', encoding='utf-8')
fh.setLevel(logging.DEBUG)
fh.setFormatter(logging.Formatter("%(asctime)s - %(levelname)-8s - %(name)s - %(message)s"))
logger.addHandler(fh)

logger.info(f"Logging in Datei: {log_path}")
logger.info("Version: duplicate_checker.py v2.3 (v2.0-Scoring mit Logging) | Build: 2025-08-06_16-00")
def calculate_similarity(record1, record2):
    """Compute the v2.0 similarity score of two account records.

    The score is the sum of three independent parts:

    * 100 if both records share the same non-empty ``normalized_domain``;
    * ``fuzz.token_set_ratio`` of the two ``normalized_name`` values
      multiplied by 0.7, when both names are non-empty;
    * 20 if ``CRM Ort`` and ``CRM Land`` are both non-empty in ``record1``
      and equal to the corresponding values in ``record2``.

    Args:
        record1: Mapping-like record supporting ``.get()`` (for example a
            dict or a pandas Series).
        record2: Mapping-like record supporting ``.get()``.

    Returns:
        The summed score rounded to the nearest integer.
    """
    total_score = 0

    # Exact domain match; an empty/missing domain never counts as a match.
    domain = record1.get('normalized_domain')
    if domain and domain == record2.get('normalized_domain'):
        total_score += 100

    # Fuzzy name match, weighted with 0.7.
    name1 = record1.get('normalized_name', '')
    name2 = record2.get('normalized_name', '')
    if name1 and name2:
        total_score += fuzz.token_set_ratio(name1, name2) * 0.7

    # Exact city + country match. Both values must be present: two records
    # that merely both lack location data (None == None or '' == '') must
    # not be rewarded as if their locations agreed.
    city = record1.get('CRM Ort')
    country = record1.get('CRM Land')
    if (city and country
            and city == record2.get('CRM Ort')
            and country == record2.get('CRM Land')):
        total_score += 20

    return round(total_score)
def _block_key(normalized_name):
    """Return the first whitespace-separated token of the name, or None.

    None is returned for an empty name and for a name that consists only of
    whitespace (where ``split()`` yields no tokens).
    """
    tokens = normalized_name.split() if normalized_name else []
    return tokens[0] if tokens else None


def main():
    """Run the duplicate check and write the results back to the sheet.

    Steps:

    1. Load the CRM and matching sheets through ``GoogleSheetHandler``.
    2. Normalize name, website, city and country in both frames and derive a
       blocking key (first token of the normalized name).
    3. Group the CRM rows by blocking key.
    4. Score every matching row against the CRM rows sharing its key using
       ``calculate_similarity`` and keep the best candidate; its name is
       reported only if the score reaches ``SCORE_THRESHOLD``.
    5. Write the four input columns plus the two result columns to
       ``MATCHING_SHEET_NAME`` via ``clear_and_write_data``.

    Exits the process with status 1 if the sheet handler cannot be created
    and returns early if either sheet is missing or empty.
    """
    logger.info("Starte Duplikats-Check v2.3 (v2.0-Scoring mit Logging)")
    try:
        sheet = GoogleSheetHandler()
        logger.info("GoogleSheetHandler initialisiert")
    except Exception as e:
        logger.critical(f"FEHLER Init GoogleSheetHandler: {e}")
        sys.exit(1)

    # Load data
    crm_df = sheet.get_sheet_as_dataframe(CRM_SHEET_NAME)
    match_df = sheet.get_sheet_as_dataframe(MATCHING_SHEET_NAME)
    if crm_df is None or crm_df.empty or match_df is None or match_df.empty:
        logger.critical("Daten fehlen. Abbruch.")
        return
    logger.info(f"{len(crm_df)} CRM- und {len(match_df)} Matching-Zeilen geladen")

    # Normalization and blocking key (both frames are modified in place)
    for df, label in [(crm_df, 'CRM'), (match_df, 'Matching')]:
        df['normalized_name'] = df['CRM Name'].astype(str).apply(normalize_company_name)
        df['normalized_domain'] = df['CRM Website'].astype(str).apply(simple_normalize_url)
        df['CRM Ort'] = df['CRM Ort'].astype(str).str.lower().str.strip()
        df['CRM Land'] = df['CRM Land'].astype(str).str.lower().str.strip()
        df['block_key'] = df['normalized_name'].apply(_block_key)
        # Safe: both frames were checked to be non-empty above.
        logger.debug(f"{label}-Sample: {df.iloc[0][['normalized_name','normalized_domain','block_key']].to_dict()}")

    # Build the blocking index: block key -> list of CRM rows
    crm_index = {}
    for _, row in crm_df.iterrows():
        key = row['block_key']
        if key:
            crm_index.setdefault(key, []).append(row)
    logger.info(f"Blocking-Index mit {len(crm_index)} Keys erstellt")

    # Match every row against the candidates of its block
    results = []
    total = len(match_df)
    # Count positions with enumerate: the DataFrame index label is not
    # guaranteed to be a 0-based integer, so "label + 1" is not a reliable
    # progress counter.
    for position, (_, mrow) in enumerate(match_df.iterrows(), start=1):
        key = mrow['block_key']
        candidates = crm_index.get(key, [])
        logger.info(f"Prüfe {position}/{total}: '{mrow['CRM Name']}' (Key='{key}') -> {len(candidates)} Kandidaten")
        if not candidates:
            results.append({'Potenzieller Treffer im CRM': '', 'Ähnlichkeits-Score': 0})
            continue

        # Score each candidate
        scored = [(crow['CRM Name'], calculate_similarity(mrow, crow)) for crow in candidates]
        # sorted() is stable even with reverse=True, so the first entry is
        # the same candidate max() would pick (first one with the top score).
        top3 = sorted(scored, key=lambda x: x[1], reverse=True)[:3]
        logger.debug(f" Top3 Kandidaten: {top3}")
        best_name, best_score = top3[0]

        if best_score >= SCORE_THRESHOLD:
            results.append({'Potenzieller Treffer im CRM': best_name, 'Ähnlichkeits-Score': best_score})
            logger.info(f" --> Match: '{best_name}' mit Score {best_score}")
        else:
            results.append({'Potenzieller Treffer im CRM': '', 'Ähnlichkeits-Score': best_score})
            logger.info(f" --> Kein Match (höchster Score {best_score})")

    # Write results back; results has one entry per matching row in order,
    # so a positional concat after reset_index lines the columns up.
    out = pd.DataFrame(results)
    output = pd.concat(
        [match_df[['CRM Name', 'CRM Website', 'CRM Ort', 'CRM Land']].reset_index(drop=True), out],
        axis=1,
    )
    data = [output.columns.tolist()] + output.values.tolist()
    ok = sheet.clear_and_write_data(MATCHING_SHEET_NAME, data)
    if ok:
        logger.info("Ergebnisse erfolgreich geschrieben")
    else:
        logger.error("Fehler beim Schreiben ins Google Sheet")
# Run the duplicate check only when executed as a script, not on import.
if __name__ == '__main__':
    main()