duplicate_checker.py aktualisiert

This commit is contained in:
2025-08-06 13:35:25 +00:00
parent 4c38af9781
commit f594a54fbf

View File

@@ -2,83 +2,86 @@ import os
import sys import sys
import logging import logging
import pandas as pd import pandas as pd
from datetime import datetime
from thefuzz import fuzz from thefuzz import fuzz
from helpers import normalize_company_name, simple_normalize_url from helpers import normalize_company_name, simple_normalize_url
from google_sheet_handler import GoogleSheetHandler from google_sheet_handler import GoogleSheetHandler
# duplicate_checker.py v2.6 (original v2.0 core + logging)
# Version: 2025-08-06_17-15

# --- Configuration ---
CRM_SHEET_NAME = "CRM_Accounts"            # sheet holding the CRM master records
MATCHING_SHEET_NAME = "Matching_Accounts"  # sheet holding the records to check
SCORE_THRESHOLD = 80  # minimum score for a candidate to be logged as a match
LOG_DIR = "Log"
LOG_FILE = "duplicate_check_v2.6.log"

# --- Logging setup ---
# exist_ok=True already tolerates an existing directory, so no separate
# os.path.exists() check is needed.
os.makedirs(LOG_DIR, exist_ok=True)
log_path = os.path.join(LOG_DIR, LOG_FILE)

# Global logging config: every record (DEBUG and up) goes both to stdout and,
# appended, to the log file.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)-8s - %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(log_path, mode='a', encoding='utf-8'),
    ],
)
logger = logging.getLogger(__name__)
logger.info(f"Starting duplicate_checker.py v2.6 | Log: {log_path}")
def calculate_similarity(record1, record2):
    """Compute a weighted similarity score (0-190) for two account records.

    Both records are read only through ``.get(key)``, so any mapping-like
    object that provides ``get`` works.

    The score is the sum of three parts:
      * 100 if both records have the same non-empty ``'normalized_domain'``;
      * 0.7 times ``fuzz.token_set_ratio`` of the two ``'normalized_name'``
        values, when both are non-empty;
      * 20 if ``'CRM Ort'`` (city) and ``'CRM Land'`` (country) are both
        non-empty in ``record1`` and equal to the values in ``record2``.

    Returns:
        The total score rounded to the nearest integer.
    """
    total_score = 0

    # Exact domain match; an empty domain never counts as a match.
    dom1 = record1.get('normalized_domain', '')
    dom2 = record2.get('normalized_domain', '')
    if dom1 and dom1 == dom2:
        total_score += 100

    # Fuzzy name match, weighted at 70 %.
    name1 = record1.get('normalized_name', '')
    name2 = record2.get('normalized_name', '')
    if name1 and name2:
        name_similarity = fuzz.token_set_ratio(name1, name2)
        total_score += name_similarity * 0.7

    # City + country must both be present and identical. The truthiness
    # guards stop two records that both lack location data from earning
    # the bonus through a None == None / '' == '' comparison.
    city = record1.get('CRM Ort')
    country = record1.get('CRM Land')
    if (city and country
            and city == record2.get('CRM Ort')
            and country == record2.get('CRM Land')):
        total_score += 20

    return round(total_score)
def main(): def main():
logger.info("Starte Duplikats-Check v2.4 (root-domain match)") logger.info("Starte Duplikats-Check v2.6 (Original v2.0 Kern mit Logging)")
try: try:
sheet = GoogleSheetHandler() sheet_handler = GoogleSheetHandler()
logger.info("GoogleSheetHandler initialisiert") logger.info("GoogleSheetHandler initialisiert")
except Exception as e: except Exception as e:
logger.critical(f"FEHLER Init GoogleSheetHandler: {e}") logger.critical(f"FEHLER Init GoogleSheetHandler: {e}")
sys.exit(1) sys.exit(1)
# Daten laden # Load data
crm_df = sheet.get_sheet_as_dataframe(CRM_SHEET_NAME) logger.info(f"Lade Master-Daten aus '{CRM_SHEET_NAME}'...")
match_df = sheet.get_sheet_as_dataframe(MATCHING_SHEET_NAME) crm_df = sheet_handler.get_sheet_as_dataframe(CRM_SHEET_NAME)
if crm_df is None or crm_df.empty or match_df is None or match_df.empty: if crm_df is None or crm_df.empty:
logger.critical("Daten fehlen. Abbruch.") logger.critical(f"Keine Daten in '{CRM_SHEET_NAME}'. Abbruch.")
return return
logger.info(f"{len(crm_df)} CRM- und {len(match_df)} Matching-Zeilen geladen") logger.info(f"{len(crm_df)} CRM-Datensätze geladen")
# Normalisierung & Blocking-Key logger.info(f"Lade Matching-Daten aus '{MATCHING_SHEET_NAME}'...")
match_df = sheet_handler.get_sheet_as_dataframe(MATCHING_SHEET_NAME)
if match_df is None or match_df.empty:
logger.critical(f"Keine Daten in '{MATCHING_SHEET_NAME}'. Abbruch.")
return
logger.info(f"{len(match_df)} Matching-Datensätze geladen")
# Normalize
logger.info("Normalisiere Daten...")
for df, label in [(crm_df, 'CRM'), (match_df, 'Matching')]: for df, label in [(crm_df, 'CRM'), (match_df, 'Matching')]:
df['normalized_name'] = df['CRM Name'].astype(str).apply(normalize_company_name) df['normalized_name'] = df['CRM Name'].astype(str).apply(normalize_company_name)
df['normalized_domain'] = df['CRM Website'].astype(str).apply(simple_normalize_url) df['normalized_domain'] = df['CRM Website'].astype(str).apply(simple_normalize_url)
@@ -87,42 +90,46 @@ def main():
df['block_key'] = df['normalized_name'].apply(lambda x: x.split()[0] if x else None) df['block_key'] = df['normalized_name'].apply(lambda x: x.split()[0] if x else None)
logger.debug(f"{label}-Sample: {df.iloc[0][['normalized_name','normalized_domain','block_key']].to_dict()}") logger.debug(f"{label}-Sample: {df.iloc[0][['normalized_name','normalized_domain','block_key']].to_dict()}")
# Blocking-Index bauen # Build blocking index
logger.info("Erstelle Blocking-Index...")
crm_index = {} crm_index = {}
for idx, row in crm_df.iterrows(): for idx, row in crm_df.iterrows():
key = row['block_key'] key = row['block_key']
if key: if key:
crm_index.setdefault(key, []).append(row) crm_index.setdefault(key, []).append(row)
logger.info(f"Blocking-Index mit {len(crm_index)} Keys erstellt") logger.info(f"Blocking-Index erstellt mit {len(crm_index)} Keys")
# Matching mit Top-3-Log # Matching
logger.info("Starte Matching-Prozess...")
results = [] results = []
total = len(match_df) total = len(match_df)
for i, mrow in match_df.iterrows(): for i, mrow in match_df.iterrows():
key = mrow['block_key'] key = mrow['block_key']
cands = crm_index.get(key, []) candidates = crm_index.get(key, [])
logger.info(f"Prüfe {i+1}/{total}: '{mrow['CRM Name']}' (Key='{key}') -> {len(cands)} Kandidaten") logger.info(f"Prüfe {i+1}/{total}: '{mrow['CRM Name']}' -> {len(candidates)} Kandidaten")
if not cands: if not candidates:
results.append({'Potenzieller Treffer im CRM': '', 'Ähnlichkeits-Score': 0}) results.append({'Potenzieller Treffer im CRM': '', 'Ähnlichkeits-Score': 0})
continue continue
# Score für Kandidaten scored = [(crow['CRM Name'], calculate_similarity(mrow, crow)) for crow in candidates]
scored = [(crow['CRM Name'], calculate_similarity(mrow, crow)) for crow in cands] # Log Top-3 only
top3 = sorted(scored, key=lambda x: x[1], reverse=True)[:3] top3 = sorted(scored, key=lambda x: x[1], reverse=True)[:3]
logger.debug(f" Top3 Kandidaten: {top3}") logger.debug(f" Top3 Kandidaten: {top3}")
best_name, best_score = max(scored, key=lambda x: x[1]) best_name, best_score = max(scored, key=lambda x: x[1])
if best_score >= SCORE_THRESHOLD: if best_score >= SCORE_THRESHOLD:
results.append({'Potenzieller Treffer im CRM': best_name, 'Ähnlichkeits-Score': best_score}) results.append({'Potenzieller Treffer im CRM': best_name, 'Ähnlichkeits-Score': best_score})
logger.info(f" --> Match: '{best_name}' mit Score {best_score}") logger.info(f" --> Match: '{best_name}' Score={best_score}")
else: else:
results.append({'Potenzieller Treffer im CRM': '', 'Ähnlichkeits-Score': best_score}) results.append({'Potenzieller Treffer im CRM': best_name if best_name else '', 'Ähnlichkeits-Score': best_score})
logger.info(f" --> Kein Match (höchster Score {best_score})") logger.info(f" --> Kein Match (höchster Score {best_score})")
# Ergebnisse zurückschreiben # Write back
out_df = pd.DataFrame(results) logger.info("Schreibe Ergebnisse zurück ins Sheet...")
output = pd.concat([match_df[['CRM Name','CRM Website','CRM Ort','CRM Land']].reset_index(drop=True), out_df], axis=1) result_df = pd.DataFrame(results)
data = [output.columns.tolist()] + output.values.tolist() output_df = match_df[['CRM Name','CRM Website','CRM Ort','CRM Land']].copy()
ok = sheet.clear_and_write_data(MATCHING_SHEET_NAME, data) output_df = pd.concat([output_df.reset_index(drop=True), result_df], axis=1)
if ok: data_to_write = [output_df.columns.tolist()] + output_df.values.tolist()
success = sheet_handler.clear_and_write_data(MATCHING_SHEET_NAME, data_to_write)
if success:
logger.info("Ergebnisse erfolgreich geschrieben") logger.info("Ergebnisse erfolgreich geschrieben")
else: else:
logger.error("Fehler beim Schreiben ins Google Sheet") logger.error("Fehler beim Schreiben ins Google Sheet")