duplicate_checker.py aktualisiert

This commit is contained in:
2025-08-05 14:38:52 +00:00
parent 4cb3e12c21
commit 6cf123d98e

View File

@@ -1,3 +1,4 @@
import os
import re
import logging
import pandas as pd
@@ -14,11 +15,37 @@ WEIGHTS = {
'name': 0.4,
'city': 0.1,
}
# --- Logging Setup ---
# Destination: console (INFO+) and /log/duplicate_check.log (DEBUG+).
LOG_DIR = '/log'
LOG_FILENAME = 'duplicate_check.log'
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)s - %(message)s'

# Create the log directory up front; exist_ok avoids the check-then-create race.
try:
    os.makedirs(LOG_DIR, exist_ok=True)
except OSError as e:
    print(f"Warnung: Konnte Log-Ordner nicht anlegen: {e}")

log_path = os.path.join(LOG_DIR, LOG_FILENAME)

# Module logger. NOTE: no logging.basicConfig() here — the previous call
# installed an additional root-logger StreamHandler at DEBUG level, so every
# record was printed twice on the console (once by our INFO handler, once by
# root, which also leaked the DEBUG records the console was meant to filter).
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(LOG_FORMAT)  # reuse the single format constant

# Console handler: INFO and above.
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

# File handler: full DEBUG output. Best effort — if the file cannot be
# opened the script keeps running with console-only logging.
try:
    file_handler = logging.FileHandler(log_path, mode='a', encoding='utf-8')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logger.info(f"Logging auch in Datei: {log_path}")
except OSError as e:
    logger.warning(f"Konnte keine Log-Datei schreiben: {e}")
# --- Hilfsfunktionen ---
def normalize_company_name(name: str) -> str:
@@ -41,7 +68,7 @@ def normalize_domain(url: str) -> str:
def main():
logger.info("Starte den Duplikats-Check (v2.0 mit Blocking und Maximum Logging)...")
logger.info("Starte den Duplikats-Check (v2.0 mit Logging in /log)...")
# GoogleSheetHandler initialisieren
try:
sheet_handler = GoogleSheetHandler()
@@ -71,7 +98,7 @@ def main():
df['norm_name'] = df['CRM Name'].fillna('').apply(normalize_company_name)
df['norm_domain'] = df['CRM Website'].fillna('').apply(normalize_domain)
df['city'] = df['CRM Ort'].fillna('').apply(lambda x: str(x).casefold().strip())
logger.debug(f"{label}-Daten nach Normalisierung. Erste Zeile: {df.iloc[0].to_dict()}")
logger.debug(f"{label}-Daten normalisiert. Erste Zeile: {df.iloc[0].to_dict()}")
# Blocking per Domain
indexer = recordlinkage.Index()
@@ -100,7 +127,7 @@ def main():
for match_idx, group in features.reset_index().groupby('level_1'):
logger.info(f"--- Prüfe: Zeile {match_idx} ---")
df_block = group.sort_values('score', ascending=False)
logger.debug(f" Kandidaten für Zeile {match_idx}:\n{df_block[['level_0','score','domain','name_sim','city']].to_string(index=False)}")
logger.debug(f"Kandidaten für Zeile {match_idx}:\n{df_block[['level_0','score','domain','name_sim','city']].to_string(index=False)}")
top = df_block.iloc[0]
crm_idx = top['level_0'] if top['score'] >= SCORE_THRESHOLD else None
if crm_idx is not None: