duplicate_checker.py aktualisiert
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
import os
|
||||
import re
|
||||
import logging
|
||||
import pandas as pd
|
||||
@@ -14,11 +15,37 @@ WEIGHTS = {
|
||||
'name': 0.4,
|
||||
'city': 0.1,
|
||||
}
|
||||
LOG_DIR = '/log'
LOG_FILENAME = 'duplicate_check.log'

# --- Logging Setup ---
# Single source of truth for the record layout; shared by the root
# configuration and by this module's own handlers.
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)s - %(message)s'

# Root configuration so that loggers of imported modules are visible too.
logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT)

# Best effort: a missing or unwritable log directory must not abort the
# script. exist_ok=True avoids the check-then-create race of a separate
# os.path.exists() test.
try:
    os.makedirs(LOG_DIR, exist_ok=True)
except OSError as e:
    # The module logger is not configured yet, so report via stdout.
    print(f"Warnung: Konnte Log-Ordner nicht anlegen: {e}")

log_path = os.path.join(LOG_DIR, LOG_FILENAME)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# This logger has its own handlers. Without this, every record would also
# propagate to the root handler installed by basicConfig() above, printing
# INFO+ messages twice and leaking DEBUG records to the console despite the
# console handler's INFO threshold.
logger.propagate = False

formatter = logging.Formatter(LOG_FORMAT)

# Console Handler: INFO and above only, to keep the terminal readable.
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

# File Handler: full DEBUG detail, appended across runs.
try:
    file_handler = logging.FileHandler(log_path, mode='a', encoding='utf-8')
except OSError as e:
    # Keep running with console-only logging if the file cannot be opened.
    logger.warning("Konnte keine Log-Datei schreiben: %s", e)
else:
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logger.info("Logging auch in Datei: %s", log_path)
|
||||
|
||||
# --- Hilfsfunktionen ---
|
||||
def normalize_company_name(name: str) -> str:
|
||||
@@ -41,7 +68,7 @@ def normalize_domain(url: str) -> str:
|
||||
|
||||
|
||||
def main():
|
||||
logger.info("Starte den Duplikats-Check (v2.0 mit Blocking und Maximum Logging)...")
|
||||
logger.info("Starte den Duplikats-Check (v2.0 mit Logging in /log)...")
|
||||
# GoogleSheetHandler initialisieren
|
||||
try:
|
||||
sheet_handler = GoogleSheetHandler()
|
||||
@@ -71,7 +98,7 @@ def main():
|
||||
df['norm_name'] = df['CRM Name'].fillna('').apply(normalize_company_name)
|
||||
df['norm_domain'] = df['CRM Website'].fillna('').apply(normalize_domain)
|
||||
df['city'] = df['CRM Ort'].fillna('').apply(lambda x: str(x).casefold().strip())
|
||||
logger.debug(f"{label}-Daten nach Normalisierung. Erste Zeile: {df.iloc[0].to_dict()}")
|
||||
logger.debug(f"{label}-Daten normalisiert. Erste Zeile: {df.iloc[0].to_dict()}")
|
||||
|
||||
# Blocking per Domain
|
||||
indexer = recordlinkage.Index()
|
||||
@@ -100,7 +127,7 @@ def main():
|
||||
for match_idx, group in features.reset_index().groupby('level_1'):
|
||||
logger.info(f"--- Prüfe: Zeile {match_idx} ---")
|
||||
df_block = group.sort_values('score', ascending=False)
|
||||
logger.debug(f" Kandidaten für Zeile {match_idx}:\n{df_block[['level_0','score','domain','name_sim','city']].to_string(index=False)}")
|
||||
logger.debug(f"Kandidaten für Zeile {match_idx}:\n{df_block[['level_0','score','domain','name_sim','city']].to_string(index=False)}")
|
||||
top = df_block.iloc[0]
|
||||
crm_idx = top['level_0'] if top['score'] >= SCORE_THRESHOLD else None
|
||||
if crm_idx is not None:
|
||||
|
||||
Reference in New Issue
Block a user