duplicate_checker.py aktualisiert
This commit is contained in:
@@ -1,3 +1,4 @@
|
|||||||
|
import os
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@@ -14,11 +15,37 @@ WEIGHTS = {
|
|||||||
'name': 0.4,
|
'name': 0.4,
|
||||||
'city': 0.1,
|
'city': 0.1,
|
||||||
}
|
}
|
||||||
|
LOG_DIR = '/log'
|
||||||
|
LOG_FILENAME = 'duplicate_check.log'
|
||||||
|
|
||||||
# --- Logging Setup ---
|
# --- Logging Setup ---
|
||||||
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)s - %(message)s'
|
if not os.path.exists(LOG_DIR):
|
||||||
logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT)
|
try:
|
||||||
|
os.makedirs(LOG_DIR)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Warnung: Konnte Log-Ordner nicht anlegen: {e}")
|
||||||
|
log_path = os.path.join(LOG_DIR, LOG_FILENAME)
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
formatter = logging.Formatter('%(asctime)s - %(levelname)-8s - %(name)s - %(message)s')
|
||||||
|
|
||||||
|
# Console Handler
|
||||||
|
console_handler = logging.StreamHandler()
|
||||||
|
console_handler.setLevel(logging.INFO)
|
||||||
|
console_handler.setFormatter(formatter)
|
||||||
|
logger.addHandler(console_handler)
|
||||||
|
|
||||||
|
# File Handler
|
||||||
|
try:
|
||||||
|
file_handler = logging.FileHandler(log_path, mode='a', encoding='utf-8')
|
||||||
|
file_handler.setLevel(logging.DEBUG)
|
||||||
|
file_handler.setFormatter(formatter)
|
||||||
|
logger.addHandler(file_handler)
|
||||||
|
logger.info(f"Logging auch in Datei: {log_path}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Konnte keine Log-Datei schreiben: {e}")
|
||||||
|
|
||||||
# --- Hilfsfunktionen ---
|
# --- Hilfsfunktionen ---
|
||||||
def normalize_company_name(name: str) -> str:
|
def normalize_company_name(name: str) -> str:
|
||||||
@@ -41,7 +68,7 @@ def normalize_domain(url: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
logger.info("Starte den Duplikats-Check (v2.0 mit Blocking und Maximum Logging)...")
|
logger.info("Starte den Duplikats-Check (v2.0 mit Logging in /log)...")
|
||||||
# GoogleSheetHandler initialisieren
|
# GoogleSheetHandler initialisieren
|
||||||
try:
|
try:
|
||||||
sheet_handler = GoogleSheetHandler()
|
sheet_handler = GoogleSheetHandler()
|
||||||
@@ -71,7 +98,7 @@ def main():
|
|||||||
df['norm_name'] = df['CRM Name'].fillna('').apply(normalize_company_name)
|
df['norm_name'] = df['CRM Name'].fillna('').apply(normalize_company_name)
|
||||||
df['norm_domain'] = df['CRM Website'].fillna('').apply(normalize_domain)
|
df['norm_domain'] = df['CRM Website'].fillna('').apply(normalize_domain)
|
||||||
df['city'] = df['CRM Ort'].fillna('').apply(lambda x: str(x).casefold().strip())
|
df['city'] = df['CRM Ort'].fillna('').apply(lambda x: str(x).casefold().strip())
|
||||||
logger.debug(f"{label}-Daten nach Normalisierung. Erste Zeile: {df.iloc[0].to_dict()}")
|
logger.debug(f"{label}-Daten normalisiert. Erste Zeile: {df.iloc[0].to_dict()}")
|
||||||
|
|
||||||
# Blocking per Domain
|
# Blocking per Domain
|
||||||
indexer = recordlinkage.Index()
|
indexer = recordlinkage.Index()
|
||||||
@@ -100,7 +127,7 @@ def main():
|
|||||||
for match_idx, group in features.reset_index().groupby('level_1'):
|
for match_idx, group in features.reset_index().groupby('level_1'):
|
||||||
logger.info(f"--- Prüfe: Zeile {match_idx} ---")
|
logger.info(f"--- Prüfe: Zeile {match_idx} ---")
|
||||||
df_block = group.sort_values('score', ascending=False)
|
df_block = group.sort_values('score', ascending=False)
|
||||||
logger.debug(f" Kandidaten für Zeile {match_idx}:\n{df_block[['level_0','score','domain','name_sim','city']].to_string(index=False)}")
|
logger.debug(f"Kandidaten für Zeile {match_idx}:\n{df_block[['level_0','score','domain','name_sim','city']].to_string(index=False)}")
|
||||||
top = df_block.iloc[0]
|
top = df_block.iloc[0]
|
||||||
crm_idx = top['level_0'] if top['score'] >= SCORE_THRESHOLD else None
|
crm_idx = top['level_0'] if top['score'] >= SCORE_THRESHOLD else None
|
||||||
if crm_idx is not None:
|
if crm_idx is not None:
|
||||||
|
|||||||
Reference in New Issue
Block a user