Add Logging

This commit is contained in:
2025-08-04 05:35:07 +00:00
parent c0db46d2a8
commit bc9591a409

View File

@@ -1,119 +1,148 @@
# duplicate_checker.py (v2.2 - Multi-Key Blocking & optimiertes Scoring) # duplicate_checker.py (v2.0 - Maximum Logging)
import logging import logging
import pandas as pd import pandas as pd
from thefuzz import fuzz from thefuzz import fuzz
from config import Config from config import Config
from helpers import normalize_company_name, simple_normalize_url from helpers import normalize_company_name, simple_normalize_url, create_log_filename
from google_sheet_handler import GoogleSheetHandler from google_sheet_handler import GoogleSheetHandler
from collections import defaultdict import time
# --- Konfiguration --- # --- Konfiguration ---
CRM_SHEET_NAME = "CRM_Accounts" CRM_SHEET_NAME = "CRM_Accounts"
MATCHING_SHEET_NAME = "Matching_Accounts" MATCHING_SHEET_NAME = "Matching_Accounts"
SCORE_THRESHOLD = 85 # Etwas höherer Schwellenwert für bessere Präzision SCORE_THRESHOLD = 80
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') # --- VOLLSTÄNDIGES LOGGING SETUP ---
LOG_LEVEL = logging.DEBUG if Config.DEBUG else logging.INFO
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)s - %(message)s'
def calculate_similarity_details(record1, record2): root_logger = logging.getLogger()
"""Berechnet einen gewichteten Ähnlichkeits-Score und gibt die Details zurück.""" root_logger.setLevel(LOG_LEVEL)
for handler in root_logger.handlers[:]:
root_logger.removeHandler(handler)
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(logging.Formatter(LOG_FORMAT))
root_logger.addHandler(stream_handler)
log_file_path = create_log_filename("duplicate_check_v2_0")
if log_file_path:
file_handler = logging.FileHandler(log_file_path, mode='a', encoding='utf-8')
file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
root_logger.addHandler(file_handler)
logger = logging.getLogger(__name__)
def calculate_similarity_with_details(record1, record2):
"""
Berechnet einen gewichteten Ähnlichkeits-Score zwischen zwei Datensätzen
und gibt die Details für die Begründung zurück.
"""
scores = {'name': 0, 'location': 0, 'domain': 0} scores = {'name': 0, 'location': 0, 'domain': 0}
if record1.get('normalized_domain') and record1['normalized_domain'] != 'k.a.' and record1['normalized_domain'] == record2.get('normalized_domain'): if record1['normalized_domain'] and record1['normalized_domain'] != 'k.a.' and record1['normalized_domain'] == record2['normalized_domain']:
scores['domain'] = 100 scores['domain'] = 100
# Höhere Gewichtung für den Namen, da die Website oft fehlt if record1['normalized_name'] and record2['normalized_name']:
if record1.get('normalized_name') and record2.get('normalized_name'): name_similarity = fuzz.token_set_ratio(record1['normalized_name'], record2['normalized_name'])
scores['name'] = round(fuzz.token_set_ratio(record1['normalized_name'], record2['normalized_name']) * 0.85) scores['name'] = round(name_similarity * 0.7)
if record1.get('CRM Ort') and record1['CRM Ort'] == record2.get('CRM Ort'): if record1['CRM Ort'] and record1['CRM Ort'] == record2['CRM Ort']:
if record1.get('CRM Land') and record1['CRM Land'] == record2.get('CRM Land'): if record1['CRM Land'] and record1['CRM Land'] == record2['CRM Land']:
scores['location'] = 20 scores['location'] = 20
total_score = sum(scores.values()) total_score = sum(scores.values())
return {'total': total_score, 'details': scores}
def create_blocking_keys(name): reasons = []
"""Erstellt mehrere Blocking Keys für einen Namen, um die Sensitivität zu erhöhen.""" if scores['domain'] > 0: reasons.append(f"Domain({scores['domain']})")
if not name: if scores['name'] > 0: reasons.append(f"Name({scores['name']})")
return [] if scores['location'] > 0: reasons.append(f"Ort({scores['location']})")
reason_text = " + ".join(reasons) if reasons else "Keine Übereinstimmung"
words = name.split() return round(total_score), reason_text
keys = set()
# 1. Erstes Wort
if len(words) > 0:
keys.add(words[0])
# 2. Zweites Wort (falls vorhanden)
if len(words) > 1:
keys.add(words[1])
# 3. Erste 4 Buchstaben des ersten Wortes
if len(words) > 0 and len(words[0]) >= 4:
keys.add(words[0][:4])
return list(keys)
def main(): def main():
logging.info("Starte den Duplikats-Check (v2.2 mit Multi-Key Blocking)...") """Hauptfunktion zum Laden, Vergleichen und Schreiben der Daten."""
start_time = time.time()
logger.info("Starte den Duplikats-Check (v2.0 mit Blocking und Maximum Logging)...")
logger.info(f"Logdatei: {log_file_path}")
try: try:
sheet_handler = GoogleSheetHandler() sheet_handler = GoogleSheetHandler()
except Exception as e: except Exception as e:
logging.critical(f"FEHLER bei Initialisierung: {e}") logger.critical(f"FEHLER bei Initialisierung des GoogleSheetHandler: {e}")
return return
logging.info(f"Lade Master-Daten aus '{CRM_SHEET_NAME}'...") logger.info(f"Lade Master-Daten aus '{CRM_SHEET_NAME}'...")
crm_df = sheet_handler.get_sheet_as_dataframe(CRM_SHEET_NAME) crm_df = sheet_handler.get_sheet_as_dataframe(CRM_SHEET_NAME)
if crm_df is None or crm_df.empty: return if crm_df is None or crm_df.empty:
return
logging.info(f"Lade zu prüfende Daten aus '{MATCHING_SHEET_NAME}'...") logger.info(f"Lade zu prüfende Daten aus '{MATCHING_SHEET_NAME}'...")
matching_df = sheet_handler.get_sheet_as_dataframe(MATCHING_SHEET_NAME) matching_df = sheet_handler.get_sheet_as_dataframe(MATCHING_SHEET_NAME)
if matching_df is None or matching_df.empty: return if matching_df is None or matching_df.empty:
return
original_matching_df = matching_df.copy() original_matching_df = matching_df.copy()
logging.info("Normalisiere Daten für den Vergleich...") logger.info("Normalisiere Daten für den Vergleich...")
for df in [crm_df, matching_df]: for df in [crm_df, matching_df]:
df['normalized_name'] = df['CRM Name'].astype(str).apply(normalize_company_name) df['normalized_name'] = df['CRM Name'].astype(str).apply(normalize_company_name)
df['normalized_domain'] = df['CRM Website'].astype(str).apply(simple_normalize_url) df['normalized_domain'] = df['CRM Website'].astype(str).apply(simple_normalize_url)
df['CRM Ort'] = df['CRM Ort'].astype(str).str.lower().str.strip() df['CRM Ort'] = df['CRM Ort'].astype(str).str.lower().str.strip()
df['CRM Land'] = df['CRM Land'].astype(str).str.lower().str.strip() df['CRM Land'] = df['CRM Land'].astype(str).str.lower().str.strip()
df['block_keys'] = df['normalized_name'].apply(create_blocking_keys) df['block_key'] = df['normalized_name'].apply(lambda x: x.split()[0] if x and x.split() else None)
logging.info("Erstelle Index für CRM-Daten zur Beschleunigung...") logger.info("Erstelle Index für CRM-Daten zur Beschleunigung...")
crm_index = defaultdict(list) crm_index = {}
for record in crm_df.to_dict('records'): crm_records = crm_df.to_dict('records')
for key in record['block_keys']: for record in crm_records:
key = record['block_key']
if key:
if key not in crm_index:
crm_index[key] = []
crm_index[key].append(record) crm_index[key].append(record)
logging.info("Starte Matching-Prozess...") logger.info("Starte Matching-Prozess...")
results = [] results = []
for match_record in matching_df.to_dict('records'): for match_record in matching_df.to_dict('records'):
best_score_info = {'total': 0, 'details': {'name': 0, 'location': 0, 'domain': 0}} best_score = -1
best_match_name = "" best_match_name = ""
best_reason = ""
logging.info(f"Prüfe: {match_record['CRM Name']}...") logger.info(f"--- Prüfe: '{match_record.get('CRM Name', 'N/A')}' ---")
logger.debug(f" [Normalisiert: '{match_record.get('normalized_name')}', Domain: '{match_record.get('normalized_domain')}', Key: '{match_record.get('block_key')}']")
candidate_pool = {} block_key = match_record.get('block_key')
for key in match_record['block_keys']: candidates = crm_index.get(block_key, [])
for crm_record in crm_index.get(key, []):
candidate_pool[crm_record['CRM Name']] = crm_record
if not candidate_pool: if not candidates:
logging.debug(" -> Keine Kandidaten im Index gefunden.") logger.debug(" -> Keine Kandidaten im Index gefunden. Überspringe Vergleich.")
results.append({
'Potenzieller Treffer im CRM': "", 'Ähnlichkeits-Score': 0, 'Matching-Grund': "Keine Kandidaten"
})
continue
for crm_record in candidate_pool.values(): logger.debug(f" -> Vergleiche mit {len(candidates)} Kandidaten aus Block '{block_key}'.")
score_info = calculate_similarity_details(match_record, crm_record)
if score_info['total'] > best_score_info['total']: for crm_row in candidates:
best_score_info = score_info score, reason = calculate_similarity_with_details(match_record, crm_row)
best_match_name = crm_record['CRM Name']
if score > 0: # Logge jeden Vergleich, der einen Score > 0 hat
logger.debug(f" - Kandidat: '{crm_row.get('CRM Name', 'N/A')}' -> Score: {score} (Grund: {reason})")
if score > best_score:
best_score = score
best_match_name = crm_row.get('CRM Name', 'N/A')
best_reason = reason
logger.info(f" --> Neuer bester Treffer: '{best_match_name}' mit Score {best_score}")
results.append({ results.append({
'Potenzieller Treffer im CRM': best_match_name if best_score_info['total'] >= SCORE_THRESHOLD else "", 'Potenzieller Treffer im CRM': best_match_name if best_score >= SCORE_THRESHOLD else "",
'Score (Gesamt)': best_score_info['total'], 'Ähnlichkeits-Score': best_score,
'Score (Name)': best_score_info['details']['name'], 'Matching-Grund': best_reason
'Bonus (Standort)': best_score_info['details']['location'],
'Bonus (Domain)': best_score_info['details']['domain']
}) })
logging.info("Matching abgeschlossen. Schreibe Ergebnisse zurück ins Sheet...") logging.info("Matching abgeschlossen. Schreibe Ergebnisse zurück ins Sheet...")
@@ -129,5 +158,9 @@ def main():
else: else:
logging.error("FEHLER beim Schreiben der Ergebnisse ins Google Sheet.") logging.error("FEHLER beim Schreiben der Ergebnisse ins Google Sheet.")
end_time = time.time()
logger.info(f"Gesamtdauer des Duplikats-Checks: {end_time - start_time:.2f} Sekunden.")
logger.info(f"===== Skript beendet =====")
if __name__ == "__main__": if __name__ == "__main__":
main() main()