From b9a046bd0b0ece8e2fab79d98259bf246ec58ae1 Mon Sep 17 00:00:00 2001
From: Floke
Date: Wed, 6 Aug 2025 09:31:33 +0000
Subject: [PATCH] Add Logging

---
 duplicate_checker.py | 158 ++++++++++++++++++++++++------------------
 1 file changed, 88 insertions(+), 70 deletions(-)

diff --git a/duplicate_checker.py b/duplicate_checker.py
index 51ff913c..5c58a74d 100644
--- a/duplicate_checker.py
+++ b/duplicate_checker.py
@@ -1,114 +1,132 @@
-# duplicate_checker.py (v2.0 - mit Blocking-Strategie)
-
+import os
 import logging
 import pandas as pd
 from thefuzz import fuzz
-from config import Config
 from helpers import normalize_company_name, simple_normalize_url
 from google_sheet_handler import GoogleSheetHandler
 
 # --- Konfiguration ---
 CRM_SHEET_NAME = "CRM_Accounts"
 MATCHING_SHEET_NAME = "Matching_Accounts"
-SCORE_THRESHOLD = 80
+SCORE_THRESHOLD = 80  # scores at or above this value count as a match
+LOG_DIR = "Log"
+LOG_FILE = "duplicate_check.log"
+
+# --- Logging Setup ---
+os.makedirs(LOG_DIR, exist_ok=True)  # exist_ok=True makes a separate existence check unnecessary
+log_path = os.path.join(LOG_DIR, LOG_FILE)
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
+# Console handler: INFO+
+ch = logging.StreamHandler()
+ch.setLevel(logging.INFO)
+ch.setFormatter(logging.Formatter("%(asctime)s - %(levelname)-8s - %(message)s"))
+logger.addHandler(ch)
+
+# File handler: DEBUG+
+fh = logging.FileHandler(log_path, mode='a', encoding='utf-8')
+fh.setLevel(logging.DEBUG)
+fh.setFormatter(logging.Formatter("%(asctime)s - %(levelname)-8s - %(name)s - %(message)s"))
+logger.addHandler(fh)
+
+logger.info(f"Logging to file: {log_path}")
 
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 def calculate_similarity(record1, record2):
-    """Berechnet einen gewichteten Ähnlichkeits-Score zwischen zwei Datensätzen."""
+    """Compute a weighted similarity score between two records."""
     total_score = 0
+    # Exact domain match
     if record1['normalized_domain'] and record1['normalized_domain'] == record2['normalized_domain']:
         total_score += 100
-    if record1['normalized_name'] and record2['normalized_name']:
-        name_similarity = fuzz.token_set_ratio(record1['normalized_name'], record2['normalized_name'])
-        total_score += name_similarity * 0.7
-    if record1['CRM Ort'] and record1['CRM Ort'] == record2['CRM Ort']:
-        if record1['CRM Land'] and record1['CRM Land'] == record2['CRM Land']:
-            total_score += 20
+    # Fuzzy name match (keep the guard: two empty names must not score)
+    if record1['normalized_name'] and record2['normalized_name']:
+        name_similarity = fuzz.token_set_ratio(record1['normalized_name'], record2['normalized_name'])
+        total_score += name_similarity * 0.7
+    # Exact city+country match (keep the guard: two empty values must not score)
+    if record1['CRM Ort'] and record1['CRM Ort'] == record2['CRM Ort'] and record1['CRM Land'] == record2['CRM Land']:
+        total_score += 20
     return round(total_score)
 
+
 def main():
-    """Hauptfunktion zum Laden, Vergleichen und Schreiben der Daten."""
-    logging.info("Starte den Duplikats-Check (v2.0 mit Blocking)...")
-
+    logger.info("Starting duplicate check (v2.0 - with blocking & candidate logging)")
    try:
         sheet_handler = GoogleSheetHandler()
+        logger.info("GoogleSheetHandler initialized")
     except Exception as e:
-        logging.critical(f"FEHLER bei Initialisierung des GoogleSheetHandler: {e}")
+        logger.critical(f"ERROR initializing GoogleSheetHandler: {e}")
         return
 
-    logging.info(f"Lade Master-Daten aus '{CRM_SHEET_NAME}'...")
+    # Load data
     crm_df = sheet_handler.get_sheet_as_dataframe(CRM_SHEET_NAME)
+    match_df = sheet_handler.get_sheet_as_dataframe(MATCHING_SHEET_NAME)
     if crm_df is None or crm_df.empty:
-        logging.critical(f"Konnte keine Daten aus '{CRM_SHEET_NAME}' laden. Breche ab.")
+        logger.critical(f"No data in '{CRM_SHEET_NAME}'")
         return
-
-    logging.info(f"Lade zu prüfende Daten aus '{MATCHING_SHEET_NAME}'...")
-    matching_df = sheet_handler.get_sheet_as_dataframe(MATCHING_SHEET_NAME)
-    if matching_df is None or matching_df.empty:
-        logging.critical(f"Konnte keine Daten aus '{MATCHING_SHEET_NAME}' laden. Breche ab.")
+    if match_df is None or match_df.empty:
+        logger.critical(f"No data in '{MATCHING_SHEET_NAME}'")
         return
+    logger.info(f"Loaded {len(crm_df)} CRM rows, {len(match_df)} matching rows")
 
-    logging.info("Normalisiere Daten für den Vergleich...")
-    for df in [crm_df, matching_df]:
+    # Normalization
+    for df, label in [(crm_df, 'CRM'), (match_df, 'Matching')]:
         df['normalized_name'] = df['CRM Name'].astype(str).apply(normalize_company_name)
         df['normalized_domain'] = df['CRM Website'].astype(str).apply(simple_normalize_url)
         df['CRM Ort'] = df['CRM Ort'].astype(str).str.lower().str.strip()
         df['CRM Land'] = df['CRM Land'].astype(str).str.lower().str.strip()
-        # Blocking Key: Das erste Wort des normalisierten Namens
+        # Blocking key: the first word of the normalized name
         df['block_key'] = df['normalized_name'].apply(lambda x: x.split()[0] if x else None)
+        logger.debug(f"{label} sample after normalization: {df.iloc[0][['normalized_name', 'normalized_domain', 'block_key']].to_dict()}")
 
-    # --- NEUE, SCHNELLE BLOCKING-STRATEGIE ---
-    logging.info("Erstelle Index für CRM-Daten zur Beschleunigung...")
+    # Build the blocking index
     crm_index = {}
-    for index, row in crm_df.iterrows():
+    for _, row in crm_df.iterrows():
         key = row['block_key']
-        if key:
-            if key not in crm_index:
-                crm_index[key] = []
-            crm_index[key].append(row)
-
-    logging.info("Starte Matching-Prozess...")
-    results = []
-    total_matches = len(matching_df)
-
-    for index, match_row in matching_df.iterrows():
-        best_score = 0
-        best_match_name = ""
-
-        logging.info(f"Prüfe {index + 1}/{total_matches}: {match_row['CRM Name']}...")
+        if not key: continue
+        crm_index.setdefault(key, []).append(row)
+    logger.info(f"Blocking index built: {len(crm_index)} keys")
 
-        # Finde den Block von Kandidaten
-        block_key = match_row['block_key']
-        candidates = crm_index.get(block_key, [])
-
-        # Führe den teuren Vergleich nur für die Kandidaten in diesem Block durch
+    # Matching
+    results = []
+    total = len(match_df)
+    for i, match_row in match_df.iterrows():
+        key = match_row['block_key']
+        candidates = crm_index.get(key, [])
+        logger.info(f"Checking {i + 1}/{total}: {match_row['CRM Name']} (key='{key}') -> {len(candidates)} candidates")
+
+        if not candidates:
+            results.append({'Potenzieller Treffer im CRM': '', 'Ähnlichkeits-Score': 0})
+            continue
+
+        # Collect scores for all candidates in the block
+        scored = []
         for crm_row in candidates:
             score = calculate_similarity(match_row, crm_row)
-            if score > best_score:
-                best_score = score
-                best_match_name = crm_row['CRM Name']
-
+            scored.append((crm_row['CRM Name'], score))
+        # Log the top 3 for debugging
+        top3 = sorted(scored, key=lambda x: x[1], reverse=True)[:3]
+        logger.debug(f"  Top 3 candidates: {top3}")
+
+        # Pick the best hit
+        best_name, best_score = max(scored, key=lambda x: x[1])
         if best_score >= SCORE_THRESHOLD:
-            results.append({'Potenzieller Treffer im CRM': best_match_name, 'Ähnlichkeits-Score': best_score})
+            results.append({'Potenzieller Treffer im CRM': best_name, 'Ähnlichkeits-Score': best_score})
+            logger.info(f"  --> Match: '{best_name}' with score {best_score}")
         else:
-            # Wenn nichts im Block gefunden wurde, trotzdem den besten Treffer (kann 0 sein) anzeigen
-            results.append({'Potenzieller Treffer im CRM': '' if not best_match_name else best_match_name, 'Ähnlichkeits-Score': best_score})
+            results.append({'Potenzieller Treffer im CRM': '', 'Ähnlichkeits-Score': best_score})
+            logger.info(f"  --> No match (highest score {best_score})")
 
-    logging.info("Matching abgeschlossen. Schreibe Ergebnisse zurück ins Sheet...")
-    result_df = pd.DataFrame(results)
-
-    # Die ursprünglichen Spalten aus matching_df für die Ausgabe nehmen
-    output_df = matching_df[['CRM Name', 'CRM Website', 'CRM Ort', 'CRM Land']].copy()
-    output_df = pd.concat([output_df.reset_index(drop=True), result_df], axis=1)
-
-    data_to_write = [output_df.columns.values.tolist()] + output_df.values.tolist()
-
-    success = sheet_handler.clear_and_write_data(MATCHING_SHEET_NAME, data_to_write)
-    if success:
-        logging.info(f"Ergebnisse erfolgreich in das Tabellenblatt '{MATCHING_SHEET_NAME}' geschrieben.")
+    # Write the results back to the sheet
+    out_df = pd.DataFrame(results)
+    output = pd.concat([match_df[['CRM Name', 'CRM Website', 'CRM Ort', 'CRM Land']].reset_index(drop=True), out_df], axis=1)
+    data = [output.columns.tolist()] + output.values.tolist()
+    ok = sheet_handler.clear_and_write_data(MATCHING_SHEET_NAME, data)
+    if ok:
+        logger.info("Results written successfully")
     else:
-        logging.error("FEHLER beim Schreiben der Ergebnisse ins Google Sheet.")
+        logger.error("Error writing to the Google Sheet")
 
-if __name__ == "__main__":
-    main()
\ No newline at end of file
+if __name__ == '__main__':
+    main()
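
Notes:

The handler setup in this patch runs at import time, so importing duplicate_checker a second time (for example from a test) would attach another pair of handlers and duplicate every message. A minimal idempotent variant, assuming the same console/file split as the patch; the helper name get_logger is ours, not part of the repo:

import logging
import os

def get_logger(name: str, log_path: str) -> logging.Logger:
    """Console at INFO, file at DEBUG - the patch's layout, but safe to call repeatedly."""
    logger = logging.getLogger(name)
    if logger.handlers:  # already configured: do not attach a second pair of handlers
        return logger
    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(logging.Formatter("%(asctime)s - %(levelname)-8s - %(message)s"))
    logger.addHandler(ch)
    log_dir = os.path.dirname(log_path)
    if log_dir:  # create the log directory only if the path has one
        os.makedirs(log_dir, exist_ok=True)
    fh = logging.FileHandler(log_path, mode='a', encoding='utf-8')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(logging.Formatter("%(asctime)s - %(levelname)-8s - %(name)s - %(message)s"))
    logger.addHandler(fh)
    return logger

logger = get_logger("duplicate_checker", "Log/duplicate_check.log")
logger.debug("goes to the file only")
logger.info("goes to console and file")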
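For reference, the weights in calculate_similarity combine to at most 190: 100 for an exact domain match, up to 70 from the fuzzy name (token_set_ratio scaled by 0.7), and 20 for matching city and country, against a SCORE_THRESHOLD of 80. A worked example with made-up records, replicating the scoring logic above (only thefuzz is needed):

from thefuzz import fuzz

a = {'normalized_name': 'acme gmbh', 'normalized_domain': 'acme.de',
     'CRM Ort': 'berlin', 'CRM Land': 'deutschland'}
b = {'normalized_name': 'acme', 'normalized_domain': 'acme.de',
     'CRM Ort': 'berlin', 'CRM Land': 'deutschland'}

score = 0
if a['normalized_domain'] and a['normalized_domain'] == b['normalized_domain']:
    score += 100  # exact domain match
if a['normalized_name'] and b['normalized_name']:
    # token_set_ratio is 100 here because 'acme' is a token subset of 'acme gmbh'
    score += fuzz.token_set_ratio(a['normalized_name'], b['normalized_name']) * 0.7
if a['CRM Ort'] and a['CRM Ort'] == b['CRM Ort'] and a['CRM Land'] == b['CRM Land']:
    score += 20  # same city and country
print(round(score))  # 190 - well above SCORE_THRESHOLD = 80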
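The blocking index trades recall for speed: a row is only ever compared against CRM rows whose normalized name begins with the same first word, so a typo in that first word (or a dropped leading article) means the pair is never scored at all. A quick illustration with invented rows:

rows = [
    {'CRM Name': 'Acme GmbH', 'normalized_name': 'acme gmbh'},
    {'CRM Name': 'Acme AG',   'normalized_name': 'acme ag'},
    {'CRM Name': 'Beta Ltd',  'normalized_name': 'beta ltd'},
]

crm_index = {}
for row in rows:
    key = row['normalized_name'].split()[0] if row['normalized_name'] else None
    if not key:
        continue
    crm_index.setdefault(key, []).append(row)

print(sorted(crm_index))       # ['acme', 'beta']
print(len(crm_index['acme']))  # 2 - a candidate keyed 'acme' is scored against exactly these two rows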