duplicate_checker.py aktualisiert

This commit is contained in:
2025-08-03 08:27:12 +00:00
parent f811eafda0
commit 434908e3ec

View File

@@ -1,10 +1,10 @@
# duplicate_checker.py (v2.3 - Intelligent Blocking) # duplicate_checker.py (v2.8 - Vollständiges Logging & Maximum Debugging)
import logging import logging
import pandas as pd import pandas as pd
from thefuzz import fuzz from thefuzz import fuzz
from config import Config from config import Config
from helpers import normalize_company_name, simple_normalize_url from helpers import normalize_company_name, simple_normalize_url, create_log_filename
from google_sheet_handler import GoogleSheetHandler from google_sheet_handler import GoogleSheetHandler
from collections import defaultdict from collections import defaultdict
import time import time
@@ -12,16 +12,43 @@ import time
# --- Configuration ---
CRM_SHEET_NAME = "CRM_Accounts"
MATCHING_SHEET_NAME = "Matching_Accounts"
# A CRM candidate is only reported as a potential match when its total
# score reaches this value.
SCORE_THRESHOLD = 85
# --- Full logging setup (console and log file) ---
LOG_LEVEL = logging.DEBUG if Config.DEBUG else logging.INFO
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)s - %(message)s'

# Configure the root logger.
root_logger = logging.getLogger()
root_logger.setLevel(LOG_LEVEL)

# Remove handlers that are already attached to avoid duplicated output.
# Iterate over a copy because removeHandler mutates the handler list.
for handler in root_logger.handlers[:]:
    root_logger.removeHandler(handler)

# Add the console handler.
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(logging.Formatter(LOG_FORMAT))
root_logger.addHandler(stream_handler)

# Add the file handler, but only when create_log_filename returned a
# truthy path.
log_file_path = create_log_filename("duplicate_check")
if log_file_path:
    file_handler = logging.FileHandler(log_file_path, mode='a', encoding='utf-8')
    file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
    root_logger.addHandler(file_handler)

logger = logging.getLogger(__name__)  # logger for this module

# --- The actual program code starts here ---
# Generic words (legal forms, industry terms, country names) that are ignored
# when blocking keys are built from a company name.
BLOCKING_STOP_WORDS = {
    'gmbh', 'ag', 'co', 'kg', 'se', 'holding', 'gruppe', 'industries', 'systems',
    'technik', 'service', 'services', 'solutions', 'management', 'international', 'und',
    'germany', 'deutschland', 'gbr', 'mbh', 'company', 'limited', 'logistics',
    'construction', 'products', 'group', 'b-v',
}
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def calculate_similarity_details(record1, record2): def calculate_similarity_details(record1, record2):
"""Berechnet einen gewichteten Ähnlichkeits-Score und gibt die Details zurück.""" """Berechnet einen gewichteten Ähnlichkeits-Score und gibt die Details zurück."""
scores = {'name': 0, 'location': 0, 'domain': 0} scores = {'name': 0, 'location': 0, 'domain': 0}
@@ -40,43 +67,33 @@ def calculate_similarity_details(record1, record2):
return {'total': total_score, 'details': scores} return {'total': total_score, 'details': scores}
def create_blocking_keys(name):
    """Build blocking keys from all significant words of a company name.

    A word is significant when it has at least three characters and is not
    contained in ``BLOCKING_STOP_WORDS``.

    Args:
        name: Whitespace-separated company name; may be empty or ``None``.

    Returns:
        list[str]: The unique significant words in order of first
        appearance, or an empty list when ``name`` is falsy.
    """
    if not name:
        return []
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is stable across runs (iterating a set of strings is not).
    # The cheap length test runs before the stop-word lookup.
    return [
        word
        for word in dict.fromkeys(name.split())
        if len(word) >= 3 and word not in BLOCKING_STOP_WORDS
    ]
def main(): def main():
logging.info("Starte den Duplikats-Check (v2.3 mit Intelligent Blocking)...") start_time = time.time()
logger.info(f"===== Skript gestartet: Modus 'duplicate_check' v2.8 =====")
logger.info(f"Logdatei: {log_file_path}")
# ... (Initialisierung des GoogleSheetHandler bleibt gleich) ...
try: try:
sheet_handler = GoogleSheetHandler() sheet_handler = GoogleSheetHandler()
except Exception as e: except Exception as e:
logging.critical(f"FEHLER bei Initialisierung: {e}") logger.critical(f"FEHLER bei Initialisierung: {e}")
return return
logging.info(f"Lade Master-Daten aus '{CRM_SHEET_NAME}'...") logger.info(f"Lade Master-Daten aus '{CRM_SHEET_NAME}'...")
crm_df = sheet_handler.get_sheet_as_dataframe(CRM_SHEET_NAME) crm_df = sheet_handler.get_sheet_as_dataframe(CRM_SHEET_NAME)
if crm_df is None or crm_df.empty: return if crm_df is None or crm_df.empty: return
logging.info(f"Lade zu prüfende Daten aus '{MATCHING_SHEET_NAME}'...") logger.info(f"Lade zu prüfende Daten aus '{MATCHING_SHEET_NAME}'...")
matching_df = sheet_handler.get_sheet_as_dataframe(MATCHING_SHEET_NAME) matching_df = sheet_handler.get_sheet_as_dataframe(MATCHING_SHEET_NAME)
if matching_df is None or matching_df.empty: return if matching_df is None or matching_df.empty: return
original_matching_df = matching_df.copy() original_matching_df = matching_df.copy()
logging.info("Normalisiere Daten für den Vergleich...") logger.info("Normalisiere Daten für den Vergleich...")
for df in [crm_df, matching_df]: for df in [crm_df, matching_df]:
df['normalized_name'] = df['CRM Name'].astype(str).apply(normalize_company_name) df['normalized_name'] = df['CRM Name'].astype(str).apply(normalize_company_name)
df['normalized_domain'] = df['CRM Website'].astype(str).apply(simple_normalize_url) df['normalized_domain'] = df['CRM Website'].astype(str).apply(simple_normalize_url)
@@ -84,32 +101,51 @@ def main():
df['CRM Land'] = df['CRM Land'].astype(str).str.lower().str.strip() df['CRM Land'] = df['CRM Land'].astype(str).str.lower().str.strip()
df['block_keys'] = df['normalized_name'].apply(create_blocking_keys) df['block_keys'] = df['normalized_name'].apply(create_blocking_keys)
logging.info("Erstelle Index für CRM-Daten zur Beschleunigung...") logger.info("Erstelle Index für CRM-Daten zur Beschleunigung...")
crm_index = defaultdict(list) crm_index = defaultdict(list)
crm_records = crm_df.to_dict('records') crm_records = crm_df.to_dict('records')
for record in crm_records: for record in crm_records:
for key in record['block_keys']: for key in record['block_keys']:
crm_index[key].append(record) crm_index[key].append(record)
logging.info("Starte Matching-Prozess...") logger.info("Starte Matching-Prozess...")
results = [] results = []
for match_record in matching_df.to_dict('records'): for match_record in matching_df.to_dict('records'):
best_score_info = {'total': 0, 'details': {'name': 0, 'location': 0, 'domain': 0}} best_score_info = {'total': -1, 'details': {'name': 0, 'location': 0, 'domain': 0}}
best_match_name = "" best_match_name = ""
logging.info(f"Prüfe: {match_record['CRM Name']}...") logger.info(f"--- Prüfe: '{match_record.get('CRM Name', 'N/A')}' ---")
logger.debug(f" [Normalisiert: '{match_record.get('normalized_name')}', Domain: '{match_record.get('normalized_domain')}', Keys: {match_record.get('block_keys')}]")
candidate_pool = {} candidate_pool = {}
for key in match_record['block_keys']: for key in match_record['block_keys']:
for crm_record in crm_index.get(key, []): candidates_from_key = crm_index.get(key, [])
candidate_pool[crm_record['CRM Name']] = crm_record if candidates_from_key:
logger.debug(f" -> Block-Key '{key}' gefunden. {len(candidates_from_key)} Kandidaten hinzugefügt.")
for crm_record in candidates_from_key:
candidate_pool[crm_record['CRM Name']] = crm_record
if not candidate_pool:
logger.debug(" -> Keine Kandidaten im Index gefunden. Überspringe Vergleich.")
results.append({
'Potenzieller Treffer im CRM': "", 'Score (Gesamt)': 0, 'Score (Name)': 0,
'Bonus (Standort)': 0, 'Bonus (Domain)': 0
})
continue
logger.debug(f" -> Vergleiche mit insgesamt {len(candidate_pool)} einzigartigen Kandidaten.")
for crm_record in candidate_pool.values(): for crm_record in candidate_pool.values():
score_info = calculate_similarity_details(match_record, crm_record) score_info = calculate_similarity_details(match_record, crm_record)
if score_info['total'] > 50: # Logge nur Vergleiche mit einem minimalen Score, um das Log nicht zu überfluten
logger.debug(f" - Kandidat: '{crm_record.get('CRM Name', 'N/A')}' -> Score: {score_info['total']} (Details: {score_info['details']})")
if score_info['total'] > best_score_info['total']: if score_info['total'] > best_score_info['total']:
best_score_info = score_info best_score_info = score_info
best_match_name = crm_record['CRM Name'] best_match_name = crm_record.get('CRM Name', 'N/A')
logger.info(f" --> Neuer bester Treffer: '{best_match_name}' mit Score {best_score_info['total']}")
results.append({ results.append({
'Potenzieller Treffer im CRM': best_match_name if best_score_info['total'] >= SCORE_THRESHOLD else "", 'Potenzieller Treffer im CRM': best_match_name if best_score_info['total'] >= SCORE_THRESHOLD else "",
@@ -122,7 +158,6 @@ def main():
logging.info("Matching abgeschlossen. Schreibe Ergebnisse zurück ins Sheet...") logging.info("Matching abgeschlossen. Schreibe Ergebnisse zurück ins Sheet...")
result_df = pd.DataFrame(results) result_df = pd.DataFrame(results)
# Originalspalten aus der Kopie nehmen, um saubere Ausgabe zu garantieren
output_df = pd.concat([original_matching_df.reset_index(drop=True), result_df], axis=1) output_df = pd.concat([original_matching_df.reset_index(drop=True), result_df], axis=1)
data_to_write = [output_df.columns.values.tolist()] + output_df.values.tolist() data_to_write = [output_df.columns.values.tolist()] + output_df.values.tolist()
@@ -133,5 +168,10 @@ def main():
else: else:
logging.error("FEHLER beim Schreiben der Ergebnisse ins Google Sheet.") logging.error("FEHLER beim Schreiben der Ergebnisse ins Google Sheet.")
end_time = time.time()
logger.info(f"Gesamtdauer des Duplikats-Checks: {end_time - start_time:.2f} Sekunden.")
logger.info(f"===== Skript beendet: Modus 'duplicate_check' =====")
# Run the duplicate check only when the file is executed as a script.
if __name__ == "__main__":
    main()