duplicate_checker.py aktualisiert

This commit is contained in:
2025-08-03 08:31:02 +00:00
parent 434908e3ec
commit 5ca44ac036

View File

@@ -1,64 +1,39 @@
# duplicate_checker.py (v2.8 - Vollständiges Logging & Maximum Debugging) # duplicate_checker.py (v3.0 - Back to Basics: Optimized Brute-Force)
import logging import logging
import pandas as pd import pandas as pd
from thefuzz import fuzz from thefuzz import fuzz
from config import Config from config import Config
from helpers import normalize_company_name, simple_normalize_url, create_log_filename from helpers import normalize_company_name, simple_normalize_url
from google_sheet_handler import GoogleSheetHandler from google_sheet_handler import GoogleSheetHandler
from collections import defaultdict
import time import time
# --- Konfiguration --- # --- Konfiguration ---
CRM_SHEET_NAME = "CRM_Accounts" CRM_SHEET_NAME = "CRM_Accounts"
MATCHING_SHEET_NAME = "Matching_Accounts" MATCHING_SHEET_NAME = "Matching_Accounts"
SCORE_THRESHOLD = 85 SCORE_THRESHOLD = 85 # Treffer unter diesem Wert werden nicht als "potenzieller Treffer" angezeigt
# --- WICHTIG: VOLLSTÄNDIGES LOGGING SETUP --- # WICHTIG: Logging Setup für detaillierte Ausgaben
LOG_LEVEL = logging.DEBUG if Config.DEBUG else logging.INFO logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)-8s - %(name)s - %(message)s')
LOG_FORMAT = '%(asctime)s - %(levelname)-8s - %(name)s - %(message)s' logger = logging.getLogger(__name__)
# Root-Logger konfigurieren
root_logger = logging.getLogger()
root_logger.setLevel(LOG_LEVEL)
# Bestehende Handler entfernen, um Dopplung zu vermeiden
for handler in root_logger.handlers[:]:
root_logger.removeHandler(handler)
# Konsole-Handler hinzufügen
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(logging.Formatter(LOG_FORMAT))
root_logger.addHandler(stream_handler)
# File-Handler hinzufügen
log_file_path = create_log_filename("duplicate_check")
if log_file_path:
file_handler = logging.FileHandler(log_file_path, mode='a', encoding='utf-8')
file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
root_logger.addHandler(file_handler)
logger = logging.getLogger(__name__) # Logger für dieses Modul holen
# --- Der eigentliche Code beginnt hier ---
BLOCKING_STOP_WORDS = {
'gmbh', 'ag', 'co', 'kg', 'se', 'holding', 'gruppe', 'industries', 'systems',
'technik', 'service', 'services', 'solutions', 'management', 'international', 'und',
'germany', 'deutschland', 'gbr', 'mbh', 'company', 'limited', 'logistics',
'construction', 'products', 'group', 'b-v'
}
def calculate_similarity_details(record1, record2): def calculate_similarity_details(record1, record2):
"""Berechnet einen gewichteten Ähnlichkeits-Score und gibt die Details zurück.""" """
Berechnet einen gewichteten Ähnlichkeits-Score und gibt die Details zurück.
"""
scores = {'name': 0, 'location': 0, 'domain': 0} scores = {'name': 0, 'location': 0, 'domain': 0}
# Domain-Match (höchste Priorität, 100 Punkte)
if record1.get('normalized_domain') and record1['normalized_domain'] != 'k.a.' and record1['normalized_domain'] == record2.get('normalized_domain'): if record1.get('normalized_domain') and record1['normalized_domain'] != 'k.a.' and record1['normalized_domain'] == record2.get('normalized_domain'):
scores['domain'] = 100 scores['domain'] = 100
# Namensähnlichkeit (hohe 85% Gewichtung)
if record1.get('normalized_name') and record2.get('normalized_name'): if record1.get('normalized_name') and record2.get('normalized_name'):
# token_set_ratio ist robust gegen zusätzliche Wörter wie "Holding" oder "Gruppe"
scores['name'] = round(fuzz.token_set_ratio(record1['normalized_name'], record2['normalized_name']) * 0.85) scores['name'] = round(fuzz.token_set_ratio(record1['normalized_name'], record2['normalized_name']) * 0.85)
# Standort-Bonus (20 Punkte)
if record1.get('CRM Ort') and record1['CRM Ort'] == record2.get('CRM Ort'): if record1.get('CRM Ort') and record1['CRM Ort'] == record2.get('CRM Ort'):
if record1.get('CRM Land') and record1['CRM Land'] == record2.get('CRM Land'): if record1.get('CRM Land') and record1['CRM Land'] == record2.get('CRM Land'):
scores['location'] = 20 scores['location'] = 20
@@ -66,18 +41,12 @@ def calculate_similarity_details(record1, record2):
total_score = sum(scores.values()) total_score = sum(scores.values())
return {'total': total_score, 'details': scores} return {'total': total_score, 'details': scores}
def create_blocking_keys(name):
"""Erstellt Blocking Keys aus allen signifikanten Wörtern eines Namens."""
if not name:
return []
significant_words = {word for word in name.split() if word not in BLOCKING_STOP_WORDS and len(word) >= 3}
return list(significant_words)
def main(): def main():
start_time = time.time() start_time = time.time()
logger.info(f"===== Skript gestartet: Modus 'duplicate_check' v2.8 =====") logger.info("Starte den Duplikats-Check (v3.0 - Back to Basics)...")
logger.info(f"Logdatei: {log_file_path}")
# ... (Initialisierung und Laden der Daten bleibt gleich) ...
try: try:
sheet_handler = GoogleSheetHandler() sheet_handler = GoogleSheetHandler()
except Exception as e: except Exception as e:
@@ -99,53 +68,32 @@ def main():
df['normalized_domain'] = df['CRM Website'].astype(str).apply(simple_normalize_url) df['normalized_domain'] = df['CRM Website'].astype(str).apply(simple_normalize_url)
df['CRM Ort'] = df['CRM Ort'].astype(str).str.lower().str.strip() df['CRM Ort'] = df['CRM Ort'].astype(str).str.lower().str.strip()
df['CRM Land'] = df['CRM Land'].astype(str).str.lower().str.strip() df['CRM Land'] = df['CRM Land'].astype(str).str.lower().str.strip()
df['block_keys'] = df['normalized_name'].apply(create_blocking_keys)
logger.info("Erstelle Index für CRM-Daten zur Beschleunigung...")
crm_index = defaultdict(list)
crm_records = crm_df.to_dict('records') crm_records = crm_df.to_dict('records')
for record in crm_records: matching_records = matching_df.to_dict('records')
for key in record['block_keys']:
crm_index[key].append(record)
logger.info("Starte Matching-Prozess...") logger.info(f"Starte Matching-Prozess: {len(matching_records)} Einträge werden mit {len(crm_records)} CRM-Einträgen verglichen...")
results = [] results = []
for match_record in matching_df.to_dict('records'): for i, match_record in enumerate(matching_records):
best_score_info = {'total': -1, 'details': {'name': 0, 'location': 0, 'domain': 0}} best_score_info = {'total': -1, 'details': {'name': 0, 'location': 0, 'domain': 0}}
best_match_name = "" best_match_name = ""
logger.info(f"--- Prüfe: '{match_record.get('CRM Name', 'N/A')}' ---") logger.info(f"--- Prüfe {i + 1}/{len(matching_records)}: '{match_record.get('CRM Name', 'N/A')}' ---")
logger.debug(f" [Normalisiert: '{match_record.get('normalized_name')}', Domain: '{match_record.get('normalized_domain')}', Keys: {match_record.get('block_keys')}]")
candidate_pool = {} # BRUTE-FORCE: Vergleiche mit jedem einzelnen CRM-Eintrag
for key in match_record['block_keys']: for crm_record in crm_records:
candidates_from_key = crm_index.get(key, [])
if candidates_from_key:
logger.debug(f" -> Block-Key '{key}' gefunden. {len(candidates_from_key)} Kandidaten hinzugefügt.")
for crm_record in candidates_from_key:
candidate_pool[crm_record['CRM Name']] = crm_record
if not candidate_pool:
logger.debug(" -> Keine Kandidaten im Index gefunden. Überspringe Vergleich.")
results.append({
'Potenzieller Treffer im CRM': "", 'Score (Gesamt)': 0, 'Score (Name)': 0,
'Bonus (Standort)': 0, 'Bonus (Domain)': 0
})
continue
logger.debug(f" -> Vergleiche mit insgesamt {len(candidate_pool)} einzigartigen Kandidaten.")
for crm_record in candidate_pool.values():
score_info = calculate_similarity_details(match_record, crm_record) score_info = calculate_similarity_details(match_record, crm_record)
if score_info['total'] > 50: # Logge nur Vergleiche mit einem minimalen Score, um das Log nicht zu überfluten # Logge jeden interessanten Vergleich (Score > 60)
if score_info['total'] > 60:
logger.debug(f" - Kandidat: '{crm_record.get('CRM Name', 'N/A')}' -> Score: {score_info['total']} (Details: {score_info['details']})") logger.debug(f" - Kandidat: '{crm_record.get('CRM Name', 'N/A')}' -> Score: {score_info['total']} (Details: {score_info['details']})")
if score_info['total'] > best_score_info['total']: if score_info['total'] > best_score_info['total']:
best_score_info = score_info best_score_info = score_info
best_match_name = crm_record.get('CRM Name', 'N/A') best_match_name = crm_record.get('CRM Name', 'N/A')
logger.info(f" --> Neuer bester Treffer: '{best_match_name}' mit Score {best_score_info['total']}")
logger.info(f" --> Bester Treffer: '{best_match_name}' mit Score {best_score_info['total']}")
results.append({ results.append({
'Potenzieller Treffer im CRM': best_match_name if best_score_info['total'] >= SCORE_THRESHOLD else "", 'Potenzieller Treffer im CRM': best_match_name if best_score_info['total'] >= SCORE_THRESHOLD else "",
@@ -170,8 +118,6 @@ def main():
end_time = time.time() end_time = time.time()
logger.info(f"Gesamtdauer des Duplikats-Checks: {end_time - start_time:.2f} Sekunden.") logger.info(f"Gesamtdauer des Duplikats-Checks: {end_time - start_time:.2f} Sekunden.")
logger.info(f"===== Skript beendet: Modus 'duplicate_check' =====")
if __name__ == "__main__": if __name__ == "__main__":
main() main()