# Files
# Brancheneinstufung2/duplicate_checker.py
#
# 139 lines
# 5.5 KiB
# Python
# Raw Blame History
#
# This file contains ambiguous Unicode characters
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import sys
import logging
import pandas as pd
from thefuzz import fuzz
from helpers import normalize_company_name, simple_normalize_url
from google_sheet_handler import GoogleSheetHandler
# duplicate_checker.py v2.6 (original v2.0 core + logging)
# Version: 2025-08-06_17-15

# --- Configuration ---
# Worksheet holding the CRM master records that candidates are drawn from.
CRM_SHEET_NAME = "CRM_Accounts"
# Worksheet holding the records to check; the results are written back to it.
MATCHING_SHEET_NAME = "Matching_Accounts"
# Minimum similarity score for the best candidate to be logged as a match.
SCORE_THRESHOLD = 80
LOG_DIR = "Log"
LOG_FILE = "duplicate_check_v2.6.log"

# --- Logging setup ---
# exist_ok=True already makes this a no-op when the directory exists, so a
# separate os.path.exists() check beforehand is redundant.
os.makedirs(LOG_DIR, exist_ok=True)
log_path = os.path.join(LOG_DIR, LOG_FILE)

# Global logging config: DEBUG and above goes to stdout and is appended to
# the log file.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)-8s - %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(log_path, mode='a', encoding='utf-8'),
    ],
)
logger = logging.getLogger(__name__)
logger.info(f"Starting duplicate_checker.py v2.6 | Log: {log_path}")
def calculate_similarity(record1, record2, *, domain_score=100,
                         name_weight=0.7, location_score=20):
    """Return a weighted similarity score for two company records.

    The score is the sum of three independent components:

    * exact match of a non-empty ``normalized_domain``: ``domain_score``
    * fuzzy match of ``normalized_name`` (both non-empty):
      ``fuzz.token_set_ratio(...) * name_weight``
    * exact match of non-empty ``CRM Ort`` AND non-empty ``CRM Land``:
      ``location_score``

    With the default weights the documented range is 0–190
    (100 + 0.7 * 100 + 20, given that the fuzzy ratio is on a 0–100 scale).

    Args:
        record1: Mapping-like object supporting ``.get`` (e.g. a dict or a
            pandas Series) with the keys listed above.
        record2: Second record, same shape as ``record1``.
        domain_score: Points added for an exact domain match.
        name_weight: Multiplier applied to the fuzzy name ratio.
        location_score: Points added when city and country both match.

    Returns:
        The total score rounded to the nearest integer.
    """
    total_score = 0

    # Domain: exact match only, and only if a domain is present at all.
    domain = record1.get('normalized_domain', '')
    if domain and domain == record2.get('normalized_domain', ''):
        total_score += domain_score

    # Name: fuzzy comparison, skipped when either side has no name.
    name1 = record1.get('normalized_name', '')
    name2 = record2.get('normalized_name', '')
    if name1 and name2:
        total_score += fuzz.token_set_ratio(name1, name2) * name_weight

    # Location: city AND country must both be present and identical.
    city = record1.get('CRM Ort')
    country = record1.get('CRM Land')
    if (city and city == record2.get('CRM Ort')
            and country and country == record2.get('CRM Land')):
        total_score += location_score

    return round(total_score)
def _first_token(name):
    """Return the first whitespace-separated token of *name*, or None if there is none."""
    # Guard against whitespace-only names: ' '.split() is [], so indexing [0]
    # directly would raise IndexError.
    tokens = name.split() if name else []
    return tokens[0] if tokens else None


def _normalize_dataframe(df, label):
    """Add the derived matching columns to *df* in place and log one sample row.

    Adds ``normalized_name``, ``normalized_domain`` and ``block_key`` and
    overwrites ``CRM Ort`` / ``CRM Land`` with their lower-cased, stripped
    form. *label* is only used as the prefix of the debug log line.
    """
    def as_text(column):
        # fillna('') first: astype(str) alone turns missing cells into the
        # literal strings 'nan'/'None', which would then compare equal between
        # unrelated records.
        return df[column].fillna('').astype(str)

    df['normalized_name'] = as_text('CRM Name').apply(normalize_company_name)
    df['normalized_domain'] = as_text('CRM Website').apply(simple_normalize_url)
    df['CRM Ort'] = as_text('CRM Ort').str.lower().str.strip()
    df['CRM Land'] = as_text('CRM Land').str.lower().str.strip()
    df['block_key'] = df['normalized_name'].apply(_first_token)
    logger.debug(f"{label}-Sample: {df.iloc[0][['normalized_name','normalized_domain','block_key']].to_dict()}")


def _build_blocking_index(crm_df):
    """Group the CRM rows by ``block_key``; rows without a key are left out."""
    crm_index = {}
    for _, row in crm_df.iterrows():
        key = row['block_key']
        if key:
            crm_index.setdefault(key, []).append(row)
    return crm_index


def _match_record(mrow, candidates):
    """Score *mrow* against *candidates* and return the result row as a dict."""
    if not candidates:
        return {'Potenzieller Treffer im CRM': '', 'Ähnlichkeits-Score': 0}

    scored = [(crow['CRM Name'], calculate_similarity(mrow, crow)) for crow in candidates]
    # sorted() is stable even with reverse=True, so ranked[0] is the first
    # candidate with the highest score - the same one max() would pick.
    ranked = sorted(scored, key=lambda x: x[1], reverse=True)
    # Log Top-3 only
    logger.debug(f" Top3 Kandidaten: {ranked[:3]}")
    best_name, best_score = ranked[0]

    if best_score >= SCORE_THRESHOLD:
        logger.info(f" --> Match: '{best_name}' Score={best_score}")
        return {'Potenzieller Treffer im CRM': best_name, 'Ähnlichkeits-Score': best_score}

    # Below the threshold the best candidate is still reported, with its score.
    logger.info(f" --> Kein Match (höchster Score {best_score})")
    return {'Potenzieller Treffer im CRM': best_name if best_name else '', 'Ähnlichkeits-Score': best_score}


def main():
    """Run the duplicate check and write the results back to the matching sheet.

    Steps: load both sheets, normalize them, build a blocking index over the
    CRM records keyed by the first token of the normalized name, score every
    matching record against the CRM records sharing its key, and overwrite
    the matching sheet with the input columns plus best candidate and score.

    Exits with status 1 if the sheet handler cannot be created; returns early
    (after logging) if either sheet is missing or empty.
    """
    logger.info("Starte Duplikats-Check v2.6 (Original v2.0 Kern mit Logging)")
    try:
        sheet_handler = GoogleSheetHandler()
        logger.info("GoogleSheetHandler initialisiert")
    except Exception as e:
        logger.critical(f"FEHLER Init GoogleSheetHandler: {e}")
        sys.exit(1)

    # Load data
    logger.info(f"Lade Master-Daten aus '{CRM_SHEET_NAME}'...")
    crm_df = sheet_handler.get_sheet_as_dataframe(CRM_SHEET_NAME)
    if crm_df is None or crm_df.empty:
        logger.critical(f"Keine Daten in '{CRM_SHEET_NAME}'. Abbruch.")
        return
    logger.info(f"{len(crm_df)} CRM-Datensätze geladen")

    logger.info(f"Lade Matching-Daten aus '{MATCHING_SHEET_NAME}'...")
    match_df = sheet_handler.get_sheet_as_dataframe(MATCHING_SHEET_NAME)
    if match_df is None or match_df.empty:
        logger.critical(f"Keine Daten in '{MATCHING_SHEET_NAME}'. Abbruch.")
        return
    logger.info(f"{len(match_df)} Matching-Datensätze geladen")

    # Normalize
    logger.info("Normalisiere Daten...")
    for df, label in [(crm_df, 'CRM'), (match_df, 'Matching')]:
        _normalize_dataframe(df, label)

    # Build blocking index
    logger.info("Erstelle Blocking-Index...")
    crm_index = _build_blocking_index(crm_df)
    logger.info(f"Blocking-Index erstellt mit {len(crm_index)} Keys")

    # Matching
    logger.info("Starte Matching-Prozess...")
    results = []
    total = len(match_df)
    # Count rows with enumerate rather than the index label, so the progress
    # output stays correct for any DataFrame index.
    for position, (_, mrow) in enumerate(match_df.iterrows(), start=1):
        candidates = crm_index.get(mrow['block_key'], [])
        logger.info(f"Prüfe {position}/{total}: '{mrow['CRM Name']}' -> {len(candidates)} Kandidaten")
        results.append(_match_record(mrow, candidates))

    # Write back
    logger.info("Schreibe Ergebnisse zurück ins Sheet...")
    result_df = pd.DataFrame(results)
    # NOTE(review): 'CRM Ort' / 'CRM Land' were overwritten with their
    # lower-cased form during normalization, so that is what is written back.
    # Confirm this is intended.
    output_df = match_df[['CRM Name','CRM Website','CRM Ort','CRM Land']].copy()
    # results is in row order, so align positionally after dropping the index.
    output_df = pd.concat([output_df.reset_index(drop=True), result_df], axis=1)
    data_to_write = [output_df.columns.tolist()] + output_df.values.tolist()
    success = sheet_handler.clear_and_write_data(MATCHING_SHEET_NAME, data_to_write)
    if success:
        logger.info("Ergebnisse erfolgreich geschrieben")
    else:
        logger.error("Fehler beim Schreiben ins Google Sheet")
# Run the duplicate check only when executed as a script, not on import.
if __name__ == "__main__":
    main()