Files
Brancheneinstufung2/duplicate_checker.py

133 lines
5.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import sys
import logging
import pandas as pd
from datetime import datetime
import tldextract
from thefuzz import fuzz
from helpers import normalize_company_name, simple_normalize_url
from google_sheet_handler import GoogleSheetHandler
# --- Configuration ---
# Worksheet that is loaded as the CRM side of the comparison.
CRM_SHEET_NAME = "CRM_Accounts"
# Worksheet whose rows are checked against the CRM side (and receive the result).
MATCHING_SHEET_NAME = "Matching_Accounts"
# Scores at or above this value are treated as an automatic match.
SCORE_THRESHOLD = 80
# Directory that receives one log file per run.
LOG_DIR = "Log"

# --- Logging setup: the log file name carries the start date and time ---
if not os.path.exists(LOG_DIR):
    os.makedirs(LOG_DIR, exist_ok=True)
now = datetime.now().strftime("%Y-%m-%d_%H-%M")
log_path = os.path.join(LOG_DIR, f"{now}_Duplicate.txt")

# The logger itself lets everything through; each handler filters on its own.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Console handler: INFO and above, compact format.
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(
    logging.Formatter(fmt="%(asctime)s - %(levelname)-8s - %(message)s")
)
logger.addHandler(ch)

# File handler: DEBUG and above, additionally records the logger name.
fh = logging.FileHandler(filename=log_path, mode="a", encoding="utf-8")
fh.setLevel(logging.DEBUG)
fh.setFormatter(
    logging.Formatter(
        fmt="%(asctime)s - %(levelname)-8s - %(name)s - %(message)s"
    )
)
logger.addHandler(fh)

logger.info(f"Logging in Datei: {log_path}")
def calculate_similarity(record1, record2):
    """Compute a weighted similarity score (0–190) between two records.

    The score is the sum of three independent signals:

    * +100 when both websites yield the same non-empty registered domain
      (compared case-insensitively),
    * up to +70 from ``fuzz.token_set_ratio`` of the two ``normalized_name``
      values (the 0–100 ratio scaled by 0.7), only when both names are
      non-empty,
    * +20 when both records carry the same non-empty ``CRM Ort`` and their
      ``CRM Land`` values are equal.

    Args:
        record1: Mapping-like record (e.g. a dict or a pandas Series) with
            the keys ``normalized_name``, ``CRM Ort`` and ``CRM Land``;
            ``CRM Website`` is optional.
        record2: The record to compare against, with the same keys.

    Returns:
        int: The total score, rounded to the nearest integer.

    Raises:
        KeyError: If ``normalized_name`` or ``CRM Ort`` is missing from a
            record (``CRM Land`` is only read once the cities match).
    """
    total = 0

    # Domain signal. The extractor is only consulted when both sides hold a
    # non-blank string: a blank URL can never produce a match, and non-string
    # cells (None, NaN) are not valid extractor input.
    url1 = record1.get('CRM Website', '')
    url2 = record2.get('CRM Website', '')
    if (isinstance(url1, str) and isinstance(url2, str)
            and url1.strip() and url2.strip()):
        dom1 = (tldextract.extract(url1.strip()).registered_domain or '').lower()
        dom2 = (tldextract.extract(url2.strip()).registered_domain or '').lower()
        if dom1 and dom1 == dom2:
            total += 100

    # Fuzzy name signal, skipped when either name is empty.
    name1 = record1['normalized_name']
    name2 = record2['normalized_name']
    if name1 and name2:
        total += fuzz.token_set_ratio(name1, name2) * 0.7

    # Exact city + country signal. The city must be non-empty: two records
    # that both lack location data are not evidence of a duplicate.
    city1 = record1['CRM Ort']
    city2 = record2['CRM Ort']
    if city1 and city1 == city2 and record1['CRM Land'] == record2['CRM Land']:
        total += 20

    return round(total)
# Columns both input sheets must provide. The Matching sheet's values for
# these columns are also written back next to the match result.
_REQUIRED_COLUMNS = ['CRM Name', 'CRM Website', 'CRM Ort', 'CRM Land']


def _block_key(normalized_name):
    """Return the first whitespace-separated token of the name, or None."""
    # split() on a whitespace-only string yields an empty list, so the
    # token list is checked rather than the raw string.
    tokens = normalized_name.split() if normalized_name else []
    return tokens[0] if tokens else None


def _prepare_frame(df, label):
    """Add the normalized matching columns to *df* in place and log a sample."""
    # Missing cells are mapped to '' before the string conversion; otherwise
    # astype(str) turns them into the literal text 'nan', which would then
    # compare equal across unrelated rows.
    df['normalized_name'] = df['CRM Name'].fillna('').astype(str).apply(normalize_company_name)
    df['CRM Website'] = df['CRM Website'].fillna('').astype(str)
    df['normalized_domain'] = df['CRM Website'].apply(simple_normalize_url)
    df['CRM Ort'] = df['CRM Ort'].fillna('').astype(str).str.lower().str.strip()
    df['CRM Land'] = df['CRM Land'].fillna('').astype(str).str.lower().str.strip()
    df['block_key'] = df['normalized_name'].apply(_block_key)
    logger.debug(f"{label}-Normierung Beispiel: {df.iloc[0][['normalized_name','normalized_domain','block_key']].to_dict()}")


def main():
    """Run the duplicate check and write the result to the Matching sheet.

    Steps:
      1. Load the CRM and Matching worksheets through ``GoogleSheetHandler``.
      2. Normalize both frames and derive a blocking key (first token of the
         normalized company name).
      3. For every Matching row, score all CRM rows sharing its blocking key
         with ``calculate_similarity`` and keep the best candidate.
      4. Replace the Matching sheet's content with the four input columns
         plus ``Match`` (best CRM name, or '' below ``SCORE_THRESHOLD``) and
         ``Score`` (best score, 0 when there was no candidate).

    Returns:
        None. Aborts early (after a critical log entry) when a sheet is
        missing/empty or lacks a required column.

    Raises:
        SystemExit: With exit code 1 when ``GoogleSheetHandler`` cannot be
            initialised.
    """
    logger.info("Starte Duplikats-Check (v2.0) mit Datum im Lognamen und verbessertem Domain-Match")
    try:
        sheet = GoogleSheetHandler()
        logger.info("GoogleSheetHandler initialisiert")
    except Exception as e:
        logger.critical(f"FEHLER beim Init GoogleSheetHandler: {e}")
        sys.exit(1)

    # Load data
    crm_df = sheet.get_sheet_as_dataframe(CRM_SHEET_NAME)
    match_df = sheet.get_sheet_as_dataframe(MATCHING_SHEET_NAME)
    if crm_df is None or crm_df.empty or match_df is None or match_df.empty:
        logger.critical("CRM- oder Matching-Daten fehlen. Abbruch.")
        return
    logger.info(f"{len(crm_df)} CRM-Datensätze, {len(match_df)} Matching-Datensätze geladen")

    # Fail with a clear message instead of a bare KeyError further down.
    for df, label in ((crm_df, 'CRM'), (match_df, 'Matching')):
        missing = [col for col in _REQUIRED_COLUMNS if col not in df.columns]
        if missing:
            logger.critical(f"{label}-Daten: Pflichtspalten fehlen {missing}. Abbruch.")
            return

    # Snapshot the columns that are written back BEFORE normalization, so
    # the sheet keeps the original spelling of city and country instead of
    # the lowercased matching values. Missing cells become '' for writing.
    output_base = match_df[_REQUIRED_COLUMNS].fillna('').reset_index(drop=True)

    # Normalization and blocking key
    for df, label in ((crm_df, 'CRM'), (match_df, 'Matching')):
        _prepare_frame(df, label)

    # Blocking index: blocking key -> list of CRM rows sharing that key
    crm_index = {}
    for _, row in crm_df.iterrows():
        key = row['block_key']
        if key:
            crm_index.setdefault(key, []).append(row)
    logger.info(f"Blocking-Index erstellt: {len(crm_index)} Keys")

    # Matching
    results = []
    total = len(match_df)
    # enumerate() supplies the progress counter; the DataFrame index label is
    # not guaranteed to be a 0-based integer position.
    for position, (_, mrow) in enumerate(match_df.iterrows(), start=1):
        key = mrow['block_key']
        cands = crm_index.get(key, [])
        logger.info(f"Prüfe {position}/{total}: '{mrow['CRM Name']}' (Key='{key}') -> {len(cands)} Kandidaten")
        if not cands:
            results.append({'Match': '', 'Score': 0})
            continue

        scored = [(crow['CRM Name'], calculate_similarity(mrow, crow)) for crow in cands]

        # Log candidates scoring within 20 points of the threshold
        relevant = [(n, s) for n, s in scored if s >= SCORE_THRESHOLD - 20]
        logger.debug(f" Relevante Kandidaten (>= {SCORE_THRESHOLD-20}): {relevant}")

        # max() keeps the first candidate on ties
        best_name, best_score = max(scored, key=lambda x: x[1])
        if best_score >= SCORE_THRESHOLD:
            results.append({'Match': best_name, 'Score': best_score})
            logger.info(f" --> Match: '{best_name}' mit Score {best_score}")
        else:
            results.append({'Match': '', 'Score': best_score})
            logger.info(f" --> Kein Match (höchster Score {best_score})")

    # Write the result back to the sheet (header row first)
    out = pd.DataFrame(results)
    output = pd.concat([output_base, out], axis=1)
    data = [output.columns.tolist()] + output.values.tolist()
    ok = sheet.clear_and_write_data(MATCHING_SHEET_NAME, data)
    if ok:
        logger.info("Ergebnisse erfolgreich geschrieben")
    else:
        logger.error("Fehler beim Schreiben ins Google Sheet")
# Run the duplicate check only when executed as a script, not on import.
if __name__ == "__main__":
    main()