duplicate_checker.py aktualisiert

This commit is contained in:
2025-08-06 05:45:15 +00:00
parent 273aa04179
commit 223f719e38

View File

@@ -2,6 +2,7 @@ import os
import re import re
import logging import logging
import pandas as pd import pandas as pd
import numpy as np
import recordlinkage import recordlinkage
from rapidfuzz import fuzz from rapidfuzz import fuzz
from google_sheet_handler import GoogleSheetHandler from google_sheet_handler import GoogleSheetHandler
@@ -64,7 +65,8 @@ def normalize_domain(url: str) -> str:
def main(): def main():
logger.info("Starte den Duplikats-Check (v2.0 mit Kandidaten-Logging)...") logger.info("Starte den Duplikats-Check (v2.0 mit korrekten Missing-Werten)...")
# Initialize GoogleSheetHandler
try: try:
sheet_handler = GoogleSheetHandler() sheet_handler = GoogleSheetHandler()
logger.info("GoogleSheetHandler initialisiert") logger.info("GoogleSheetHandler initialisiert")
@@ -72,7 +74,7 @@ def main():
logger.critical(f"FEHLER bei Initialisierung des GoogleSheetHandler: {e}") logger.critical(f"FEHLER bei Initialisierung des GoogleSheetHandler: {e}")
return return
# Daten laden # Load data
crm_df = sheet_handler.get_sheet_as_dataframe(CRM_SHEET_NAME) crm_df = sheet_handler.get_sheet_as_dataframe(CRM_SHEET_NAME)
match_df = sheet_handler.get_sheet_as_dataframe(MATCHING_SHEET_NAME) match_df = sheet_handler.get_sheet_as_dataframe(MATCHING_SHEET_NAME)
if crm_df is None or crm_df.empty or match_df is None or match_df.empty: if crm_df is None or crm_df.empty or match_df is None or match_df.empty:
@@ -80,73 +82,73 @@ def main():
return return
logger.info(f"{len(crm_df)} CRM-Zeilen, {len(match_df)} Matching-Zeilen geladen") logger.info(f"{len(crm_df)} CRM-Zeilen, {len(match_df)} Matching-Zeilen geladen")
# Normalisierung # Normalize fields
for df, label in [(crm_df, 'CRM'), (match_df, 'Matching')]: for df, label in [(crm_df, 'CRM'), (match_df, 'Matching')]:
df['norm_name'] = df['CRM Name'].fillna('').apply(normalize_company_name) df['norm_name'] = df['CRM Name'].fillna('').apply(normalize_company_name)
df['norm_domain'] = df['CRM Website'].fillna('').apply(normalize_domain) df['norm_domain'] = df['CRM Website'].fillna('').apply(normalize_domain)
df['city'] = df['CRM Ort'].fillna('').apply(lambda x: str(x).casefold().strip()) df['city'] = df['CRM Ort'].fillna('').apply(lambda x: str(x).casefold().strip())
logger.debug(f"{label}-Daten normalisiert: Beispiel: {df.iloc[0][['norm_name','norm_domain','city']].to_dict()}") # Replace empty strings with NaN so they aren't considered matches
df['norm_domain'].replace('', np.nan, inplace=True)
df['city'].replace('', np.nan, inplace=True)
logger.debug(f"{label}-Daten normalisiert: Beispiel: {{'norm_name': df.iloc[0]['norm_name'], 'norm_domain': df.iloc[0]['norm_domain'], 'city': df.iloc[0]['city']}}")
# Blocking # Blocking per Domain
indexer = recordlinkage.Index() indexer = recordlinkage.Index()
indexer.block('norm_domain') indexer.block('norm_domain')
candidate_pairs = indexer.index(crm_df, match_df) candidate_pairs = indexer.index(crm_df, match_df)
logger.info(f"Blocking abgeschlossen: {len(candidate_pairs)} Kandidatenpaare") logger.info(f"Blocking abgeschlossen: {len(candidate_pairs)} Kandidatenpaare")
# Compare # Vergleichsregeln definieren
compare = recordlinkage.Compare() compare = recordlinkage.Compare()
compare.exact('norm_domain', 'norm_domain', label='domain') compare.exact('norm_domain', 'norm_domain', label='domain', missing_value=0)
compare.string('norm_name', 'norm_name', method='jarowinkler', label='name_sim') compare.string('norm_name', 'norm_name', method='jarowinkler', label='name_sim')
compare.exact('city', 'city', label='city') compare.exact('city', 'city', label='city', missing_value=0)
features = compare.compute(candidate_pairs, crm_df, match_df) features = compare.compute(candidate_pairs, crm_df, match_df)
logger.debug(f"Features berechnet: {features.head()}\n...") logger.debug(f"Features berechnet: {features.head()}\n...")
# Score # Score berechnen
features['score'] = (WEIGHTS['domain']*features['domain'] + features['score'] = (
WEIGHTS['name']*features['name_sim'] + WEIGHTS['domain'] * features['domain'] +
WEIGHTS['city']*features['city']) WEIGHTS['name'] * features['name_sim'] +
WEIGHTS['city'] * features['city']
)
logger.info("Scores berechnet") logger.info("Scores berechnet")
# Per Match Logging # Detailed per-match logging
results = [] results = []
crm_df_idx = crm_df.reset_index() crm_idx_map = crm_df.reset_index()
for match_idx, group in features.reset_index().groupby('level_1'): for match_idx, group in features.reset_index().groupby('level_1'):
logger.info(f"--- Prüfe Matching-Zeile {match_idx} ---") logger.info(f"--- Prüfe Matching-Zeile {match_idx} ---")
df_block = group.sort_values('score', ascending=False).copy() df_block = group.sort_values('score', ascending=False).copy()
# Enrich with CRM fields # Enrich with CRM info
df_block['CRM Name'] = df_block['level_0'].map(crm_df_idx.set_index('index')['CRM Name']) df_block['CRM Name'] = df_block['level_0'].map(crm_idx_map.set_index('index')['CRM Name'])
df_block['CRM Website'] = df_block['level_0'].map(crm_df_idx.set_index('index')['CRM Website']) df_block['CRM Website'] = df_block['level_0'].map(crm_idx_map.set_index('index')['CRM Website'])
df_block['CRM Ort'] = df_block['level_0'].map(crm_df_idx.set_index('index')['CRM Ort']) df_block['CRM Ort'] = df_block['level_0'].map(crm_idx_map.set_index('index')['CRM Ort'])
# Log top candidates logger.debug("Kandidaten (Index, Score, Domain, Name_sim, City, CRM Name):")
logger.debug("Kandidaten (CRM_Index, Score, Domain, Name_sim, City, CRM Name):")
for _, row in df_block.iterrows(): for _, row in df_block.iterrows():
logger.debug(f" [{int(row['level_0'])}] score={row['score']:.3f} dom={row['domain']} name_sim={row['name_sim']:.3f} city={row['city']} => {row['CRM Name']}") logger.debug(f" [{int(row['level_0'])}] score={row['score']:.3f} dom={row['domain']} name={row['name_sim']:.3f} city={row['city']} => {row['CRM Name']}")
top = df_block.iloc[0] top = df_block.iloc[0]
crm_idx = top['level_0'] if top['score'] >= SCORE_THRESHOLD else None crm_idx = top['level_0'] if top['score'] >= SCORE_THRESHOLD else None
if crm_idx is not None: if crm_idx is not None:
logger.info(f" --> Match: CRM-Index {int(crm_idx)} ({top['CRM Name']}) mit Score {top['score']:.2f}") logger.info(f" --> Match: {int(crm_idx)} ({top['CRM Name']}) mit Score {top['score']:.2f}")
else: else:
logger.info(f" --> Kein Match (höchster Score {top['score']:.2f})") logger.info(f" --> Kein Match (höchster Score {top['score']:.2f})")
results.append((crm_idx, match_idx, top['score'])) results.append((crm_idx, match_idx, top['score']))
# Prepare output # Prepare output
match_df_idx = match_df.reset_index() match_idx_map = match_df.reset_index()
output = match_df_idx[['CRM Name','CRM Website','CRM Ort','CRM Land']].copy() output = match_idx_map[['CRM Name','CRM Website','CRM Ort','CRM Land']].copy()
output['Matched CRM Name'] = '' output[['Matched CRM Name','Matched CRM Website','Matched CRM Ort','Matched CRM Land','Score']] = ''
output['Matched CRM Website'] = ''
output['Matched CRM Ort'] = ''
output['Matched CRM Land'] = ''
output['Score'] = 0.0
for crm_idx, match_idx, score in results: for crm_idx, match_idx, score in results:
if crm_idx is not None: if crm_idx is not None:
crm_row = crm_df_idx[crm_df_idx['index']==crm_idx].iloc[0] crm_row = crm_idx_map[crm_idx_map['index']==crm_idx].iloc[0]
output.at[match_idx, 'Matched CRM Name'] = crm_row['CRM Name'] output.at[match_idx, 'Matched CRM Name'] = crm_row['CRM Name']
output.at[match_idx, 'Matched CRM Website'] = crm_row['CRM Website'] output.at[match_idx, 'Matched CRM Website'] = crm_row['CRM Website']
output.at[match_idx, 'Matched CRM Ort'] = crm_row['CRM Ort'] output.at[match_idx, 'Matched CRM Ort'] = crm_row['CRM Ort']
output.at[match_idx, 'Matched CRM Land'] = crm_row['CRM Land'] output.at[match_idx, 'Matched CRM Land'] = crm_row['CRM Land']
output.at[match_idx, 'Score'] = round(score,3) output.at[match_idx, 'Score'] = round(score,3)
# Write back # Zurückschreiben ins Google Sheet
data = [output.columns.tolist()] + output.values.tolist() data = [output.columns.tolist()] + output.values.tolist()
success = sheet_handler.clear_and_write_data(MATCHING_SHEET_NAME, data) success = sheet_handler.clear_and_write_data(MATCHING_SHEET_NAME, data)
if success: if success: