# Files
# Brancheneinstufung2/duplicate_checker.py
# 2025-08-05 14:27:07 +00:00
#
# 118 lines
# 4.2 KiB
# Python
# Raw Blame History
#
# This file contains ambiguous Unicode characters
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import re
import unicodedata

import pandas as pd
import recordlinkage
from rapidfuzz import fuzz

from google_sheet_handler import GoogleSheetHandler
# --- Configuration ---
# Worksheet that holds the existing CRM accounts.
CRM_SHEET_NAME = "CRM_Accounts"
# Worksheet that holds the accounts to check; the results are written back to it.
MATCHING_SHEET_NAME = "Matching_Accounts"
# Minimum weighted score a candidate pair needs to be reported as a match.
SCORE_THRESHOLD = 0.8
# Contribution of each comparison feature to the weighted score.
WEIGHTS = dict(
    domain=0.5,
    name=0.4,
    city=0.1,
)
# --- Helper functions ---
def normalize_company_name(name: str) -> str:
    """Return a simplified, comparison-friendly form of a company name.

    Steps, in order:

    - Unicode-safe lowercasing via ``casefold`` (this already turns ``ß``
      into ``ss``).
    - German umlauts are transliterated to ``ae`` / ``oe`` / ``ue``.
    - All remaining diacritics are stripped (``é`` becomes ``e``) so that
      accented letters do not split a word.
    - Every character that is not ``a-z``, ``0-9`` or whitespace becomes a
      space.
    - Legal forms and generic stop words are removed.

    Args:
        name: Raw company name. Non-string values are converted with
            ``str``; ``None`` is treated as an empty name.

    Returns:
        The remaining tokens joined by single spaces; may be empty.
    """
    if name is None:
        return ''
    # Compose after case folding so that a decomposed "u + U+0308" is
    # recognised as "ü" by the transliteration table below.
    s = unicodedata.normalize('NFC', str(name).casefold())
    s = s.translate(str.maketrans({'ä': 'ae', 'ö': 'oe', 'ü': 'ue'}))
    # Decompose whatever accented letters are left and drop the combining
    # marks; otherwise the ASCII filter below would cut the word in two.
    s = ''.join(
        ch for ch in unicodedata.normalize('NFKD', s)
        if not unicodedata.combining(ch)
    )
    # Keep only alphanumerics and whitespace.
    s = re.sub(r'[^a-z0-9\s]', ' ', s)
    stops = {'gmbh', 'ag', 'kg', 'ug', 'ohg', 'holding', 'group', 'international'}
    return ' '.join(t for t in s.split() if t not in stops)
def normalize_domain(url: str) -> str:
    """Reduce a website value to its bare host name for comparison.

    The value is case-folded and trimmed, then the ``http(s)://`` (or
    scheme-relative ``//``) prefix, any path, query string or fragment,
    userinfo (``user@``), a numeric port, a leading ``www.`` and a trailing
    dot are removed. Other subdomains are kept as they are.

    Args:
        url: Raw website value. Non-string values are converted with
            ``str``; ``None`` is treated as an empty value.

    Returns:
        The normalised host name, or an empty string if nothing is left.
    """
    if url is None:
        return ''
    s = str(url).casefold().strip()
    s = re.sub(r'^(?:https?:)?//', '', s)
    # Cut at the first path, query or fragment delimiter; a URL such as
    # "example.com?x=1" has no "/" but still must not keep the query.
    s = re.split(r'[/?#]', s, maxsplit=1)[0]
    # Drop "user:password@" (this also reduces an e-mail address to its host).
    s = s.rpartition('@')[2]
    # Drop a trailing numeric port only, so other colons are left untouched.
    s = re.sub(r':\d+$', '', s)
    return s.removeprefix('www.').rstrip('.')
def _add_normalized_columns(df):
    """Add the ``norm_name``, ``norm_domain`` and ``city`` comparison columns to *df* in place."""
    df['norm_name'] = df['CRM Name'].fillna('').apply(normalize_company_name)
    df['norm_domain'] = df['CRM Website'].fillna('').apply(normalize_domain)
    df['city'] = df['CRM Ort'].fillna('').apply(lambda x: str(x).casefold().strip())


def _find_best_matches(crm_df, match_df):
    """Return the best-scoring CRM candidate for each row of *match_df*.

    Candidate pairs are rows of both frames that share the same non-empty
    ``norm_domain``. Each pair gets a weighted score from WEIGHTS; per row of
    *match_df* only the highest-scoring pair is kept, and only if its score
    reaches SCORE_THRESHOLD.

    Returns:
        A DataFrame with at least the columns ``crm_idx`` (index label in
        *crm_df*), ``match_idx`` (index label in *match_df*) and ``score``.
        It is empty when nothing qualifies.
    """
    no_matches = pd.DataFrame(columns=['crm_idx', 'match_idx', 'score'])

    # A blank website is "unknown", not a shared domain. Blocking on it would
    # pair every CRM row without a website with every new row without one
    # and award each of those pairs the full domain weight.
    crm_keyed = crm_df[crm_df['norm_domain'] != '']
    match_keyed = match_df[match_df['norm_domain'] != '']
    if crm_keyed.empty or match_keyed.empty:
        return no_matches

    # Blocking by domain
    indexer = recordlinkage.Index()
    indexer.block('norm_domain')
    candidate_pairs = indexer.index(crm_keyed, match_keyed)
    if candidate_pairs.empty:
        return no_matches

    # Comparison rules
    compare = recordlinkage.Compare()
    compare.exact('norm_domain', 'norm_domain', label='domain')
    compare.string('norm_name', 'norm_name', method='jarowinkler', label='name_sim')
    compare.exact('city', 'city', label='city')
    features = compare.compute(candidate_pairs, crm_df, match_df)

    # Weighted score
    features['score'] = (
        WEIGHTS['domain'] * features['domain']
        + WEIGHTS['name'] * features['name_sim']
        + WEIGHTS['city'] * features['city']
    )

    # Name the two index levels explicitly instead of relying on the
    # "level_0" / "level_1" defaults that reset_index() uses for unnamed levels.
    features.index = features.index.set_names(['crm_idx', 'match_idx'])
    ranked = features.reset_index().sort_values(
        ['match_idx', 'score'], ascending=[True, False]
    )
    # After sorting, the first row per match_idx is its best candidate.
    best = ranked.drop_duplicates('match_idx')
    return best[best['score'] >= SCORE_THRESHOLD]


def _build_output(crm_df, match_df, best):
    """Return the result table: the rows of *match_df* plus the matched CRM fields and score."""
    fields = ('Name', 'Website', 'Ort', 'Land')
    output = match_df[[f'CRM {field}' for field in fields]].copy()
    for field in fields:
        output[f'Matched CRM {field}'] = ''
    output['Score'] = 0.0
    if best.empty:
        return output

    # best holds at most one row per match_idx, so both label arrays line up
    # one-to-one with the values assigned below.
    rows = best['match_idx'].to_numpy()
    hits = crm_df.loc[best['crm_idx'].to_numpy()]
    for field in fields:
        output.loc[rows, f'Matched CRM {field}'] = hits[f'CRM {field}'].to_numpy()
    output.loc[rows, 'Score'] = best['score'].to_numpy()
    return output


def main():
    """Match the new accounts against the CRM accounts and write the result back.

    Loads both worksheets, normalises name, website and city, finds the best
    CRM candidate per new account and overwrites the matching worksheet with
    the original columns plus the matched CRM fields and the score. Progress
    and errors are reported on stdout; nothing is returned.
    """
    # Load the Google Sheets
    sheet_handler = GoogleSheetHandler()
    crm_df = sheet_handler.get_sheet_as_dataframe(CRM_SHEET_NAME)
    match_df = sheet_handler.get_sheet_as_dataframe(MATCHING_SHEET_NAME)
    if crm_df is None or crm_df.empty or match_df is None or match_df.empty:
        print("Fehler: Leere Daten in einem der Tabs. Abbruch.")
        return

    # Use plain positional, unnamed indexes so that the pair labels produced
    # by the matching step address exactly one row in each frame.
    crm_df = crm_df.reset_index(drop=True)
    match_df = match_df.reset_index(drop=True)

    # Normalisation
    for df in (crm_df, match_df):
        _add_normalized_columns(df)

    best = _find_best_matches(crm_df, match_df)
    output = _build_output(crm_df, match_df, best)

    # Write back to the Google Sheet
    data = [output.columns.tolist()] + output.values.tolist()
    success = sheet_handler.clear_and_write_data(MATCHING_SHEET_NAME, data)
    if success:
        print(f"Erfolgreich: {len(best)} Matches mit Score ≥ {SCORE_THRESHOLD}")
    else:
        print("Fehler beim Schreiben ins Google Sheet.")
# Run the duplicate check only when the file is executed as a script.
if __name__ == '__main__':
    main()