From 00edd44b465fc9564fbf6babb70a440745e4c55e Mon Sep 17 00:00:00 2001 From: Floke Date: Sun, 9 Nov 2025 08:48:06 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20Parent=20Account=20Logik=20f=C3=BCr=20i?= =?UTF-8?q?nterne=20Deduplizierung=20integriert?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Spalte 'Parent Account' wird geladen und normalisiert - 'calculate_similarity' erkennt Parent-Child-Beziehungen und markiert diese - 'run_internal_deduplication' ignoriert bekannte Hierarchien bei der Duplikatsfindung - Neue Spalte 'Duplicate_Hint' für Hinweise auf fehlende Parent Accounts hinzugefügt --- company_deduplicator.py | 71 +++++++++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 16 deletions(-) diff --git a/company_deduplicator.py b/company_deduplicator.py index 88756707..f7e55efa 100644 --- a/company_deduplicator.py +++ b/company_deduplicator.py @@ -180,8 +180,23 @@ def calculate_similarity(mrec: dict, crec: dict, token_freq: Counter): 'penalties': penalties, 'name_bonus': name_bonus, 'rare_overlap': rare_overlap, - 'city_only_overlap': int(city_only_overlap) + 'city_only_overlap': int(city_only_overlap), + 'is_parent_child': 0 # Standardwert } + + # Prüfen auf Parent-Child-Beziehung + n1_norm = mrec.get('normalized_name','') + n2_norm = crec.get('normalized_name','') + p1_norm = mrec.get('normalized_parent_name','') + p2_norm = crec.get('normalized_parent_name','') + + if (n1_norm and p2_norm and n1_norm == p2_norm) or \ + (n2_norm and p1_norm and n2_norm == p1_norm): + comp['is_parent_child'] = 1 + # Wenn es eine Parent-Child-Beziehung ist, geben wir einen sehr hohen Score zurück, + # aber mit dem Flag, damit es später ignoriert werden kann. + return 500, comp # Sehr hoher Score, um es leicht erkennbar zu machen + return round(total), comp # --- Indexe --- @@ -248,11 +263,13 @@ def run_internal_deduplication(): logger.info(f"{len(crm_df)} CRM-Datensätze geladen.") # Normalisierung - crm_df['normalized_name'] = crm_df['CRM Name'].astype(str).apply(normalize_company_name) - crm_df['normalized_domain'] = crm_df['CRM Website'].astype(str).apply(simple_normalize_url) - crm_df['CRM Ort'] = crm_df['CRM Ort'].astype(str).str.lower().str.strip() - crm_df['CRM Land'] = crm_df['CRM Land'].astype(str).str.lower().str.strip() - crm_df['domain_use_flag'] = 1 # CRM-Domain gilt als vertrauenswürdig + crm_df['normalized_name'] = crm_df['CRM Name'].astype(str).apply(normalize_company_name) + crm_df['normalized_domain'] = crm_df['CRM Website'].astype(str).apply(simple_normalize_url) + crm_df['CRM Ort'] = crm_df['CRM Ort'].astype(str).str.lower().str.strip() + crm_df['CRM Land'] = crm_df['CRM Land'].astype(str).str.lower().str.strip() + crm_df['Parent Account'] = crm_df.get('Parent Account', pd.Series(index=crm_df.index, dtype=object)).astype(str).fillna('').str.strip() + crm_df['normalized_parent_name'] = crm_df['Parent Account'].apply(normalize_company_name) + crm_df['domain_use_flag'] = 1 # CRM-Domain gilt als vertrauenswürdig # City-Tokens und Blocking-Indizes global CITY_TOKENS @@ -301,16 +318,27 @@ def run_internal_deduplication(): score, comp = calculate_similarity(record1, record2, token_freq) + # Wenn es eine bekannte Parent-Child-Beziehung ist, ignorieren wir sie. + if comp.get('is_parent_child') == 1: + logger.debug(f" -> Ignoriere bekannte Parent-Child-Beziehung: '{record1['CRM Name']}' <-> '{record2['CRM Name']}'") + continue + # Akzeptanzlogik (hier könnte man den Threshold anpassen) if score >= SCORE_THRESHOLD: + duplicate_hint = '' + # Prüfen, ob beide Accounts keinen Parent Account haben + if not record1.get('Parent Account') and not record2.get('Parent Account'): + duplicate_hint = 'Potenziell fehlende Parent-Account-Beziehung' + pair_info = { 'id1': record1['unique_id'], 'name1': record1['CRM Name'], 'id2': record2['unique_id'], 'name2': record2['CRM Name'], 'score': score, - 'details': str(comp) + 'details': str(comp), + 'hint': duplicate_hint } found_pairs.append(pair_info) - logger.info(f" -> Potenzielles Duplikat gefunden: '{record1['CRM Name']}' <-> '{record2['CRM Name']}' (Score: {score})") + logger.info(f" -> Potenzielles Duplikat gefunden: '{record1['CRM Name']}' <-> '{record2['CRM Name']}' (Score: {score}, Hint: {duplicate_hint})") logger.info("\n===== Interner Abgleich abgeschlossen ====") logger.info(f"Insgesamt {len(found_pairs)} potenzielle Duplikatspaare gefunden.") @@ -328,19 +356,28 @@ def run_internal_deduplication(): # Schritt 4: IDs zuweisen und in Tabelle schreiben crm_df['Duplicate_ID'] = '' + crm_df['Duplicate_Hint'] = '' # Neue Spalte für Hinweise dup_counter = 1 for group in groups: dup_id = f"Dup_{dup_counter:04d}" dup_counter += 1 + # IDs der Gruppe im DataFrame aktualisieren crm_df.loc[crm_df['unique_id'].isin(group), 'Duplicate_ID'] = dup_id + + # Hinweise für die Gruppe sammeln und setzen + group_hints = [p['hint'] for p in found_pairs if p['id1'] in group or p['id2'] in group and p['hint']] + if group_hints: + # Nur den ersten eindeutigen Hinweis pro Gruppe setzen, oder eine Zusammenfassung + unique_hints = list(set(group_hints)) + crm_df.loc[crm_df['unique_id'].isin(group), 'Duplicate_Hint'] = "; ".join(unique_hints) # Namen der Gruppenmitglieder für Log-Ausgabe sammeln member_names = crm_df[crm_df['unique_id'].isin(group)]['CRM Name'].tolist() logger.info(f"Gruppe {dup_id}: {member_names}") # Bereinigen der Hilfsspalten vor dem Schreiben - crm_df.drop(columns=['unique_id', 'normalized_name', 'normalized_domain', 'domain_use_flag'], inplace=True) + crm_df.drop(columns=['unique_id', 'normalized_name', 'normalized_domain', 'domain_use_flag', 'normalized_parent_name'], inplace=True) # Ergebnisse zurückschreiben logger.info("Schreibe Ergebnisse mit Duplikats-IDs ins Sheet...") @@ -438,12 +475,14 @@ def run_external_comparison(): logger.info("Serp-Fallback übersprungen: B oder E bereits befüllt (keine fehlenden Matching-URLs)") # Normalisierung CRM - crm_df['normalized_name'] = crm_df['CRM Name'].astype(str).apply(normalize_company_name) - crm_df['normalized_domain'] = crm_df['CRM Website'].astype(str).apply(simple_normalize_url) - crm_df['CRM Ort'] = crm_df['CRM Ort'].astype(str).str.lower().str.strip() - crm_df['CRM Land'] = crm_df['CRM Land'].astype(str).str.lower().str.strip() - crm_df['block_key'] = crm_df['normalized_name'].apply(lambda x: x.split()[0] if x else None) - crm_df['domain_use_flag'] = 1 # CRM-Domain gilt als vertrauenswürdig + crm_df['normalized_name'] = crm_df['CRM Name'].astype(str).apply(normalize_company_name) + crm_df['normalized_domain'] = crm_df['CRM Website'].astype(str).apply(simple_normalize_url) + crm_df['CRM Ort'] = crm_df['CRM Ort'].astype(str).str.lower().str.strip() + crm_df['CRM Land'] = crm_df['CRM Land'].astype(str).str.lower().str.strip() + crm_df['Parent Account'] = crm_df.get('Parent Account', pd.Series(index=crm_df.index, dtype=object)).astype(str).fillna('').str.strip() + crm_df['normalized_parent_name'] = crm_df['Parent Account'].apply(normalize_company_name) + crm_df['block_key'] = crm_df['normalized_name'].apply(lambda x: x.split()[0] if x else None) + crm_df['domain_use_flag'] = 1 # CRM-Domain gilt als vertrauenswürdig # Normalisierung Matching match_df['Gefundene Website'] = match_df.get('Gefundene Website', pd.Series(index=match_df.index, dtype=object)) @@ -584,7 +623,7 @@ def run_external_comparison(): write_df['Score'] = res_df['Score'] write_df['Match_Grund'] = res_df['Match_Grund'] - drop_cols = ['normalized_name','normalized_domain','block_key','Effektive Website','domain_use_flag'] + drop_cols = ['normalized_name','normalized_domain','block_key','Effektive Website','domain_use_flag', 'normalized_parent_name'] for c in drop_cols: if c in write_df.columns: write_df.drop(columns=[c], inplace=True)