feat: Parent-Account-Logik für interne Deduplizierung integriert

- Spalte 'Parent Account' wird geladen und normalisiert
- 'calculate_similarity' erkennt Parent-Child-Beziehungen und markiert diese
- 'run_internal_deduplication' ignoriert bekannte Hierarchien bei der Duplikatsfindung
- Neue Spalte 'Duplicate_Hint' für Hinweise auf fehlende Parent Accounts hinzugefügt
2025-11-09 08:48:06 +00:00
parent 37182b3a7f
commit 00edd44b46
1 changed files with 55 additions and 16 deletions
--- a/company_deduplicator.py
+++ b/company_deduplicator.py
@@ -180,8 +180,23 @@ def calculate_similarity(mrec: dict, crec: dict, token_freq: Counter):
        'penalties': penalties,
        'name_bonus': name_bonus,
        'rare_overlap': rare_overlap,
-        'city_only_overlap': int(city_only_overlap)
+        'city_only_overlap': int(city_only_overlap),
+        'is_parent_child': 0 # Standardwert
    }
+
+    # Prüfen auf Parent-Child-Beziehung
+    n1_norm = mrec.get('normalized_name','')
+    n2_norm = crec.get('normalized_name','')
+    p1_norm = mrec.get('normalized_parent_name','')
+    p2_norm = crec.get('normalized_parent_name','')
+
+    if (n1_norm and p2_norm and n1_norm == p2_norm) or \
+       (n2_norm and p1_norm and n2_norm == p1_norm):
+        comp['is_parent_child'] = 1
+        # Wenn es eine Parent-Child-Beziehung ist, geben wir einen sehr hohen Score zurück,
+        # aber mit dem Flag, damit es später ignoriert werden kann.
+        return 500, comp # Sehr hoher Score, um es leicht erkennbar zu machen
+
    return round(total), comp

 # --- Indexe ---
@@ -252,6 +267,8 @@ def run_internal_deduplication():
    crm_df['normalized_domain']      = crm_df['CRM Website'].astype(str).apply(simple_normalize_url)
    crm_df['CRM Ort']                = crm_df['CRM Ort'].astype(str).str.lower().str.strip()
    crm_df['CRM Land']               = crm_df['CRM Land'].astype(str).str.lower().str.strip()
+    crm_df['Parent Account']         = crm_df.get('Parent Account', pd.Series(index=crm_df.index, dtype=object)).astype(str).fillna('').str.strip()
+    crm_df['normalized_parent_name'] = crm_df['Parent Account'].apply(normalize_company_name)
    crm_df['domain_use_flag']        = 1  # CRM-Domain gilt als vertrauenswürdig

    # City-Tokens und Blocking-Indizes
@@ -301,16 +318,27 @@ def run_internal_deduplication():

            score, comp = calculate_similarity(record1, record2, token_freq)

+            # Wenn es eine bekannte Parent-Child-Beziehung ist, ignorieren wir sie.
+            if comp.get('is_parent_child') == 1:
+                logger.debug(f"  -> Ignoriere bekannte Parent-Child-Beziehung: '{record1['CRM Name']}' <-> '{record2['CRM Name']}'")
+                continue
+
            # Akzeptanzlogik (hier könnte man den Threshold anpassen)
            if score >= SCORE_THRESHOLD:
+                duplicate_hint = ''
+                # Prüfen, ob beide Accounts keinen Parent Account haben
+                if not record1.get('Parent Account') and not record2.get('Parent Account'):
+                    duplicate_hint = 'Potenziell fehlende Parent-Account-Beziehung'
+
                pair_info = {
                    'id1': record1['unique_id'], 'name1': record1['CRM Name'],
                    'id2': record2['unique_id'], 'name2': record2['CRM Name'],
                    'score': score,
-                    'details': str(comp)
+                    'details': str(comp),
+                    'hint': duplicate_hint
                }
                found_pairs.append(pair_info)
-                logger.info(f"  -> Potenzielles Duplikat gefunden: '{record1['CRM Name']}' <-> '{record2['CRM Name']}' (Score: {score})")
+                logger.info(f"  -> Potenzielles Duplikat gefunden: '{record1['CRM Name']}' <-> '{record2['CRM Name']}' (Score: {score}, Hint: {duplicate_hint})")

    logger.info("\n===== Interner Abgleich abgeschlossen ====")
    logger.info(f"Insgesamt {len(found_pairs)} potenzielle Duplikatspaare gefunden.")
@@ -328,19 +356,28 @@ def run_internal_deduplication():

    # Schritt 4: IDs zuweisen und in Tabelle schreiben
    crm_df['Duplicate_ID'] = ''
+    crm_df['Duplicate_Hint'] = '' # Neue Spalte für Hinweise
    dup_counter = 1
    for group in groups:
        dup_id = f"Dup_{dup_counter:04d}"
        dup_counter += 1
+        
        # IDs der Gruppe im DataFrame aktualisieren
        crm_df.loc[crm_df['unique_id'].isin(group), 'Duplicate_ID'] = dup_id

+        # Hinweise für die Gruppe sammeln und setzen
+        group_hints = [p['hint'] for p in found_pairs if p['id1'] in group or p['id2'] in group and p['hint']]
+        if group_hints:
+            # Nur den ersten eindeutigen Hinweis pro Gruppe setzen, oder eine Zusammenfassung
+            unique_hints = list(set(group_hints))
+            crm_df.loc[crm_df['unique_id'].isin(group), 'Duplicate_Hint'] = "; ".join(unique_hints)
+        
        # Namen der Gruppenmitglieder für Log-Ausgabe sammeln
        member_names = crm_df[crm_df['unique_id'].isin(group)]['CRM Name'].tolist()
        logger.info(f"Gruppe {dup_id}: {member_names}")

    # Bereinigen der Hilfsspalten vor dem Schreiben
-    crm_df.drop(columns=['unique_id', 'normalized_name', 'normalized_domain', 'domain_use_flag'], inplace=True)
+    crm_df.drop(columns=['unique_id', 'normalized_name', 'normalized_domain', 'domain_use_flag', 'normalized_parent_name'], inplace=True)

    # Ergebnisse zurückschreiben
    logger.info("Schreibe Ergebnisse mit Duplikats-IDs ins Sheet...")
@@ -442,6 +479,8 @@ def run_external_comparison():
    crm_df['normalized_domain']      = crm_df['CRM Website'].astype(str).apply(simple_normalize_url)
    crm_df['CRM Ort']                = crm_df['CRM Ort'].astype(str).str.lower().str.strip()
    crm_df['CRM Land']               = crm_df['CRM Land'].astype(str).str.lower().str.strip()
+    crm_df['Parent Account']         = crm_df.get('Parent Account', pd.Series(index=crm_df.index, dtype=object)).astype(str).fillna('').str.strip()
+    crm_df['normalized_parent_name'] = crm_df['Parent Account'].apply(normalize_company_name)
    crm_df['block_key']              = crm_df['normalized_name'].apply(lambda x: x.split()[0] if x else None)
    crm_df['domain_use_flag']        = 1  # CRM-Domain gilt als vertrauenswürdig

@@ -584,7 +623,7 @@ def run_external_comparison():
    write_df['Score'] = res_df['Score']
    write_df['Match_Grund'] = res_df['Match_Grund']

-    drop_cols = ['normalized_name','normalized_domain','block_key','Effektive Website','domain_use_flag']
+    drop_cols = ['normalized_name','normalized_domain','block_key','Effektive Website','domain_use_flag', 'normalized_parent_name']
    for c in drop_cols:
        if c in write_df.columns:
            write_df.drop(columns=[c], inplace=True)