feat: Parent-Account-Logik für interne Deduplizierung integriert
- Spalte 'Parent Account' wird geladen und normalisiert
- 'calculate_similarity' erkennt Parent-Child-Beziehungen und markiert diese
- 'run_internal_deduplication' ignoriert bekannte Hierarchien bei der Duplikatsfindung
- Neue Spalte 'Duplicate_Hint' für Hinweise auf fehlende Parent Accounts hinzugefügt
This commit is contained in:
@@ -180,8 +180,23 @@ def calculate_similarity(mrec: dict, crec: dict, token_freq: Counter):
|
||||
'penalties': penalties,
|
||||
'name_bonus': name_bonus,
|
||||
'rare_overlap': rare_overlap,
|
||||
'city_only_overlap': int(city_only_overlap)
|
||||
'city_only_overlap': int(city_only_overlap),
|
||||
'is_parent_child': 0 # Standardwert
|
||||
}
|
||||
|
||||
# Prüfen auf Parent-Child-Beziehung
|
||||
n1_norm = mrec.get('normalized_name','')
|
||||
n2_norm = crec.get('normalized_name','')
|
||||
p1_norm = mrec.get('normalized_parent_name','')
|
||||
p2_norm = crec.get('normalized_parent_name','')
|
||||
|
||||
if (n1_norm and p2_norm and n1_norm == p2_norm) or \
|
||||
(n2_norm and p1_norm and n2_norm == p1_norm):
|
||||
comp['is_parent_child'] = 1
|
||||
# Wenn es eine Parent-Child-Beziehung ist, geben wir einen sehr hohen Score zurück,
|
||||
# aber mit dem Flag, damit es später ignoriert werden kann.
|
||||
return 500, comp # Sehr hoher Score, um es leicht erkennbar zu machen
|
||||
|
||||
return round(total), comp
|
||||
|
||||
# --- Indexe ---
|
||||
@@ -252,6 +267,8 @@ def run_internal_deduplication():
|
||||
crm_df['normalized_domain'] = crm_df['CRM Website'].astype(str).apply(simple_normalize_url)
|
||||
crm_df['CRM Ort'] = crm_df['CRM Ort'].astype(str).str.lower().str.strip()
|
||||
crm_df['CRM Land'] = crm_df['CRM Land'].astype(str).str.lower().str.strip()
|
||||
crm_df['Parent Account'] = crm_df.get('Parent Account', pd.Series(index=crm_df.index, dtype=object)).astype(str).fillna('').str.strip()
|
||||
crm_df['normalized_parent_name'] = crm_df['Parent Account'].apply(normalize_company_name)
|
||||
crm_df['domain_use_flag'] = 1 # CRM-Domain gilt als vertrauenswürdig
|
||||
|
||||
# City-Tokens und Blocking-Indizes
|
||||
@@ -301,16 +318,27 @@ def run_internal_deduplication():
|
||||
|
||||
score, comp = calculate_similarity(record1, record2, token_freq)
|
||||
|
||||
# Wenn es eine bekannte Parent-Child-Beziehung ist, ignorieren wir sie.
|
||||
if comp.get('is_parent_child') == 1:
|
||||
logger.debug(f" -> Ignoriere bekannte Parent-Child-Beziehung: '{record1['CRM Name']}' <-> '{record2['CRM Name']}'")
|
||||
continue
|
||||
|
||||
# Akzeptanzlogik (hier könnte man den Threshold anpassen)
|
||||
if score >= SCORE_THRESHOLD:
|
||||
duplicate_hint = ''
|
||||
# Prüfen, ob beide Accounts keinen Parent Account haben
|
||||
if not record1.get('Parent Account') and not record2.get('Parent Account'):
|
||||
duplicate_hint = 'Potenziell fehlende Parent-Account-Beziehung'
|
||||
|
||||
pair_info = {
|
||||
'id1': record1['unique_id'], 'name1': record1['CRM Name'],
|
||||
'id2': record2['unique_id'], 'name2': record2['CRM Name'],
|
||||
'score': score,
|
||||
'details': str(comp)
|
||||
'details': str(comp),
|
||||
'hint': duplicate_hint
|
||||
}
|
||||
found_pairs.append(pair_info)
|
||||
logger.info(f" -> Potenzielles Duplikat gefunden: '{record1['CRM Name']}' <-> '{record2['CRM Name']}' (Score: {score})")
|
||||
logger.info(f" -> Potenzielles Duplikat gefunden: '{record1['CRM Name']}' <-> '{record2['CRM Name']}' (Score: {score}, Hint: {duplicate_hint})")
|
||||
|
||||
logger.info("\n===== Interner Abgleich abgeschlossen ====")
|
||||
logger.info(f"Insgesamt {len(found_pairs)} potenzielle Duplikatspaare gefunden.")
|
||||
@@ -328,19 +356,28 @@ def run_internal_deduplication():
|
||||
|
||||
# Schritt 4: IDs zuweisen und in Tabelle schreiben
|
||||
crm_df['Duplicate_ID'] = ''
|
||||
crm_df['Duplicate_Hint'] = '' # Neue Spalte für Hinweise
|
||||
dup_counter = 1
|
||||
for group in groups:
|
||||
dup_id = f"Dup_{dup_counter:04d}"
|
||||
dup_counter += 1
|
||||
|
||||
# IDs der Gruppe im DataFrame aktualisieren
|
||||
crm_df.loc[crm_df['unique_id'].isin(group), 'Duplicate_ID'] = dup_id
|
||||
|
||||
# Hinweise für die Gruppe sammeln und setzen
|
||||
group_hints = [p['hint'] for p in found_pairs if p['id1'] in group or p['id2'] in group and p['hint']]
|
||||
if group_hints:
|
||||
# Nur den ersten eindeutigen Hinweis pro Gruppe setzen, oder eine Zusammenfassung
|
||||
unique_hints = list(set(group_hints))
|
||||
crm_df.loc[crm_df['unique_id'].isin(group), 'Duplicate_Hint'] = "; ".join(unique_hints)
|
||||
|
||||
# Namen der Gruppenmitglieder für Log-Ausgabe sammeln
|
||||
member_names = crm_df[crm_df['unique_id'].isin(group)]['CRM Name'].tolist()
|
||||
logger.info(f"Gruppe {dup_id}: {member_names}")
|
||||
|
||||
# Bereinigen der Hilfsspalten vor dem Schreiben
|
||||
crm_df.drop(columns=['unique_id', 'normalized_name', 'normalized_domain', 'domain_use_flag'], inplace=True)
|
||||
crm_df.drop(columns=['unique_id', 'normalized_name', 'normalized_domain', 'domain_use_flag', 'normalized_parent_name'], inplace=True)
|
||||
|
||||
# Ergebnisse zurückschreiben
|
||||
logger.info("Schreibe Ergebnisse mit Duplikats-IDs ins Sheet...")
|
||||
@@ -442,6 +479,8 @@ def run_external_comparison():
|
||||
crm_df['normalized_domain'] = crm_df['CRM Website'].astype(str).apply(simple_normalize_url)
|
||||
crm_df['CRM Ort'] = crm_df['CRM Ort'].astype(str).str.lower().str.strip()
|
||||
crm_df['CRM Land'] = crm_df['CRM Land'].astype(str).str.lower().str.strip()
|
||||
crm_df['Parent Account'] = crm_df.get('Parent Account', pd.Series(index=crm_df.index, dtype=object)).astype(str).fillna('').str.strip()
|
||||
crm_df['normalized_parent_name'] = crm_df['Parent Account'].apply(normalize_company_name)
|
||||
crm_df['block_key'] = crm_df['normalized_name'].apply(lambda x: x.split()[0] if x else None)
|
||||
crm_df['domain_use_flag'] = 1 # CRM-Domain gilt als vertrauenswürdig
|
||||
|
||||
@@ -584,7 +623,7 @@ def run_external_comparison():
|
||||
write_df['Score'] = res_df['Score']
|
||||
write_df['Match_Grund'] = res_df['Match_Grund']
|
||||
|
||||
drop_cols = ['normalized_name','normalized_domain','block_key','Effektive Website','domain_use_flag']
|
||||
drop_cols = ['normalized_name','normalized_domain','block_key','Effektive Website','domain_use_flag', 'normalized_parent_name']
|
||||
for c in drop_cols:
|
||||
if c in write_df.columns:
|
||||
write_df.drop(columns=[c], inplace=True)
|
||||
|
||||
Reference in New Issue
Block a user