From 4f6d51df568659b0188052d668bf37888ca68198 Mon Sep 17 00:00:00 2001 From: Floke Date: Wed, 6 Aug 2025 13:28:50 +0000 Subject: [PATCH] duplicate_checker.py aktualisiert --- duplicate_checker.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/duplicate_checker.py b/duplicate_checker.py index 62824980..3301e1df 100644 --- a/duplicate_checker.py +++ b/duplicate_checker.py @@ -2,13 +2,13 @@ import os import sys import logging import pandas as pd -import tldextract from datetime import datetime from thefuzz import fuzz from helpers import normalize_company_name, simple_normalize_url from google_sheet_handler import GoogleSheetHandler -# duplicate_checker.py v2.4 (root-domain match via tldextract) +# duplicate_checker.py v2.5 (Original v2.0-Logik + Logging enhancements) +# Version: 2025-08-06_17-00 # Version: 2025-08-06_16-30 # --- Konfiguration --- CRM_SHEET_NAME = "CRM_Accounts" @@ -42,16 +42,14 @@ logger.info("Version: duplicate_checker.py v2.4 (root-domain match via tldextrac def calculate_similarity(record1, record2): - """Berechnet v2.0-Score mit root-domain match.""" + """Berechnet den v2.0-Score: Domain exact=100, Name*0.7, Ort+Land=20.""" total_score = 0 - # Domain root only - url1 = record1.get('CRM Website', '') - url2 = record2.get('CRM Website', '') - dom1 = tldextract.extract(url1).domain - dom2 = tldextract.extract(url2).domain + # Domain exact match + dom1 = record1.get('normalized_domain', '') + dom2 = record2.get('normalized_domain', '') if dom1 and dom1 == dom2: total_score += 100 - # Name fuzzy + # Name fuzzy token_set name1 = record1.get('normalized_name', '') name2 = record2.get('normalized_name', '') if name1 and name2: