duplicate_checker.py aktualisiert

This commit is contained in:
2025-08-06 13:28:50 +00:00
parent e11b96aee3
commit 9362c47ab7

View File

@@ -2,13 +2,13 @@ import os
import sys import sys
import logging import logging
import pandas as pd import pandas as pd
import tldextract
from datetime import datetime from datetime import datetime
from thefuzz import fuzz from thefuzz import fuzz
from helpers import normalize_company_name, simple_normalize_url from helpers import normalize_company_name, simple_normalize_url
from google_sheet_handler import GoogleSheetHandler from google_sheet_handler import GoogleSheetHandler
# duplicate_checker.py v2.4 (root-domain match via tldextract) # duplicate_checker.py v2.5 (Original v2.0-Logik + Logging enhancements)
# Version: 2025-08-06_17-00
# Version: 2025-08-06_16-30 # Version: 2025-08-06_16-30
# --- Konfiguration --- # --- Konfiguration ---
CRM_SHEET_NAME = "CRM_Accounts" CRM_SHEET_NAME = "CRM_Accounts"
@@ -42,16 +42,14 @@ logger.info("Version: duplicate_checker.py v2.4 (root-domain match via tldextrac
def calculate_similarity(record1, record2): def calculate_similarity(record1, record2):
"""Berechnet v2.0-Score mit root-domain match.""" """Berechnet den v2.0-Score: Domain exact=100, Name*0.7, Ort+Land=20."""
total_score = 0 total_score = 0
# Domain root only # Domain exact match
url1 = record1.get('CRM Website', '') dom1 = record1.get('normalized_domain', '')
url2 = record2.get('CRM Website', '') dom2 = record2.get('normalized_domain', '')
dom1 = tldextract.extract(url1).domain
dom2 = tldextract.extract(url2).domain
if dom1 and dom1 == dom2: if dom1 and dom1 == dom2:
total_score += 100 total_score += 100
# Name fuzzy # Name fuzzy token_set
name1 = record1.get('normalized_name', '') name1 = record1.get('normalized_name', '')
name2 = record2.get('normalized_name', '') name2 = record2.get('normalized_name', '')
if name1 and name2: if name1 and name2: