duplicate_checker.py aktualisiert

This commit is contained in:
2025-08-06 13:28:50 +00:00
parent e11b96aee3
commit 9362c47ab7

View File

@@ -2,13 +2,13 @@ import os
import sys
import logging
import pandas as pd
import tldextract
from datetime import datetime
from thefuzz import fuzz
from helpers import normalize_company_name, simple_normalize_url
from google_sheet_handler import GoogleSheetHandler
# duplicate_checker.py v2.4 (root-domain match via tldextract)
# duplicate_checker.py v2.5 (Original v2.0-Logik + Logging enhancements)
# Version: 2025-08-06_17-00
# Version: 2025-08-06_16-30
# --- Configuration ---
# Sheet name for the CRM account data.
# NOTE(review): presumably the worksheet opened via GoogleSheetHandler — confirm against the caller.
CRM_SHEET_NAME = "CRM_Accounts"
@@ -42,16 +42,14 @@ logger.info("Version: duplicate_checker.py v2.4 (root-domain match via tldextrac
def calculate_similarity(record1, record2):
"""Berechnet v2.0-Score mit root-domain match."""
"""Berechnet den v2.0-Score: Domain exact=100, Name*0.7, Ort+Land=20."""
total_score = 0
# Domain root only
url1 = record1.get('CRM Website', '')
url2 = record2.get('CRM Website', '')
dom1 = tldextract.extract(url1).domain
dom2 = tldextract.extract(url2).domain
# Domain exact match
dom1 = record1.get('normalized_domain', '')
dom2 = record2.get('normalized_domain', '')
if dom1 and dom1 == dom2:
total_score += 100
# Name fuzzy
# Name fuzzy token_set
name1 = record1.get('normalized_name', '')
name2 = record2.get('normalized_name', '')
if name1 and name2: