duplicate_checker.py aktualisiert
This commit is contained in:
@@ -2,13 +2,13 @@ import os
|
||||
import sys
|
||||
import logging
|
||||
import pandas as pd
|
||||
import tldextract
|
||||
from datetime import datetime
|
||||
from thefuzz import fuzz
|
||||
from helpers import normalize_company_name, simple_normalize_url
|
||||
from google_sheet_handler import GoogleSheetHandler
|
||||
|
||||
# duplicate_checker.py v2.4 (root-domain match via tldextract)
|
||||
# duplicate_checker.py v2.5 (Original v2.0-Logik + Logging enhancements)
|
||||
# Version: 2025-08-06_17-00
|
||||
# Version: 2025-08-06_16-30
|
||||
# --- Konfiguration ---
|
||||
CRM_SHEET_NAME = "CRM_Accounts"
|
||||
@@ -42,16 +42,14 @@ logger.info("Version: duplicate_checker.py v2.4 (root-domain match via tldextrac
|
||||
|
||||
|
||||
def calculate_similarity(record1, record2):
|
||||
"""Berechnet v2.0-Score mit root-domain match."""
|
||||
"""Berechnet den v2.0-Score: Domain exact=100, Name*0.7, Ort+Land=20."""
|
||||
total_score = 0
|
||||
# Domain root only
|
||||
url1 = record1.get('CRM Website', '')
|
||||
url2 = record2.get('CRM Website', '')
|
||||
dom1 = tldextract.extract(url1).domain
|
||||
dom2 = tldextract.extract(url2).domain
|
||||
# Domain exact match
|
||||
dom1 = record1.get('normalized_domain', '')
|
||||
dom2 = record2.get('normalized_domain', '')
|
||||
if dom1 and dom1 == dom2:
|
||||
total_score += 100
|
||||
# Name fuzzy
|
||||
# Name fuzzy token_set
|
||||
name1 = record1.get('normalized_name', '')
|
||||
name2 = record2.get('normalized_name', '')
|
||||
if name1 and name2:
|
||||
|
||||
Reference in New Issue
Block a user