duplicate_checker.py aktualisiert
This commit is contained in:
@@ -2,13 +2,13 @@ import os
|
|||||||
import sys
|
import sys
|
||||||
import logging
|
import logging
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import tldextract
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from thefuzz import fuzz
|
from thefuzz import fuzz
|
||||||
from helpers import normalize_company_name, simple_normalize_url
|
from helpers import normalize_company_name, simple_normalize_url
|
||||||
from google_sheet_handler import GoogleSheetHandler
|
from google_sheet_handler import GoogleSheetHandler
|
||||||
|
|
||||||
# duplicate_checker.py v2.4 (root-domain match via tldextract)
|
# duplicate_checker.py v2.5 (Original v2.0-Logik + Logging enhancements)
|
||||||
|
# Version: 2025-08-06_17-00
|
||||||
# Version: 2025-08-06_16-30
|
# Version: 2025-08-06_16-30
|
||||||
# --- Konfiguration ---
|
# --- Konfiguration ---
|
||||||
CRM_SHEET_NAME = "CRM_Accounts"
|
CRM_SHEET_NAME = "CRM_Accounts"
|
||||||
@@ -42,16 +42,14 @@ logger.info("Version: duplicate_checker.py v2.4 (root-domain match via tldextrac
|
|||||||
|
|
||||||
|
|
||||||
def calculate_similarity(record1, record2):
|
def calculate_similarity(record1, record2):
|
||||||
"""Berechnet v2.0-Score mit root-domain match."""
|
"""Berechnet den v2.0-Score: Domain exact=100, Name*0.7, Ort+Land=20."""
|
||||||
total_score = 0
|
total_score = 0
|
||||||
# Domain root only
|
# Domain exact match
|
||||||
url1 = record1.get('CRM Website', '')
|
dom1 = record1.get('normalized_domain', '')
|
||||||
url2 = record2.get('CRM Website', '')
|
dom2 = record2.get('normalized_domain', '')
|
||||||
dom1 = tldextract.extract(url1).domain
|
|
||||||
dom2 = tldextract.extract(url2).domain
|
|
||||||
if dom1 and dom1 == dom2:
|
if dom1 and dom1 == dom2:
|
||||||
total_score += 100
|
total_score += 100
|
||||||
# Name fuzzy
|
# Name fuzzy token_set
|
||||||
name1 = record1.get('normalized_name', '')
|
name1 = record1.get('normalized_name', '')
|
||||||
name2 = record2.get('normalized_name', '')
|
name2 = record2.get('normalized_name', '')
|
||||||
if name1 and name2:
|
if name1 and name2:
|
||||||
|
|||||||
Reference in New Issue
Block a user