duplicate_checker.py aktualisiert
This commit is contained in:
@@ -5,9 +5,9 @@ from rapidfuzz import fuzz
|
||||
from google_sheet_handler import GoogleSheetHandler
|
||||
|
||||
# --- Configuration ---
# Sheet identifiers (presumably worksheet names handed to GoogleSheetHandler;
# verify against the caller).
CRM_SHEET_NAME = "CRM_Accounts"
MATCHING_SHEET_NAME = "Matching_Accounts"
# NOTE(review): looks like the minimum combined score for reporting a
# duplicate; confirm where it is compared.
SCORE_THRESHOLD = 0.8
|
||||
WEIGHTS = {
|
||||
'domain': 0.5,
|
||||
'name': 0.4,
|
||||
@@ -17,15 +17,14 @@ WEIGHTS = {
|
||||
# --- Hilfsfunktionen ---
|
||||
def normalize_company_name(name: str) -> str:
|
||||
"""
|
||||
Vereinfachte Normalisierung von Firmennamen:
|
||||
- Unicode‑safe Kleinschreibung
|
||||
Vereinfacht Firmennamen:
|
||||
- Unicode-safe Kleinschreibung
|
||||
- Umlaute in ae/oe/ue, ß in ss
|
||||
- Entfernen von Rechtsformen und Stop-Wörtern
|
||||
- Entfernen von Rechtsformen/Stop-Wörtern
|
||||
"""
|
||||
s = str(name).casefold()
|
||||
for src, dst in [('ä','ae'), ('ö','oe'), ('ü','ue'), ('ß','ss')]:
|
||||
s = s.replace(src, dst)
|
||||
# Nur alphanumerisch und Leerzeichen
|
||||
s = re.sub(r'[^a-z0-9\s]', ' ', s)
|
||||
stops = ['gmbh','ag','kg','ug','ohg','holding','group','international']
|
||||
tokens = [t for t in s.split() if t and t not in stops]
|
||||
@@ -33,11 +32,13 @@ def normalize_company_name(name: str) -> str:
|
||||
|
||||
|
||||
def normalize_domain(url: str) -> str:
    """Reduce a URL to its bare host part.

    The value is stringified, case-folded and stripped of surrounding
    whitespace. A leading ``http://`` or ``https://`` is removed, everything
    from the first ``/`` onward is dropped, and a leading ``www.`` is cut off.
    Other subdomains, ports and query strings without a preceding ``/`` are
    left untouched.

    Args:
        url: URL or host name; non-string values are passed through ``str``.

    Returns:
        The normalized host string, e.g. ``"example.com"`` for
        ``"https://www.Example.com/path"``.
    """
    host = str(url).casefold().strip()
    host = re.sub(r'^https?://', '', host)
    host = host.split('/')[0]
    # Slice instead of str.removeprefix so this also runs on Python < 3.9.
    if host.startswith('www.'):
        host = host[4:]
    return host
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
Reference in New Issue
Block a user