From 270a5fc0e29ceeeb494ae3be198decc35ff4b728 Mon Sep 17 00:00:00 2001 From: Floke Date: Tue, 5 Aug 2025 14:30:02 +0000 Subject: [PATCH] duplicate_checker.py aktualisiert --- duplicate_checker.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/duplicate_checker.py b/duplicate_checker.py index e278f23e..ee9ea125 100644 --- a/duplicate_checker.py +++ b/duplicate_checker.py @@ -5,9 +5,9 @@ from rapidfuzz import fuzz from google_sheet_handler import GoogleSheetHandler # --- Konfiguration --- -CRM_SHEET_NAME = "CRM_Accounts" +CRM_SHEET_NAME = "CRM_Accounts" MATCHING_SHEET_NAME = "Matching_Accounts" -SCORE_THRESHOLD = 0.8 +SCORE_THRESHOLD = 0.8 WEIGHTS = { 'domain': 0.5, 'name': 0.4, @@ -17,15 +17,14 @@ WEIGHTS = { # --- Hilfsfunktionen --- def normalize_company_name(name: str) -> str: """ - Vereinfachte Normalisierung von Firmennamen: - - Unicode‑safe Kleinschreibung + Vereinfacht Firmennamen: + - Unicode-safe Kleinschreibung - Umlaute in ae/oe/ue, ß in ss - - Entfernen von Rechtsformen und Stop-Wörtern + - Entfernen von Rechtsformen/Stop-Wörtern """ s = str(name).casefold() for src, dst in [('ä','ae'), ('ö','oe'), ('ü','ue'), ('ß','ss')]: s = s.replace(src, dst) - # Nur alphanumerisch und Leerzeichen s = re.sub(r'[^a-z0-9\s]', ' ', s) stops = ['gmbh','ag','kg','ug','ohg','holding','group','international'] tokens = [t for t in s.split() if t and t not in stops] @@ -33,11 +32,13 @@ def normalize_company_name(name: str) -> str: def normalize_domain(url: str) -> str: - """Root-Domain extrahieren, Protokoll und www entfernen""" + """Extrahiere Root-Domain, entferne Protokoll und www-Präfix""" s = str(url).casefold().strip() s = re.sub(r'^https?://', '', s) s = s.split('/')[0] - return s.removeprefix('www.') + if s.startswith('www.'): + s = s[4:] + return s def main():