duplicate_checker.py aktualisiert
This commit is contained in:
@@ -5,9 +5,9 @@ from rapidfuzz import fuzz
|
|||||||
from google_sheet_handler import GoogleSheetHandler
|
from google_sheet_handler import GoogleSheetHandler
|
||||||
|
|
||||||
# --- Konfiguration ---
|
# --- Konfiguration ---
|
||||||
CRM_SHEET_NAME = "CRM_Accounts"
|
CRM_SHEET_NAME = "CRM_Accounts"
|
||||||
MATCHING_SHEET_NAME = "Matching_Accounts"
|
MATCHING_SHEET_NAME = "Matching_Accounts"
|
||||||
SCORE_THRESHOLD = 0.8
|
SCORE_THRESHOLD = 0.8
|
||||||
WEIGHTS = {
|
WEIGHTS = {
|
||||||
'domain': 0.5,
|
'domain': 0.5,
|
||||||
'name': 0.4,
|
'name': 0.4,
|
||||||
@@ -17,15 +17,14 @@ WEIGHTS = {
|
|||||||
# --- Hilfsfunktionen ---
|
# --- Hilfsfunktionen ---
|
||||||
def normalize_company_name(name: str) -> str:
|
def normalize_company_name(name: str) -> str:
|
||||||
"""
|
"""
|
||||||
Vereinfachte Normalisierung von Firmennamen:
|
Vereinfacht Firmennamen:
|
||||||
- Unicode‑safe Kleinschreibung
|
- Unicode-safe Kleinschreibung
|
||||||
- Umlaute in ae/oe/ue, ß in ss
|
- Umlaute in ae/oe/ue, ß in ss
|
||||||
- Entfernen von Rechtsformen und Stop-Wörtern
|
- Entfernen von Rechtsformen/Stop-Wörtern
|
||||||
"""
|
"""
|
||||||
s = str(name).casefold()
|
s = str(name).casefold()
|
||||||
for src, dst in [('ä','ae'), ('ö','oe'), ('ü','ue'), ('ß','ss')]:
|
for src, dst in [('ä','ae'), ('ö','oe'), ('ü','ue'), ('ß','ss')]:
|
||||||
s = s.replace(src, dst)
|
s = s.replace(src, dst)
|
||||||
# Nur alphanumerisch und Leerzeichen
|
|
||||||
s = re.sub(r'[^a-z0-9\s]', ' ', s)
|
s = re.sub(r'[^a-z0-9\s]', ' ', s)
|
||||||
stops = ['gmbh','ag','kg','ug','ohg','holding','group','international']
|
stops = ['gmbh','ag','kg','ug','ohg','holding','group','international']
|
||||||
tokens = [t for t in s.split() if t and t not in stops]
|
tokens = [t for t in s.split() if t and t not in stops]
|
||||||
@@ -33,11 +32,13 @@ def normalize_company_name(name: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def normalize_domain(url: str) -> str:
|
def normalize_domain(url: str) -> str:
|
||||||
"""Root-Domain extrahieren, Protokoll und www entfernen"""
|
"""Extrahiere Root-Domain, entferne Protokoll und www-Präfix"""
|
||||||
s = str(url).casefold().strip()
|
s = str(url).casefold().strip()
|
||||||
s = re.sub(r'^https?://', '', s)
|
s = re.sub(r'^https?://', '', s)
|
||||||
s = s.split('/')[0]
|
s = s.split('/')[0]
|
||||||
return s.removeprefix('www.')
|
if s.startswith('www.'):
|
||||||
|
s = s[4:]
|
||||||
|
return s
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|||||||
Reference in New Issue
Block a user