duplicate_checker.py aktualisiert

This commit is contained in:
2025-08-05 14:30:02 +00:00
parent 7d3821ad3d
commit 270a5fc0e2

View File

@@ -17,15 +17,14 @@ WEIGHTS = {
# --- Hilfsfunktionen --- # --- Hilfsfunktionen ---
def normalize_company_name(name: str) -> str: def normalize_company_name(name: str) -> str:
""" """
Vereinfachte Normalisierung von Firmennamen: Vereinfacht Firmennamen:
- Unicodesafe Kleinschreibung - Unicode-safe Kleinschreibung
- Umlaute in ae/oe/ue, ß in ss - Umlaute in ae/oe/ue, ß in ss
- Entfernen von Rechtsformen und Stop-Wörtern - Entfernen von Rechtsformen/Stop-Wörtern
""" """
s = str(name).casefold() s = str(name).casefold()
for src, dst in [('ä','ae'), ('ö','oe'), ('ü','ue'), ('ß','ss')]: for src, dst in [('ä','ae'), ('ö','oe'), ('ü','ue'), ('ß','ss')]:
s = s.replace(src, dst) s = s.replace(src, dst)
# Nur alphanumerisch und Leerzeichen
s = re.sub(r'[^a-z0-9\s]', ' ', s) s = re.sub(r'[^a-z0-9\s]', ' ', s)
stops = ['gmbh','ag','kg','ug','ohg','holding','group','international'] stops = ['gmbh','ag','kg','ug','ohg','holding','group','international']
tokens = [t for t in s.split() if t and t not in stops] tokens = [t for t in s.split() if t and t not in stops]
@@ -33,11 +32,13 @@ def normalize_company_name(name: str) -> str:
def normalize_domain(url: str) -> str:
    """Reduce a URL or host string to a bare host name for comparison.

    The input is case-folded and trimmed; then a leading ``http://`` or
    ``https://`` scheme, everything from the first ``/``, ``?`` or ``#``
    onwards, a trailing ``:port`` and a leading ``www.`` are removed.
    Other subdomains are kept unchanged.

    Args:
        url: URL or host string. Non-string values are converted with
            ``str``; ``None`` and float NaN are treated as missing.

    Returns:
        The normalized host, or ``''`` when the input is missing or empty.
    """
    # Missing values must not become the literal hosts 'none' / 'nan',
    # otherwise all records without a website would compare as equal.
    if url is None or (isinstance(url, float) and url != url):
        return ''
    s = str(url).casefold().strip()
    s = re.sub(r'^https?://', '', s)
    # Cut at the first path, query or fragment delimiter, not only at '/'.
    s = re.split(r'[/?#]', s, maxsplit=1)[0]
    # Drop an explicit port so 'example.com:443' equals 'example.com'.
    s = re.sub(r':\d+$', '', s)
    # Slice instead of str.removeprefix, which requires Python 3.9+.
    if s.startswith('www.'):
        s = s[4:]
    return s
def main(): def main():