class WikipediaScraper:
    """Finds and validates Wikipedia articles for a given company.

    Search terms are derived from the company name (with German legal-form
    suffixes stripped) and from the company website's domain; candidate
    articles are validated by title similarity and an optional check that
    the domain appears in the article HTML.
    """

    def __init__(self):
        # Configure the wikipedia client language once per scraper instance.
        wikipedia.set_lang(Config.LANG)

    def _extract_domain_hint(self, website):
        """Extract the domain key from a website URL.

        E.g. "https://www.example.com/about" -> "example".
        Returns "" for a missing/empty URL.
        """
        if not website:
            return ""
        # Drop protocol and "www." prefix, then take the first dot-part.
        clean_url = (
            website.lower()
            .replace("https://", "")
            .replace("http://", "")
            .replace("www.", "")
        )
        domain_parts = clean_url.split(".")
        return domain_parts[0] if domain_parts else ""

    def _generate_search_terms(self, company_name, website_hint=""):
        """Generate Wikipedia search terms from company name and website.

        Returns a de-duplicated list, most specific term (the raw company
        name) first.
        """
        search_terms = [company_name.strip()]

        # Strip German legal-form suffixes (GmbH, AG, ...) plus anything
        # that follows them. NOTE: the string was previously broken across
        # a wrapped line inside the literal; rejoined here.
        clean_name = re.sub(
            r'\s+(?:GmbH|AG|KG|OHG|e\.V\.|mbH|& Co\. KG|GmbH & Co\. KG).*$',
            '',
            company_name
        ).strip()

        # Add the cleaned name only if stripping actually changed something.
        if clean_name and clean_name != company_name:
            search_terms.append(clean_name)

        # First two relevant words of the cleaned name as a shorter query.
        name_words = [w for w in re.split(r'\W+', clean_name) if w]
        if len(name_words) >= 2:
            search_terms.append(" ".join(name_words[:2]))

        # Domain hint, unless it is a generic token (bare TLD-like values).
        domain_hint = self._extract_domain_hint(website_hint)
        if domain_hint and domain_hint not in ["de", "com", "org", "net"]:
            search_terms.append(domain_hint)

        debug_print(f"Generierte Suchbegriffe: {search_terms}")
        # De-duplicate while preserving order: list(set(...)) would shuffle
        # the terms and lose the "most specific first" ordering.
        return list(dict.fromkeys(search_terms))

    def _validate_article(self, page, company_name, domain_hint=""):
        """Check whether a Wikipedia page plausibly matches the company.

        Validates via title similarity against Config.SIMILARITY_THRESHOLD;
        if a domain hint is given, additionally requires the domain to
        appear in the article HTML.
        """
        # Normalise both names: drop parenthesised disambiguators from the
        # title, strip non-alphanumerics from the company name.
        page_title = re.sub(r'\(.*?\)', '', page.title).strip().lower()
        search_name = re.sub(r'[^a-zA-Z0-9äöüß ]', '', company_name).strip().lower()

        similarity = SequenceMatcher(None, page_title, search_name).ratio()
        debug_print(f"Ähnlichkeit '{page_title}' vs '{search_name}': {similarity:.2f}")

        if domain_hint:
            # Timeout so a slow/unresponsive server cannot hang validation
            # forever (requests.get blocks indefinitely without one).
            html_content = requests.get(page.url, timeout=10).text.lower()
            if domain_hint not in html_content:
                debug_print(f"Domain-Hint '{domain_hint}' nicht im Artikel gefunden")
                return False

        return similarity >= Config.SIMILARITY_THRESHOLD