diff --git a/helpers.py b/helpers.py index 1615bf60..3e061d6c 100644 --- a/helpers.py +++ b/helpers.py @@ -1294,19 +1294,34 @@ def serp_website_lookup(company_name): for result in data["organic_results"][:5]: url = result.get("link", "") title = result.get("title", "") - snippet = result.get("snippet", "") - if url and isinstance(url, str) and url.lower().startswith(("http://", "https://")) and not any(bad_domain in url.lower() for bad_domain in blacklist): - normalized_url = simple_normalize_url(url) - if normalized_url != "k.A.": - normalized_company = normalize_company_name(company_name) - domain_part_normalized = normalized_url.replace('www.', '').split('.')[0] - title_lower = title.lower() - snippet_lower = snippet.lower() - domain_name_match = domain_part_normalized in normalized_company - name_in_result_text = normalized_company in title_lower or normalized_company in snippet_lower - if domain_name_match or name_in_result_text: - logger.info(f"SERP Lookup: Website '{normalized_url}' aus Organic Results fuer '{company_name}' gefunden.") - return normalized_url + if not url or not isinstance(url, str) or not url.lower().startswith(("http://", "https://")) or any(bad_domain in url.lower() for bad_domain in blacklist): + continue + + normalized_url = simple_normalize_url(url) + if normalized_url == "k.A.": + continue + + # --- NEUE, ROBUSTERE VALIDIERUNGSLOGIK --- + normalized_company_name = normalize_company_name(company_name) + # Zerlege den Firmennamen in signifikante Tokens (mind. 3 Zeichen) + company_tokens = {token for token in re.split(r'\s+', normalized_company_name) if len(token) >= 3} + + if not company_tokens: + continue # Wenn keine sinnvollen Tokens vorhanden sind, überspringen + + domain_part = normalized_url.split('.')[0].lower() + title_lower = title.lower() + + # Prüfe, ob mindestens ein signifikantes Token in der Domain oder im Titel vorkommt + token_in_domain = any(token in domain_part for token in company_tokens) + token_in_title = any(token in title_lower for token in company_tokens) + + # Wir akzeptieren die URL, wenn ein Token in der Domain ODER im Titel ist. + # Eine Übereinstimmung in der Domain ist ein stärkeres Signal. + if token_in_domain or token_in_title: + logger.info(f"SERP Lookup: Website '{normalized_url}' aus Organic Results fuer '{company_name}' gefunden (Token-Match).") + return normalized_url + # --- ENDE DER NEUEN LOGIK --- logger.info(f"SERP Lookup: Keine passende Website fuer '{company_name}' gefunden nach Pruefung KG und Top Organic Results.") return "k.A."