fix(helpers): Verbesserte SerpAPI-Website-Suche
- Ersetzt starre Namensprüfung durch flexible Token-basierte Validierung
- Erhöht die Trefferquote für Firmennamen, die nicht exakt im Suchergebnis erscheinen
- Löst das Problem, dass für bekannte Firmen wie 'Rohde & Schwarz' keine Website gefunden wurde
This commit is contained in:
41
helpers.py
41
helpers.py
@@ -1294,19 +1294,34 @@ def serp_website_lookup(company_name):
|
||||
for result in data["organic_results"][:5]:
|
||||
url = result.get("link", "")
|
||||
title = result.get("title", "")
|
||||
snippet = result.get("snippet", "")
|
||||
if url and isinstance(url, str) and url.lower().startswith(("http://", "https://")) and not any(bad_domain in url.lower() for bad_domain in blacklist):
|
||||
normalized_url = simple_normalize_url(url)
|
||||
if normalized_url != "k.A.":
|
||||
normalized_company = normalize_company_name(company_name)
|
||||
domain_part_normalized = normalized_url.replace('www.', '').split('.')[0]
|
||||
title_lower = title.lower()
|
||||
snippet_lower = snippet.lower()
|
||||
domain_name_match = domain_part_normalized in normalized_company
|
||||
name_in_result_text = normalized_company in title_lower or normalized_company in snippet_lower
|
||||
if domain_name_match or name_in_result_text:
|
||||
logger.info(f"SERP Lookup: Website '{normalized_url}' aus Organic Results fuer '{company_name}' gefunden.")
|
||||
return normalized_url
|
||||
if not url or not isinstance(url, str) or not url.lower().startswith(("http://", "https://")) or any(bad_domain in url.lower() for bad_domain in blacklist):
|
||||
continue
|
||||
|
||||
normalized_url = simple_normalize_url(url)
|
||||
if normalized_url == "k.A.":
|
||||
continue
|
||||
|
||||
# --- NEUE, ROBUSTERE VALIDIERUNGSLOGIK ---
|
||||
normalized_company_name = normalize_company_name(company_name)
|
||||
# Zerlege den Firmennamen in signifikante Tokens (mind. 3 Zeichen)
|
||||
company_tokens = {token for token in re.split(r'\s+', normalized_company_name) if len(token) >= 3}
|
||||
|
||||
if not company_tokens:
|
||||
continue # Wenn keine sinnvollen Tokens vorhanden sind, überspringen
|
||||
|
||||
domain_part = normalized_url.split('.')[0].lower()
|
||||
title_lower = title.lower()
|
||||
|
||||
# Prüfe, ob mindestens ein signifikantes Token in der Domain oder im Titel vorkommt
|
||||
token_in_domain = any(token in domain_part for token in company_tokens)
|
||||
token_in_title = any(token in title_lower for token in company_tokens)
|
||||
|
||||
# Wir akzeptieren die URL, wenn ein Token in der Domain ODER im Titel ist.
|
||||
# Eine Übereinstimmung in der Domain ist ein stärkeres Signal.
|
||||
if token_in_domain or token_in_title:
|
||||
logger.info(f"SERP Lookup: Website '{normalized_url}' aus Organic Results fuer '{company_name}' gefunden (Token-Match).")
|
||||
return normalized_url
|
||||
# --- ENDE DER NEUEN LOGIK ---
|
||||
|
||||
logger.info(f"SERP Lookup: Keine passende Website fuer '{company_name}' gefunden nach Pruefung KG und Top Organic Results.")
|
||||
return "k.A."
|
||||
|
||||
Reference in New Issue
Block a user