# Patch summary for brancheneinstufung.py (descriptive preamble; text before the
# "diff --git" header is not part of any hunk).
#
# Hunk 1 (@@ -78,12 +78,12 @@): validate_wikipedia_page gains a new second
#   positional parameter `title`. The single combined return expression is split
#   into three named checks:
#     - title_check:   any of the first two lowercase, whitespace-separated
#                      fragments of `name` occurs in title.lower()
#     - content_check: the same fragments are searched in content.lower()
#     - domain_check:  domain_key is truthy and domain_key.lower() occurs in
#                      content.lower()
#   The function returns `title_check or content_check or domain_check`.
# Hunk 2 (@@ -98,7 +98,7 @@): the call inside get_wikipedia_data passes `title`
#   as the new second argument.
# Hunk 3 (@@ -139,6 +139,7 @@): adds one line before the
#   "# === SCHRITT 2: GPT-BEWERTUNG ===" section marker; the added line's content
#   is not visible here -- presumably a blank line, TODO confirm.
#
# NOTE(review): the parameter is inserted in the middle of the positional list,
#   so any caller not shown in this patch that still passes
#   (content, name, domain_key) would break -- verify there are no other callers.
# NOTE(review): when neither fragment check matches, the return value is
#   `domain_check` itself, which is the falsy `domain_key` (e.g. None or "")
#   rather than False. The visible caller only uses it under `not`, so this is
#   harmless there.
# NOTE(review): the patch text below appears to have lost its line breaks;
#   confirm against the original commit before applying it.
diff --git a/brancheneinstufung.py b/brancheneinstufung.py index 73804ace..fcd0cf7c 100644 --- a/brancheneinstufung.py +++ b/brancheneinstufung.py @@ -78,12 +78,12 @@ WHITELIST_KATEGORIEN = [ "logistik", "automobil" ] -def validate_wikipedia_page(content, name, domain_key): +def validate_wikipedia_page(content, title, name, domain_key): name_fragments = name.lower().split()[:2] - return ( - any(frag in content.lower() for frag in name_fragments) or - (domain_key and domain_key.lower() in content.lower()) - ) + title_check = any(frag in title.lower() for frag in name_fragments) + content_check = any(frag in content.lower() for frag in name_fragments) + domain_check = domain_key and domain_key.lower() in content.lower() + return (title_check or content_check or domain_check) def get_wikipedia_data(name, website_hint=""): domain_key = extract_domain_key(website_hint) @@ -98,7 +98,7 @@ def get_wikipedia_data(name, website_hint=""): try: page = wikipedia.page(title, auto_suggest=False) html = requests.get(page.url, timeout=10).text - if not validate_wikipedia_page(page.content, name, domain_key): + if not validate_wikipedia_page(page.content, title, name, domain_key): continue soup = BeautifulSoup(html, "html.parser") branche, umsatz = parse_infobox_with_fallback(soup) @@ -139,6 +139,7 @@ print("\n✅ Wikipedia-Auswertung abgeschlossen") + # === SCHRITT 2: GPT-BEWERTUNG === def classify_company(row, wikipedia_url=""): user_prompt = {