diff --git a/brancheneinstufung.py b/brancheneinstufung.py index da08e781..73804ace 100644 --- a/brancheneinstufung.py +++ b/brancheneinstufung.py @@ -47,19 +47,28 @@ def extract_domain_key(url): return parts[0] if len(parts) > 1 else "" # === INFOBOX-PARSING === -def parse_infobox_with_selector(soup): - try: - branche = soup.select_one("#mw-content-text > div.mw-content-ltr.mw-parser-output > table > tbody > tr:nth-child(7) > td:nth-child(2)") - umsatz = soup.select_one("#mw-content-text > div.mw-content-ltr.mw-parser-output > table > tbody > tr:nth-child(8) > td:nth-child(2)") - branche_text = branche.get_text(strip=True) if branche else "k.A." - umsatz_text = umsatz.get_text(strip=True) if umsatz else "k.A." - if "Mio" in umsatz_text: - match = re.search(r"(\d+[\d.,]*)\s*Mio", umsatz_text) - if match: - umsatz_text = match.group(1).replace(",", ".") - return branche_text, umsatz_text - except: - return "k.A.", "k.A." +def parse_infobox_with_fallback(soup): + infobox = soup.find("table", class_="infobox") + branche = "k.A." + umsatz = "k.A." 
+ + if infobox: + for row in infobox.find_all("tr"): + th = row.find("th") + td = row.find("td") + if not th or not td: + continue + label = th.get_text(strip=True).lower() + value = td.get_text(strip=True) + + if any(b in label for b in ["branche", "tätigkeitsfeld", "industriezweig", "wirtschaftszweig"]): + branche = value + if "umsatz" in label and "mio" in value.lower(): + match = re.search(r"(\d+[\d.,]*)\s*Mio", value, re.IGNORECASE) + if match: + umsatz = match.group(1).replace(",", ".") + + return branche, umsatz # === WIKIPEDIA DATEN === WHITELIST_KATEGORIEN = [ @@ -69,6 +78,13 @@ WHITELIST_KATEGORIEN = [ "logistik", "automobil" ] +def validate_wikipedia_page(content, name, domain_key): + name_fragments = name.lower().split()[:2] + return ( + any(frag in content.lower() for frag in name_fragments) or + (domain_key and domain_key.lower() in content.lower()) + ) + def get_wikipedia_data(name, website_hint=""): domain_key = extract_domain_key(website_hint) search_terms = [name, domain_key] if domain_key else [name] @@ -82,15 +98,16 @@ def get_wikipedia_data(name, website_hint=""): try: page = wikipedia.page(title, auto_suggest=False) html = requests.get(page.url, timeout=10).text - if name.split()[0].lower() in page.content.lower() or (domain_key and domain_key.lower() in html.lower()): - soup = BeautifulSoup(html, "html.parser") - branche, umsatz = parse_infobox_with_selector(soup) - if not branche or branche == "k.A.": - for category in page.categories: - if any(kw in category.lower() for kw in WHITELIST_KATEGORIEN): - branche = category - break - return page.url, branche or "k.A.", umsatz or "k.A." 
+ if not validate_wikipedia_page(page.content, name, domain_key): + continue + soup = BeautifulSoup(html, "html.parser") + branche, umsatz = parse_infobox_with_fallback(soup) + if not branche or branche == "k.A.": + for category in page.categories: + if any(kw in category.lower() for kw in WHITELIST_KATEGORIEN): + branche = category + break + return page.url, branche or "k.A.", umsatz or "k.A." except: continue except Exception as e: @@ -121,6 +138,7 @@ for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))): print("\n✅ Wikipedia-Auswertung abgeschlossen") + # === SCHRITT 2: GPT-BEWERTUNG === def classify_company(row, wikipedia_url=""): user_prompt = {