Wiki Debugging

Wikipedia-Validierung über validate_wikipedia_page() ergänzt (stellt sicher, dass eines der ersten beiden Wörter des Firmennamens oder der Domain-Schlüssel im Seiteninhalt vorkommt). Fallback-Parsing parse_infobox_with_fallback() eingebaut für robustere Extraktion von Branche/Umsatz. Branchenbegriff-Erkennung verbessert (Synonyme wie „Tätigkeitsfeld“, „Industriezweig“ und „Wirtschaftszweig“). Bedingte Auswertung verbessert: Wenn kein Wikipedia-Link vorhanden ist → „k.A.“ für Branche/Umsatz. Selektor-basierte Extraktion entfernt, da sie fehleranfällig war → durch flexible Infobox-Logik ersetzt.
2025-03-30 18:25:03 +00:00
parent e917ace53c
commit 21d9eb41ef
1 changed file with 40 additions and 22 deletions
--- a/brancheneinstufung.py
+++ b/brancheneinstufung.py
@@ -47,19 +47,28 @@ def extract_domain_key(url):
    return parts[0] if len(parts) > 1 else ""

 # === INFOBOX-PARSING ===
-def parse_infobox_with_selector(soup):
-    try:
-        branche = soup.select_one("#mw-content-text > div.mw-content-ltr.mw-parser-output > table > tbody > tr:nth-child(7) > td:nth-child(2)")
-        umsatz = soup.select_one("#mw-content-text > div.mw-content-ltr.mw-parser-output > table > tbody > tr:nth-child(8) > td:nth-child(2)")
-        branche_text = branche.get_text(strip=True) if branche else "k.A."
-        umsatz_text = umsatz.get_text(strip=True) if umsatz else "k.A."
-        if "Mio" in umsatz_text:
-            match = re.search(r"(\d+[\d.,]*)\s*Mio", umsatz_text)
-            if match:
-                umsatz_text = match.group(1).replace(",", ".")
-        return branche_text, umsatz_text
-    except:
-        return "k.A.", "k.A."
+def parse_infobox_with_fallback(soup):
+    infobox = soup.find("table", class_="infobox")
+    branche = "k.A."
+    umsatz = "k.A."
+
+    if infobox:
+        for row in infobox.find_all("tr"):
+            th = row.find("th")
+            td = row.find("td")
+            if not th or not td:
+                continue
+            label = th.get_text(strip=True).lower()
+            value = td.get_text(strip=True)
+
+            if any(b in label for b in ["branche", "tätigkeitsfeld", "industriezweig", "wirtschaftszweig"]):
+                branche = value
+            if "umsatz" in label and "mio" in value.lower():
+                match = re.search(r"(\d+[\d.,]*)\s*Mio", value)
+                if match:
+                    umsatz = match.group(1).replace(",", ".")
+
+    return branche, umsatz

 # === WIKIPEDIA DATEN ===
 WHITELIST_KATEGORIEN = [
@@ -69,6 +78,13 @@ WHITELIST_KATEGORIEN = [
    "logistik", "automobil"
 ]

+def validate_wikipedia_page(content, name, domain_key):
+    name_fragments = name.lower().split()[:2]
+    return (
+        any(frag in content.lower() for frag in name_fragments) or
+        (domain_key and domain_key.lower() in content.lower())
+    )
+
 def get_wikipedia_data(name, website_hint=""):
    domain_key = extract_domain_key(website_hint)
    search_terms = [name, domain_key] if domain_key else [name]
@@ -82,15 +98,16 @@ def get_wikipedia_data(name, website_hint=""):
                    try:
                        page = wikipedia.page(title, auto_suggest=False)
                        html = requests.get(page.url, timeout=10).text
-                        if name.split()[0].lower() in page.content.lower() or (domain_key and domain_key.lower() in html.lower()):
-                            soup = BeautifulSoup(html, "html.parser")
-                            branche, umsatz = parse_infobox_with_selector(soup)
-                            if not branche or branche == "k.A.":
-                                for category in page.categories:
-                                    if any(kw in category.lower() for kw in WHITELIST_KATEGORIEN):
-                                        branche = category
-                                        break
-                            return page.url, branche or "k.A.", umsatz or "k.A."
+                        if not validate_wikipedia_page(page.content, name, domain_key):
+                            continue
+                        soup = BeautifulSoup(html, "html.parser")
+                        branche, umsatz = parse_infobox_with_fallback(soup)
+                        if not branche or branche == "k.A.":
+                            for category in page.categories:
+                                if any(kw in category.lower() for kw in WHITELIST_KATEGORIEN):
+                                    branche = category
+                                    break
+                        return page.url, branche or "k.A.", umsatz or "k.A."
                    except:
                        continue
            except Exception as e:
@@ -121,6 +138,7 @@ for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
 print("\n✅ Wikipedia-Auswertung abgeschlossen")


+
 # === SCHRITT 2: GPT-BEWERTUNG ===
 def classify_company(row, wikipedia_url=""):
    user_prompt = {