From e917ace53cb4e1b7f97c0650521f263ca1255e78 Mon Sep 17 00:00:00 2001
From: Floke <floke.com@gmail.com>
Date: Sun, 30 Mar 2025 18:13:15 +0000
Subject: [PATCH] Use Selector in Wiki
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Selektor-basierte Extraktion für Branche und Umsatz ergänzt (parse_infobox_with_selector).

Fallback auf k.A., wenn der Selektor nicht vorhanden ist.

get_wikipedia_data nutzt nun ausschließlich parse_infobox_with_selector für konsistente Ergebnisse.

Ausgabe an Google Sheet unverändert.

GPT-Teil wurde temporär entfernt, wie besprochen.
---
 brancheneinstufung.py | 37 ++++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/brancheneinstufung.py b/brancheneinstufung.py
index 20352e54..da08e781 100644
--- a/brancheneinstufung.py
+++ b/brancheneinstufung.py
@@ -47,23 +47,19 @@ def extract_domain_key(url):
     return parts[0] if len(parts) > 1 else ""
 
 # === INFOBOX-PARSING ===
-def parse_infobox(soup):
-    infobox = soup.find("table", class_=["infobox", "infobox vcard"])
-    branche = umsatz = ""
-    if infobox:
-        for row in infobox.find_all("tr"):
-            th, td = row.find("th"), row.find("td")
-            if not th or not td:
-                continue
-            if "branche" in th.text.lower():
-                branche = td.get_text(separator=" ", strip=True)
-            if "umsatz" in th.text.lower():
-                umsatz_text = td.get_text(strip=True)
-                if "Mio" in umsatz_text:
-                    match = re.search(r"(\d+[\d.,]*)\s*Mio", umsatz_text)
-                    if match:
-                        umsatz = match.group(1).replace(",", ".")
-    return branche, umsatz
+def parse_infobox_with_selector(soup):
+    """Return (branche, umsatz) from the first cells matching fixed row-7/row-8 selectors; "k.A." if absent."""
+    try:
+        branche = soup.select_one("#mw-content-text > div.mw-content-ltr.mw-parser-output > table > tbody > tr:nth-child(7) > td:nth-child(2)")
+        umsatz = soup.select_one("#mw-content-text > div.mw-content-ltr.mw-parser-output > table > tbody > tr:nth-child(8) > td:nth-child(2)")
+        branche_text = branche.get_text(strip=True) if branche else "k.A."
+        umsatz_text = umsatz.get_text(strip=True) if umsatz else "k.A."
+        match = re.search(r"(\d+[\d.,]*)\s*Mio", umsatz_text)  # pattern itself requires "Mio"
+        if match:
+            umsatz_text = match.group(1).replace(",", ".")  # keep only the figure; commas become dots
+        return branche_text, umsatz_text
+    except Exception:  # best-effort parse, but let KeyboardInterrupt/SystemExit propagate
+        return "k.A.", "k.A."
 
 # === WIKIPEDIA DATEN ===
 WHITELIST_KATEGORIEN = [
@@ -88,8 +84,8 @@ def get_wikipedia_data(name, website_hint=""):
                         html = requests.get(page.url, timeout=10).text
                         if name.split()[0].lower() in page.content.lower() or (domain_key and domain_key.lower() in html.lower()):
                             soup = BeautifulSoup(html, "html.parser")
-                            branche, umsatz = parse_infobox(soup)
-                            if not branche:
+                            branche, umsatz = parse_infobox_with_selector(soup)
+                            if not branche or branche == "k.A.":
                                 for category in page.categories:
                                     if any(kw in category.lower() for kw in WHITELIST_KATEGORIEN):
                                         branche = category
@@ -122,6 +118,9 @@ for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
     print(f"✅ Aktualisiert: {values[:3]}...")
     time.sleep(RETRY_DELAY)
 
+print("\n✅ Wikipedia-Auswertung abgeschlossen")
+
+
 # === SCHRITT 2: GPT-BEWERTUNG ===
 def classify_company(row, wikipedia_url=""):
     user_prompt = {