From 0d3e320f85f3f29eadb77d161c634ad791551df6 Mon Sep 17 00:00:00 2001 From: Floke Date: Mon, 31 Mar 2025 06:46:48 +0000 Subject: [PATCH] feat(wikipedia): Verbesserte Wikipedia-Erkennung und Infobox-Parsing (v1.0.5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Domain-Key-Extraktion zur besseren Treffererkennung - Scoring-Mechanismus zur Auswahl des besten Wikipedia-Artikels - Infobox-Parser auf XPath-Abfragen (lxml) umgestellt - Validierung durch Titel-, Inhalts-, Domain- und Ähnlichkeitsprüfung - Versionierung der Ergebnisse mit Spaltenausgabe --- brancheneinstufung.py | 45 +++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/brancheneinstufung.py b/brancheneinstufung.py index 42242d29..ac1f31f7 100644 --- a/brancheneinstufung.py +++ b/brancheneinstufung.py @@ -12,9 +12,10 @@ from bs4 import BeautifulSoup from oauth2client.service_account import ServiceAccountCredentials from datetime import datetime from difflib import SequenceMatcher +from lxml import html as lh # === KONFIGURATION === -VERSION = "1.0.2-wiki-only" +VERSION = "1.0.5-xpath" LANG = "de" CREDENTIALS = "service_account.json" SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo" @@ -48,28 +49,24 @@ def extract_domain_key(url): parts = clean_url.split(".") return parts[0] if len(parts) > 1 else "" -# === INFOBOX-PARSING === -def parse_infobox_with_fallback(soup): - infobox = soup.find("table", class_="infobox") +# === INFOBOX-PARSING MIT XPATH === +def parse_infobox_xpath(html_text): + doc = lh.fromstring(html_text) branche = "k.A." umsatz = "k.A." 
- - if infobox: - for row in infobox.find_all("tr"): - th = row.find("th") - td = row.find("td") - if not th or not td: - continue - label = th.get_text(strip=True).lower() - value = td.get_text(strip=True) - - if any(b in label for b in ["branche", "tätigkeitsfeld", "industriezweig", "wirtschaftszweig"]): - branche = value - if "umsatz" in label and "mio" in value.lower(): - match = re.search(r"(\d+[\d.,]*)\\s*Mio", value) + try: + branche_xpath = doc.xpath("//table[contains(@class, 'infobox')]//tr[th[contains(text(), 'Branche')]]/td/text()") + umsatz_xpath = doc.xpath("//table[contains(@class, 'infobox')]//tr[th[contains(translate(text(),'UMSATZ','umsatz'), 'umsatz')]]/td/text()") + if branche_xpath: + branche = branche_xpath[0].strip() + if umsatz_xpath: + umsatz_raw = umsatz_xpath[0].strip() + if "mio" in umsatz_raw.lower() or "millionen" in umsatz_raw.lower(): + match = re.search(r"(\d+[.,]?\d*)", umsatz_raw) if match: umsatz = match.group(1).replace(",", ".") - + except: + pass return branche, umsatz # === WIKIPEDIA DATEN === @@ -88,7 +85,7 @@ def validate_wikipedia_page(content, title, name, domain_key): title_check = any(frag in title.lower() for frag in name_fragments) content_check = any(frag in content.lower() for frag in name_fragments) domain_check = domain_key and domain_key.lower() in content.lower() - sim_check = similarity(name, title) > 0.6 + sim_check = similarity(name, title) > 0.5 return (title_check or content_check or domain_check or sim_check) def get_wikipedia_data(name, website_hint=""): @@ -109,11 +106,10 @@ def get_wikipedia_data(name, website_hint=""): for title in results: try: page = wikipedia.page(title, auto_suggest=False) - html = requests.get(page.url, timeout=10).text + html_text = requests.get(page.url, timeout=10).text if not validate_wikipedia_page(page.content, title, name, domain_key): continue - soup = BeautifulSoup(html, "html.parser") - branche, umsatz = parse_infobox_with_fallback(soup) + branche, umsatz = 
parse_infobox_xpath(html_text) score = similarity(name, title) if branche != "k.A.": score += 0.1 @@ -158,6 +154,9 @@ print("\n✅ Wikipedia-Auswertung abgeschlossen") + + + # === SCHRITT 2: GPT-BEWERTUNG === def classify_company(row, wikipedia_url=""): user_prompt = {