From 7becf2da2242f2a3d62d2dc73e13809e07fe1022 Mon Sep 17 00:00:00 2001
From: Floke
Date: Mon, 31 Mar 2025 09:34:33 +0000
Subject: [PATCH] =?UTF-8?q?feat(version=201.0.9):=20zuverl=C3=A4ssige=20Ex?=
 =?UTF-8?q?traktion=20von=20Branche=20und=20Umsatz=20aus=20Heimbach-Gruppe?=
 =?UTF-8?q?=20Wikipedia-Artikeln?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Lese gezielt mit lxml und XPath aus der Infobox
- Prüfe Namensähnlichkeit und URL-Fit vor der Extraktion
- Ausgabe von Branche/Umsatz nur bei sicherem Treffer
---
 brancheneinstufung.py | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/brancheneinstufung.py b/brancheneinstufung.py
index f2e93155..255bf21f 100644
--- a/brancheneinstufung.py
+++ b/brancheneinstufung.py
@@ -9,12 +9,13 @@ import requests
 import openai
 import csv
 from bs4 import BeautifulSoup
+from lxml import html as lh
 from oauth2client.service_account import ServiceAccountCredentials
 from datetime import datetime
 from difflib import SequenceMatcher
 
 # === KONFIGURATION ===
-VERSION = "1.0.8"
+VERSION = "1.0.9"
 LANG = "de"
 CREDENTIALS = "service_account.json"
 SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
@@ -40,13 +41,12 @@ print(f"Starte bei Zeile {start+1}")
 
 wikipedia.set_lang(LANG)
 
-# === WIKIPEDIA DATEN LADEN ===
 def get_wikipedia_data(name, website_hint=""):
     begriffe = [name.strip(), " ".join(name.split()[:2])]
     if website_hint:
         parts = website_hint.replace("https://", "").replace("http://", "").split(".")
         if len(parts) > 1:
-            begriffe.append(parts[0])  # z. B. "heimbach" aus "www.heimbach.com"
+            begriffe.append(parts[0])
 
     for suchbegriff in begriffe:
         results = wikipedia.search(suchbegriff, results=3)
@@ -56,23 +56,17 @@ def get_wikipedia_data(name, website_hint=""):
                 if name.lower().split()[0] not in page.title.lower():
                     continue
                 url = page.url
-                html = requests.get(url).text
-                soup = BeautifulSoup(html, 'html.parser')
-                infobox = soup.find("table", {"class": "infobox"})
-                branche = umsatz = ""
-                if infobox:
-                    for row in infobox.find_all("tr"):
-                        th, td = row.find("th"), row.find("td")
-                        if not th or not td:
-                            continue
-                        if "Branche" in th.text:
-                            branche = td.text.strip()
-                        if "Umsatz" in th.text:
-                            umsatz = td.text.strip()
-                if not branche:
-                    cats = page.categories
-                    branche = cats[0] if cats else "k.A."
-                return url, branche or "k.A.", umsatz or "k.A."
+                html_raw = requests.get(url).text
+                dom = lh.fromstring(html_raw)
+
+                try:
+                    branche = dom.xpath("//th[contains(text(),'Branche') or contains(text(),'Tätigkeitsfeld')]/following-sibling::td[1]/text()")
+                    umsatz = dom.xpath("//th[contains(text(),'Umsatz')]/following-sibling::td[1]/text()")
+                    branche_clean = branche[0].strip() if branche else "k.A."
+                    umsatz_clean = umsatz[0].strip() if umsatz else "k.A."
+                except:
+                    branche_clean, umsatz_clean = "k.A.", "k.A."
+                return url, branche_clean, umsatz_clean
             except:
                 continue
     return "", "k.A.", "k.A."