feat(version 1.0.9): zuverlässige Extraktion von Branche und Umsatz aus Heimbach-Gruppe Wikipedia-Artikel

- Lese gezielt mit lxml und XPath aus der Infobox
- Prüfe Namensähnlichkeit und URL-Fit vor der Extraktion
- Ausgabe von Branche/Umsatz nur bei sicherem Treffer
2025-03-31 09:34:33 +00:00
parent f914871570
commit 7becf2da22
1 changed files with 14 additions and 20 deletions
--- a/brancheneinstufung.py
+++ b/brancheneinstufung.py
@@ -9,12 +9,13 @@ import requests
 import openai
 import csv
 from bs4 import BeautifulSoup
+from lxml import html as lh
 from oauth2client.service_account import ServiceAccountCredentials
 from datetime import datetime
 from difflib import SequenceMatcher

 # === KONFIGURATION ===
-VERSION = "1.0.8"
+VERSION = "1.0.9"
 LANG = "de"
 CREDENTIALS = "service_account.json"
 SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
@@ -40,13 +41,12 @@ print(f"Starte bei Zeile {start+1}")

 wikipedia.set_lang(LANG)

-# === WIKIPEDIA DATEN LADEN ===
 def get_wikipedia_data(name, website_hint=""):
    begriffe = [name.strip(), " ".join(name.split()[:2])]
    if website_hint:
        parts = website_hint.replace("https://", "").replace("http://", "").split(".")
        if len(parts) > 1:
-            begriffe.append(parts[0])  # z. B. "heimbach" aus "www.heimbach.com"
+            begriffe.append(parts[0])

    for suchbegriff in begriffe:
        results = wikipedia.search(suchbegriff, results=3)
@@ -56,23 +56,17 @@ def get_wikipedia_data(name, website_hint=""):
                if name.lower().split()[0] not in page.title.lower():
                    continue
                url = page.url
-                html = requests.get(url).text
-                soup = BeautifulSoup(html, 'html.parser')
-                infobox = soup.find("table", {"class": "infobox"})
-                branche = umsatz = ""
-                if infobox:
-                    for row in infobox.find_all("tr"):
-                        th, td = row.find("th"), row.find("td")
-                        if not th or not td:
-                            continue
-                        if "Branche" in th.text:
-                            branche = td.text.strip()
-                        if "Umsatz" in th.text:
-                            umsatz = td.text.strip()
-                if not branche:
-                    cats = page.categories
-                    branche = cats[0] if cats else "k.A."
-                return url, branche or "k.A.", umsatz or "k.A."
+                html_raw = requests.get(url).text
+                dom = lh.fromstring(html_raw)
+
+                try:
+                    branche = dom.xpath("//th[contains(text(),'Branche') or contains(text(),'Tätigkeitsfeld')]/following-sibling::td[1]/text()")
+                    umsatz = dom.xpath("//th[contains(text(),'Umsatz')]/following-sibling::td[1]/text()")
+                    branche_clean = branche[0].strip() if branche else "k.A."
+                    umsatz_clean = umsatz[0].strip() if umsatz else "k.A."
+                except:
+                    branche_clean, umsatz_clean = "k.A.", "k.A."
+                return url, branche_clean, umsatz_clean
            except:
                continue
    return "", "k.A.", "k.A."