feat(version 1.0.9): zuverlässige Extraktion von Branche und Umsatz aus Heimbach-Gruppe Wikipedia-Artikel

- Lese gezielt mit lxml und XPath aus der Infobox
- Prüfe Namensähnlichkeit und URL-Fit vor der Extraktion
- Ausgabe von Branche/Umsatz nur bei sicherem Treffer
This commit is contained in:
2025-03-31 09:34:33 +00:00
parent f914871570
commit 7becf2da22

View File

@@ -9,12 +9,13 @@ import requests
import openai
import csv
from bs4 import BeautifulSoup
from lxml import html as lh
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime
from difflib import SequenceMatcher
# === KONFIGURATION ===
VERSION = "1.0.8"
VERSION = "1.0.9"
LANG = "de"
CREDENTIALS = "service_account.json"
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
@@ -40,13 +41,12 @@ print(f"Starte bei Zeile {start+1}")
wikipedia.set_lang(LANG)
# === WIKIPEDIA DATEN LADEN ===
def get_wikipedia_data(name, website_hint=""):
begriffe = [name.strip(), " ".join(name.split()[:2])]
if website_hint:
parts = website_hint.replace("https://", "").replace("http://", "").split(".")
if len(parts) > 1:
begriffe.append(parts[0]) # z.B. "heimbach" aus "www.heimbach.com"
begriffe.append(parts[0])
for suchbegriff in begriffe:
results = wikipedia.search(suchbegriff, results=3)
@@ -56,23 +56,17 @@ def get_wikipedia_data(name, website_hint=""):
if name.lower().split()[0] not in page.title.lower():
continue
url = page.url
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')
infobox = soup.find("table", {"class": "infobox"})
branche = umsatz = ""
if infobox:
for row in infobox.find_all("tr"):
th, td = row.find("th"), row.find("td")
if not th or not td:
continue
if "Branche" in th.text:
branche = td.text.strip()
if "Umsatz" in th.text:
umsatz = td.text.strip()
if not branche:
cats = page.categories
branche = cats[0] if cats else "k.A."
return url, branche or "k.A.", umsatz or "k.A."
html_raw = requests.get(url).text
dom = lh.fromstring(html_raw)
try:
branche = dom.xpath("//th[contains(text(),'Branche') or contains(text(),'Tätigkeitsfeld')]/following-sibling::td[1]/text()")
umsatz = dom.xpath("//th[contains(text(),'Umsatz')]/following-sibling::td[1]/text()")
branche_clean = branche[0].strip() if branche else "k.A."
umsatz_clean = umsatz[0].strip() if umsatz else "k.A."
except:
branche_clean, umsatz_clean = "k.A.", "k.A."
return url, branche_clean, umsatz_clean
except:
continue
return "", "k.A.", "k.A."