From 7becf2da2242f2a3d62d2dc73e13809e07fe1022 Mon Sep 17 00:00:00 2001
From: Floke
Date: Mon, 31 Mar 2025 09:34:33 +0000
Subject: [PATCH] =?UTF-8?q?feat(version=201.0.9):=20zuverl=C3=A4ssige=20Ex?=
 =?UTF-8?q?traktion=20von=20Branche=20und=20Umsatz=20aus=20Heimbach-Gruppe?=
 =?UTF-8?q?=20Wikipedia-Artikeln?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Lese gezielt mit lxml und XPath aus der Infobox
- Prüfe Namensähnlichkeit und URL-Fit vor der Extraktion
- Ausgabe von Branche/Umsatz nur bei sicherem Treffer
---
 brancheneinstufung.py | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/brancheneinstufung.py b/brancheneinstufung.py
index f2e93155..255bf21f 100644
--- a/brancheneinstufung.py
+++ b/brancheneinstufung.py
@@ -9,12 +9,13 @@ import requests
 import openai
 import csv
 from bs4 import BeautifulSoup
+from lxml import html as lh
 from oauth2client.service_account import ServiceAccountCredentials
 from datetime import datetime
 from difflib import SequenceMatcher
 
 # === KONFIGURATION ===
-VERSION = "1.0.8"
+VERSION = "1.0.9"
 LANG = "de"
 CREDENTIALS = "service_account.json"
 SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
@@ -40,13 +41,12 @@ print(f"Starte bei Zeile {start+1}")
 
 wikipedia.set_lang(LANG)
 
-# === WIKIPEDIA DATEN LADEN ===
 def get_wikipedia_data(name, website_hint=""):
     begriffe = [name.strip(), " ".join(name.split()[:2])]
     if website_hint:
         parts = website_hint.replace("https://", "").replace("http://", "").split(".")
         if len(parts) > 1:
-            begriffe.append(parts[0])  # z. B. "heimbach" aus "www.heimbach.com"
+            begriffe.append(parts[0])
 
     for suchbegriff in begriffe:
         results = wikipedia.search(suchbegriff, results=3)
@@ -56,23 +56,17 @@ def get_wikipedia_data(name, website_hint=""):
                 if name.lower().split()[0] not in page.title.lower():
                     continue
                 url = page.url
-                html = requests.get(url).text
-                soup = BeautifulSoup(html, 'html.parser')
-                infobox = soup.find("table", {"class": "infobox"})
-                branche = umsatz = ""
-                if infobox:
-                    for row in infobox.find_all("tr"):
-                        th, td = row.find("th"), row.find("td")
-                        if not th or not td:
-                            continue
-                        if "Branche" in th.text:
-                            branche = td.text.strip()
-                        if "Umsatz" in th.text:
-                            umsatz = td.text.strip()
-                if not branche:
-                    cats = page.categories
-                    branche = cats[0] if cats else "k.A."
-                return url, branche or "k.A.", umsatz or "k.A."
+                html_raw = requests.get(url).text
+                dom = lh.fromstring(html_raw)
+
+                try:
+                    branche = dom.xpath("//th[contains(text(),'Branche') or contains(text(),'Tätigkeitsfeld')]/following-sibling::td[1]/text()")
+                    umsatz = dom.xpath("//th[contains(text(),'Umsatz')]/following-sibling::td[1]/text()")
+                    branche_clean = branche[0].strip() if branche else "k.A."
+                    umsatz_clean = umsatz[0].strip() if umsatz else "k.A."
+                except:
+                    branche_clean, umsatz_clean = "k.A.", "k.A."
+                return url, branche_clean, umsatz_clean
             except:
                 continue
     return "", "k.A.", "k.A."