feat(version 1.0.9): zuverlässige Extraktion von Branche und Umsatz aus Heimbach-Gruppe Wikipedia-Artikel
- Lies gezielt mit lxml und XPath aus der Infobox
- Prüfe Namensähnlichkeit und URL-Fit vor der Extraktion
- Ausgabe von Branche/Umsatz nur bei sicherem Treffer
This commit is contained in:
@@ -9,12 +9,13 @@ import requests
|
||||
import openai
|
||||
import csv
|
||||
from bs4 import BeautifulSoup
|
||||
from lxml import html as lh
|
||||
from oauth2client.service_account import ServiceAccountCredentials
|
||||
from datetime import datetime
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
# === KONFIGURATION ===
|
||||
VERSION = "1.0.8"
|
||||
VERSION = "1.0.9"
|
||||
LANG = "de"
|
||||
CREDENTIALS = "service_account.json"
|
||||
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
|
||||
@@ -40,13 +41,12 @@ print(f"Starte bei Zeile {start+1}")
|
||||
|
||||
wikipedia.set_lang(LANG)
|
||||
|
||||
# === WIKIPEDIA DATEN LADEN ===
|
||||
def get_wikipedia_data(name, website_hint=""):
|
||||
begriffe = [name.strip(), " ".join(name.split()[:2])]
|
||||
if website_hint:
|
||||
parts = website_hint.replace("https://", "").replace("http://", "").split(".")
|
||||
if len(parts) > 1:
|
||||
begriffe.append(parts[0]) # z. B. "heimbach" aus "www.heimbach.com"
|
||||
begriffe.append(parts[0])
|
||||
|
||||
for suchbegriff in begriffe:
|
||||
results = wikipedia.search(suchbegriff, results=3)
|
||||
@@ -56,23 +56,17 @@ def get_wikipedia_data(name, website_hint=""):
|
||||
if name.lower().split()[0] not in page.title.lower():
|
||||
continue
|
||||
url = page.url
|
||||
html = requests.get(url).text
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
infobox = soup.find("table", {"class": "infobox"})
|
||||
branche = umsatz = ""
|
||||
if infobox:
|
||||
for row in infobox.find_all("tr"):
|
||||
th, td = row.find("th"), row.find("td")
|
||||
if not th or not td:
|
||||
continue
|
||||
if "Branche" in th.text:
|
||||
branche = td.text.strip()
|
||||
if "Umsatz" in th.text:
|
||||
umsatz = td.text.strip()
|
||||
if not branche:
|
||||
cats = page.categories
|
||||
branche = cats[0] if cats else "k.A."
|
||||
return url, branche or "k.A.", umsatz or "k.A."
|
||||
html_raw = requests.get(url).text
|
||||
dom = lh.fromstring(html_raw)
|
||||
|
||||
try:
|
||||
branche = dom.xpath("//th[contains(text(),'Branche') or contains(text(),'Tätigkeitsfeld')]/following-sibling::td[1]/text()")
|
||||
umsatz = dom.xpath("//th[contains(text(),'Umsatz')]/following-sibling::td[1]/text()")
|
||||
branche_clean = branche[0].strip() if branche else "k.A."
|
||||
umsatz_clean = umsatz[0].strip() if umsatz else "k.A."
|
||||
except:
|
||||
branche_clean, umsatz_clean = "k.A.", "k.A."
|
||||
return url, branche_clean, umsatz_clean
|
||||
except:
|
||||
continue
|
||||
return "", "k.A.", "k.A."
|
||||
|
||||
Reference in New Issue
Block a user