fix(wikipedia): Robustere Extraktion durch angepasste Infobox-Auswertung mit Synonym-Feldern

- XPath entfernt, da BeautifulSoup in Kombination mit Infobox-Selektor zuverlässiger
- Nur noch Branchen aus der Infobox erlaubt; der Kategorien-Fallback wurde entfernt
- Verbesserte Ausgabe: Nur bei gefundenem Wikipedia-Link werden Werte eingetragen
- Versionskennung 1.0.7 eingeführt (Spalte Q)
This commit is contained in:
2025-03-31 08:36:38 +00:00
parent 85021e2eff
commit c32fef45b9

View File

@@ -12,10 +12,9 @@ from bs4 import BeautifulSoup
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime
from difflib import SequenceMatcher
from lxml import html as lh
# === KONFIGURATION ===
VERSION = "1.0.9-wiki-refined"
VERSION = "1.0.7"
LANG = "de"
CREDENTIALS = "service_account.json"
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
@@ -53,6 +52,35 @@ def extract_domain_key(url):
def similarity(a, b):
    """Return the case-insensitive fuzzy-match ratio of two strings, in [0.0, 1.0]."""
    left, right = a.lower(), b.lower()
    return SequenceMatcher(None, left, right).ratio()
# === INFOS AUS INFOBOX LESEN ===
def extract_infobox_fields(soup):
    """Extract Branche (industry) and Umsatz (revenue) from a Wikipedia infobox.

    Looks for the first <table> whose CSS class contains "infobox" and walks
    its rows.  Returns a ``(branche, umsatz)`` tuple of stripped strings; each
    element is "" when the corresponding field was not found.  The revenue is
    reduced to the first numeric token with "," normalized to ".".
    """
    branche, umsatz = "", ""
    # Synonym labels that all count as the "industry" field.
    branche_labels = ("branche", "tätigkeitsfeld", "bereich")
    infobox = soup.find("table", class_=lambda c: c and "infobox" in c)
    if infobox:
        for row in infobox.find_all("tr"):
            header = row.find("th")
            cell = row.find("td")
            # bs4 Tags are falsy when empty, so this also skips empty cells
            # exactly like the original truthiness check did.
            if not header or not cell:
                continue
            label = header.text.lower().strip()
            if any(key in label for key in branche_labels):
                branche = cell.text.strip()
            if "umsatz" in label:
                raw_revenue = cell.text.strip()
                hit = re.search(r"(\d+[.,]?\d*)", raw_revenue)
                if hit:
                    umsatz = hit.group(1).replace(",", ".")
    return branche, umsatz
# === VALIDIERUNG DES WIKIPEDIA-ARTIKELS ===
def is_valid_wiki_article(content, name, domain_key):
    """Heuristically decide whether article HTML plausibly matches a company.

    Awards one point if any whitespace-separated token of *name* appears in
    *content* (case-insensitive) and one point if *domain_key* appears.
    Accepts the article when at least one point was scored.
    """
    haystack = content.lower()
    points = 0
    if any(token in haystack for token in name.lower().split()):
        points += 1
    if domain_key and domain_key.lower() in haystack:
        points += 1
    return points >= 1
# === WIKIPEDIA DATEN LADEN ===
# NOTE(review): the span below is a diff rendering that interleaves the REMOVED
# and the ADDED version of this function (leading indentation was stripped and
# both the old `results=3` and new `results=5` search calls appear, along with
# an embedded `@@` hunk header).  It is not valid Python as displayed; the
# lines are kept byte-identical and only annotated.
def get_wikipedia_data(name, website_hint=""):
begriffe = [name.strip(), " ".join(name.split()[:2])]
@@ -60,35 +88,20 @@ def get_wikipedia_data(name, website_hint=""):
# Derive an extra search term from the website hostname, e.g. "acme" from
# "https://acme.de".
parts = website_hint.replace("https://", "").replace("http://", "").split(".")
if len(parts) > 1:
begriffe.append(parts[0])
# ADDED line: domain key used later by is_valid_wiki_article().
domain_key = extract_domain_key(website_hint)
for suchbegriff in begriffe:
# REMOVED (old: 3 results) vs ADDED (new: 5 results) search call pair:
results = wikipedia.search(suchbegriff, results=3)
results = wikipedia.search(suchbegriff, results=5)
for title in results:
try:
# REMOVED: old page fetch with title-prefix filter.
page = wikipedia.page(title)
if name.lower().split()[0] not in page.title.lower():
# ADDED: new page fetch without auto-suggest, validated via HTML content.
page = wikipedia.page(title, auto_suggest=False)
html_content = requests.get(page.url, timeout=10).text
if not is_valid_wiki_article(html_content, name, domain_key):
continue
# REMOVED: old flow stored url before fetching the HTML.
url = page.url
html_content = requests.get(url, timeout=10).text
soup = BeautifulSoup(html_content, 'html.parser')
# REMOVED block: old inline infobox parsing (exact class match, German
# label checks) replaced by the extract_infobox_fields() helper below.
infobox = soup.find("table", {"class": "infobox"})
branche = umsatz = ""
if infobox:
for row in infobox.find_all("tr"):
th, td = row.find("th"), row.find("td")
if not th or not td:
continue
if "Branche" in th.text:
branche = td.text.strip()
if "Umsatz" in th.text:
umsatz_raw = td.text.strip()
match = re.search(r"(\d+[.,]?\d*)", umsatz_raw)
if match:
umsatz = match.group(1).replace(",", ".")
# REMOVED: old category fallback for missing Branche (per commit message,
# the fallback was dropped in this change).
if not branche:
cats = page.categories
branche = cats[0] if cats else "k.A."
return url, branche or "k.A.", umsatz or "k.A."
# ADDED: new flow delegates to the helper and only returns when it found data.
branche, umsatz = extract_infobox_fields(soup)
if branche or umsatz:
return page.url, branche or "k.A.", umsatz or "k.A."
except:
continue
return "", "k.A.", "k.A."
@@ -124,6 +137,9 @@ print("\n✅ Wikipedia-Auswertung abgeschlossen")
# === SCHRITT 2: GPT-BEWERTUNG ===
def classify_company(row, wikipedia_url=""):
user_prompt = {