fix(wikipedia): Robustere Extraktion durch angepasste Infobox-Auswertung mit Synonym-Feldern
- XPath entfernt, da BeautifulSoup in Kombination mit dem Infobox-Selektor zuverlässiger ist
- Branche wird nur noch aus der Infobox gelesen; Kategorien dienen nicht mehr als Fallback
- Verbesserte Ausgabe: Werte werden nur bei gefundenem Wikipedia-Link eingetragen
- Versionskennung 1.0.6 eingeführt (Spalte Q)
This commit is contained in:
@@ -12,10 +12,9 @@ from bs4 import BeautifulSoup
|
|||||||
from oauth2client.service_account import ServiceAccountCredentials
|
from oauth2client.service_account import ServiceAccountCredentials
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from difflib import SequenceMatcher
|
from difflib import SequenceMatcher
|
||||||
from lxml import html as lh
|
|
||||||
|
|
||||||
# === KONFIGURATION ===
# Script version tag; written to the sheet (column Q per the commit message)
# so output rows can be traced back to the code revision that produced them.
VERSION = "1.0.7"
# Wikipedia language edition used for search and page lookups.
LANG = "de"
# Google service-account key file for Sheets API access.
CREDENTIALS = "service_account.json"
# Target Google spreadsheet to read companies from and write results to.
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
@@ -53,6 +52,35 @@ def extract_domain_key(url):
|
|||||||
def similarity(a, b):
    """Return the case-insensitive fuzzy-match ratio of two strings, in [0.0, 1.0]."""
    lowered_a, lowered_b = a.lower(), b.lower()
    return SequenceMatcher(None, lowered_a, lowered_b).ratio()
||||||
|
|
||||||
|
# === READ FIELDS FROM THE WIKIPEDIA INFOBOX ===
def extract_infobox_fields(soup):
    """Extract industry ("Branche") and revenue ("Umsatz") from a Wikipedia infobox.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a (German) Wikipedia article page.

    Returns
    -------
    tuple[str, str]
        ``(branche, umsatz)`` — empty strings when the infobox or the
        respective field is missing.  ``umsatz`` is the first number found
        in the revenue cell, with a decimal comma normalised to a dot
        (e.g. "1,5 Mrd. EUR" -> "1.5").
    """
    branche = umsatz = ""
    # Match any table whose class attribute contains "infobox" — de.wikipedia
    # uses several class variants, so a substring check via callable is used
    # instead of an exact class lookup.
    infobox = soup.find("table", class_=lambda c: c and "infobox" in c)
    if infobox:
        for row in infobox.find_all("tr"):
            th, td = row.find("th"), row.find("td")
            if not th or not td:
                # Row has no label/value pair (e.g. header or image rows).
                continue
            th_text = th.text.lower().strip()
            # Synonym headers that all denote the company's industry.
            if any(key in th_text for key in ["branche", "tätigkeitsfeld", "bereich"]):
                branche = td.text.strip()
            if "umsatz" in th_text:
                umsatz_raw = td.text.strip()
                # Grab the first number, optionally with a decimal separator.
                match = re.search(r"(\d+[.,]?\d*)", umsatz_raw)
                if match:
                    umsatz = match.group(1).replace(",", ".")
    return branche, umsatz
||||||
|
|
||||||
|
# === VALIDATE THAT A WIKIPEDIA ARTICLE MATCHES THE COMPANY ===
def is_valid_wiki_article(content, name, domain_key):
    """Heuristically check whether a page's HTML belongs to company *name*.

    Scores one point if any word of the company name occurs in the page and
    one point if the website's domain key occurs; a single point suffices.

    Parameters
    ----------
    content : str
        Raw HTML (or text) of the candidate Wikipedia page.
    name : str
        Company name as listed in the sheet.
    domain_key : str
        Key derived from the company website; may be empty, in which case
        only the name check applies.

    Returns
    -------
    bool
        True when at least one of the two signals matches.
    """
    # Lowercase the (potentially large) page content ONCE — the original
    # re-lowercased it for every name part inside the generator.
    haystack = content.lower()
    name_parts = name.lower().split()
    score = 0
    if any(part in haystack for part in name_parts):
        score += 1
    if domain_key and domain_key.lower() in haystack:
        score += 1
    return score >= 1
|
|
||||||
# === WIKIPEDIA DATEN LADEN ===
|
# === WIKIPEDIA DATEN LADEN ===
|
||||||
def get_wikipedia_data(name, website_hint=""):
|
def get_wikipedia_data(name, website_hint=""):
|
||||||
begriffe = [name.strip(), " ".join(name.split()[:2])]
|
begriffe = [name.strip(), " ".join(name.split()[:2])]
|
||||||
@@ -60,35 +88,20 @@ def get_wikipedia_data(name, website_hint=""):
|
|||||||
parts = website_hint.replace("https://", "").replace("http://", "").split(".")
|
parts = website_hint.replace("https://", "").replace("http://", "").split(".")
|
||||||
if len(parts) > 1:
|
if len(parts) > 1:
|
||||||
begriffe.append(parts[0])
|
begriffe.append(parts[0])
|
||||||
|
domain_key = extract_domain_key(website_hint)
|
||||||
|
|
||||||
for suchbegriff in begriffe:
|
for suchbegriff in begriffe:
|
||||||
results = wikipedia.search(suchbegriff, results=3)
|
results = wikipedia.search(suchbegriff, results=5)
|
||||||
for title in results:
|
for title in results:
|
||||||
try:
|
try:
|
||||||
page = wikipedia.page(title)
|
page = wikipedia.page(title, auto_suggest=False)
|
||||||
if name.lower().split()[0] not in page.title.lower():
|
html_content = requests.get(page.url, timeout=10).text
|
||||||
|
if not is_valid_wiki_article(html_content, name, domain_key):
|
||||||
continue
|
continue
|
||||||
url = page.url
|
|
||||||
html_content = requests.get(url, timeout=10).text
|
|
||||||
soup = BeautifulSoup(html_content, 'html.parser')
|
soup = BeautifulSoup(html_content, 'html.parser')
|
||||||
infobox = soup.find("table", {"class": "infobox"})
|
branche, umsatz = extract_infobox_fields(soup)
|
||||||
branche = umsatz = ""
|
if branche or umsatz:
|
||||||
if infobox:
|
return page.url, branche or "k.A.", umsatz or "k.A."
|
||||||
for row in infobox.find_all("tr"):
|
|
||||||
th, td = row.find("th"), row.find("td")
|
|
||||||
if not th or not td:
|
|
||||||
continue
|
|
||||||
if "Branche" in th.text:
|
|
||||||
branche = td.text.strip()
|
|
||||||
if "Umsatz" in th.text:
|
|
||||||
umsatz_raw = td.text.strip()
|
|
||||||
match = re.search(r"(\d+[.,]?\d*)", umsatz_raw)
|
|
||||||
if match:
|
|
||||||
umsatz = match.group(1).replace(",", ".")
|
|
||||||
if not branche:
|
|
||||||
cats = page.categories
|
|
||||||
branche = cats[0] if cats else "k.A."
|
|
||||||
return url, branche or "k.A.", umsatz or "k.A."
|
|
||||||
except:
|
except:
|
||||||
continue
|
continue
|
||||||
return "", "k.A.", "k.A."
|
return "", "k.A.", "k.A."
|
||||||
@@ -124,6 +137,9 @@ print("\n✅ Wikipedia-Auswertung abgeschlossen")
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# === SCHRITT 2: GPT-BEWERTUNG ===
|
# === SCHRITT 2: GPT-BEWERTUNG ===
|
||||||
def classify_company(row, wikipedia_url=""):
|
def classify_company(row, wikipedia_url=""):
|
||||||
user_prompt = {
|
user_prompt = {
|
||||||
|
|||||||
Reference in New Issue
Block a user