fix(wikipedia): Robustere Extraktion durch angepasste Infobox-Auswertung mit Synonym-Feldern

- XPath entfernt, da BeautifulSoup in Kombination mit Infobox-Selektor zuverlässiger
- Nur noch Branchen aus der Infobox erlaubt; der Kategorien-Fallback wurde entfernt
- Verbesserte Ausgabe: Nur bei gefundenem Wikipedia-Link werden Werte eingetragen
- Versionskennung 1.0.7 eingeführt (Spalte Q)
This commit is contained in:
2025-03-31 08:36:38 +00:00
parent 85021e2eff
commit c32fef45b9

View File

@@ -12,10 +12,9 @@ from bs4 import BeautifulSoup
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime
from difflib import SequenceMatcher
from lxml import html as lh
# === KONFIGURATION ===
VERSION = "1.0.9-wiki-refined"
VERSION = "1.0.7"
LANG = "de"
CREDENTIALS = "service_account.json"
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
@@ -53,6 +52,35 @@ def extract_domain_key(url):
def similarity(a, b):
    """Return the case-insensitive fuzzy-match ratio of two strings, in [0.0, 1.0]."""
    left, right = a.lower(), b.lower()
    return SequenceMatcher(None, left, right).ratio()
# === INFOS AUS INFOBOX LESEN ===
def extract_infobox_fields(soup):
    """Extract Branche (industry) and Umsatz (revenue) from a Wikipedia infobox.

    Looks for the first <table> whose CSS class contains "infobox" and walks
    its rows.  Returns a ``(branche, umsatz)`` tuple of stripped strings; each
    element is "" when the corresponding field was not found.  The revenue is
    reduced to the first numeric token with "," normalized to ".".
    """
    branche, umsatz = "", ""
    # Synonym labels that all count as the "industry" field.
    branche_labels = ("branche", "tätigkeitsfeld", "bereich")
    infobox = soup.find("table", class_=lambda c: c and "infobox" in c)
    if infobox:
        for row in infobox.find_all("tr"):
            header = row.find("th")
            cell = row.find("td")
            # bs4 Tags are falsy when empty, so this also skips empty cells
            # exactly like the original truthiness check did.
            if not header or not cell:
                continue
            label = header.text.lower().strip()
            if any(key in label for key in branche_labels):
                branche = cell.text.strip()
            if "umsatz" in label:
                raw_revenue = cell.text.strip()
                hit = re.search(r"(\d+[.,]?\d*)", raw_revenue)
                if hit:
                    umsatz = hit.group(1).replace(",", ".")
    return branche, umsatz
# === VALIDIERUNG DES WIKIPEDIA-ARTIKELS ===
def is_valid_wiki_article(content, name, domain_key):
    """Heuristically decide whether article HTML plausibly matches a company.

    Awards one point if any whitespace-separated token of *name* appears in
    *content* (case-insensitive) and one point if *domain_key* appears.
    Accepts the article when at least one point was scored.
    """
    haystack = content.lower()
    points = 0
    if any(token in haystack for token in name.lower().split()):
        points += 1
    if domain_key and domain_key.lower() in haystack:
        points += 1
    return points >= 1
# === WIKIPEDIA DATEN LADEN ===
# NOTE(review): the span below is a diff rendering that interleaves the REMOVED
# and the ADDED version of this function (leading indentation was stripped and
# both the old `results=3` and new `results=5` search calls appear, along with
# an embedded `@@` hunk header).  It is not valid Python as displayed; the
# lines are kept byte-identical and only annotated.
def get_wikipedia_data(name, website_hint=""):
begriffe = [name.strip(), " ".join(name.split()[:2])]
@@ -60,35 +88,20 @@ def get_wikipedia_data(name, website_hint=""):
# Derive an extra search term from the website hostname, e.g. "acme" from
# "https://acme.de".
parts = website_hint.replace("https://", "").replace("http://", "").split(".")
if len(parts) > 1:
begriffe.append(parts[0])
# ADDED line: domain key used later by is_valid_wiki_article().
domain_key = extract_domain_key(website_hint)
for suchbegriff in begriffe:
# REMOVED (old: 3 results) vs ADDED (new: 5 results) search call pair:
results = wikipedia.search(suchbegriff, results=3)
results = wikipedia.search(suchbegriff, results=5)
for title in results:
try:
# REMOVED: old page fetch with title-prefix filter.
page = wikipedia.page(title)
if name.lower().split()[0] not in page.title.lower():
# ADDED: new page fetch without auto-suggest, validated via HTML content.
page = wikipedia.page(title, auto_suggest=False)
html_content = requests.get(page.url, timeout=10).text
if not is_valid_wiki_article(html_content, name, domain_key):
continue
# REMOVED: old flow stored url before fetching the HTML.
url = page.url
html_content = requests.get(url, timeout=10).text
soup = BeautifulSoup(html_content, 'html.parser')
# REMOVED block: old inline infobox parsing (exact class match, German
# label checks) replaced by the extract_infobox_fields() helper below.
infobox = soup.find("table", {"class": "infobox"})
branche = umsatz = ""
if infobox:
for row in infobox.find_all("tr"):
th, td = row.find("th"), row.find("td")
if not th or not td:
continue
if "Branche" in th.text:
branche = td.text.strip()
if "Umsatz" in th.text:
umsatz_raw = td.text.strip()
match = re.search(r"(\d+[.,]?\d*)", umsatz_raw)
if match:
umsatz = match.group(1).replace(",", ".")
# REMOVED: old category fallback for missing Branche (per commit message,
# the fallback was dropped in this change).
if not branche:
cats = page.categories
branche = cats[0] if cats else "k.A."
return url, branche or "k.A.", umsatz or "k.A."
# ADDED: new flow delegates to the helper and only returns when it found data.
branche, umsatz = extract_infobox_fields(soup)
if branche or umsatz:
return page.url, branche or "k.A.", umsatz or "k.A."
except:
continue
return "", "k.A.", "k.A."
@@ -124,6 +137,9 @@ print("\n✅ Wikipedia-Auswertung abgeschlossen")
# === SCHRITT 2: GPT-BEWERTUNG ===
def classify_company(row, wikipedia_url=""):
user_prompt = {