🔁 Refactor Wikipedia-Parser to stable v1.0.7 logic

- reverted to earlier reliable search strategy
- restored infobox-first extraction
- ensured fallback to categories only if infobox branch is missing
- maintained timestamp and version output
This commit is contained in:
2025-03-31 09:00:44 +00:00
parent 21415698e0
commit c27f2cdca2

View File

@@ -14,7 +14,7 @@ from datetime import datetime
from difflib import SequenceMatcher
# === KONFIGURATION ===
# Script version reported in the sheet output.
# NOTE(review): the duplicated back-to-back assignment ("1.0.7" immediately
# overwritten by "1.0.8") was a merge/diff leftover; only the net value is kept.
VERSION = "1.0.8"
LANG = "de"
CREDENTIALS = "service_account.json"
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
@@ -40,68 +40,39 @@ print(f"Starte bei Zeile {start+1}")
wikipedia.set_lang(LANG)
# === DOMAIN SCHLÜSSEL ===
def extract_domain_key(url):
    """Return the distinctive hostname label of *url*, or "" if none.

    Strips the scheme and any path, then takes the first label of the
    hostname — skipping a leading generic "www" so that e.g.
    "https://www.heimbach.com/x" yields "heimbach", not the useless "www"
    (the key is later matched against article text, where "www" would
    produce false positives).

    Returns "" for empty input or a bare single-label host (no dot).
    """
    if not url:
        return ""
    # Drop scheme and path: keep only the hostname part.
    host = url.replace("https://", "").replace("http://", "").split("/")[0]
    parts = host.split(".")
    if len(parts) < 2:
        return ""
    # Skip the generic "www" label when a more specific one follows.
    if parts[0].lower() == "www" and len(parts) > 2:
        return parts[1]
    return parts[0]
# === ÄHNLICHKEITSPRÜFUNG ===
def similarity(a, b):
    """Case-insensitive similarity ratio of two strings, in [0.0, 1.0]."""
    left = a.lower()
    right = b.lower()
    return SequenceMatcher(None, left, right).ratio()
# === INFOS AUS INFOBOX LESEN ===
def extract_infobox_fields(soup):
    """Read industry ("Branche") and revenue ("Umsatz") from a Wikipedia infobox.

    *soup* is a parsed article page (BeautifulSoup). Returns a
    (branche, umsatz) pair of strings; either is "" when not found.
    The revenue is reduced to its first numeric token with a decimal
    point (German comma converted to a dot).
    """
    branche, umsatz = "", ""
    infobox = soup.find("table", class_=lambda c: c and "infobox" in c)
    if infobox is None:
        return branche, umsatz
    # Header labels that identify the industry row.
    industry_labels = ("branche", "tätigkeitsfeld", "bereich")
    for row in infobox.find_all("tr"):
        header = row.find("th")
        cell = row.find("td")
        if not header or not cell:
            continue
        label = header.text.lower().strip()
        if any(key in label for key in industry_labels):
            branche = cell.text.strip()
        if "umsatz" in label:
            match = re.search(r"(\d+[.,]?\d*)", cell.text.strip())
            if match:
                umsatz = match.group(1).replace(",", ".")
    return branche, umsatz
# === VALIDIERUNG DES WIKIPEDIA-ARTIKELS ===
def is_valid_wiki_article(content, name, domain_key):
    """Heuristic relevance check for a fetched article.

    Accepts the article when its text (case-insensitive) mentions at
    least one word of the company *name* or the *domain_key*.
    """
    haystack = content.lower()
    hits = 0
    # Any single word of the company name counts once.
    for part in name.lower().split():
        if part in haystack:
            hits += 1
            break
    # The website's domain key counts once more.
    if domain_key and domain_key.lower() in haystack:
        hits += 1
    return hits >= 1
# === WIKIPEDIA DATEN LADEN ===
def get_wikipedia_data(name, website_hint=""):
    """Look up a company on Wikipedia and extract URL, industry, revenue.

    Tries several search terms (full name, first two words, and — when a
    website hint is given — the distinctive domain label). For each
    candidate article whose title contains the first word of *name*, the
    infobox is scanned for "Branche" and "Umsatz"; when no industry is
    found there, the first article category is used as a fallback.

    NOTE(review): this block arrived as a merge-garbled diff with two
    interleaved implementations; it has been reconstructed to the
    "restored v1.0.7" variant described by the commit message
    (simple search, infobox-first, categories fallback).

    Returns (url, branche, umsatz); missing values are "" / "k.A.".
    """
    begriffe = [name.strip(), " ".join(name.split()[:2])]
    if website_hint:
        host = website_hint.replace("https://", "").replace("http://", "").split("/")[0]
        parts = host.split(".")
        if len(parts) > 1:
            # e.g. "heimbach" from "www.heimbach.com" — skip a leading "www",
            # which would otherwise be a useless search term.
            if parts[0].lower() == "www" and len(parts) > 2:
                begriffe.append(parts[1])
            else:
                begriffe.append(parts[0])
    for suchbegriff in begriffe:
        results = wikipedia.search(suchbegriff, results=3)
        for title in results:
            try:
                page = wikipedia.page(title)
                # Keep only articles whose title mentions the first word
                # of the company name — a cheap relevance filter.
                if name.lower().split()[0] not in page.title.lower():
                    continue
                url = page.url
                # Timeout added so one slow page cannot hang the whole run.
                html = requests.get(url, timeout=10).text
                soup = BeautifulSoup(html, "html.parser")
                infobox = soup.find("table", {"class": "infobox"})
                branche = umsatz = ""
                if infobox:
                    for row in infobox.find_all("tr"):
                        th, td = row.find("th"), row.find("td")
                        if not th or not td:
                            continue
                        if "Branche" in th.text:
                            branche = td.text.strip()
                        if "Umsatz" in th.text:
                            umsatz = td.text.strip()
                # Fallback: first category only when the infobox gave no industry.
                if not branche:
                    cats = page.categories
                    branche = cats[0] if cats else "k.A."
                return url, branche or "k.A.", umsatz or "k.A."
            except Exception:
                # Disambiguation pages, missing articles and network errors
                # all just mean: try the next candidate.
                continue
    return "", "k.A.", "k.A."
@@ -140,6 +111,7 @@ print("\n✅ Wikipedia-Auswertung abgeschlossen")
# === SCHRITT 2: GPT-BEWERTUNG ===
def classify_company(row, wikipedia_url=""):
user_prompt = {