🔁 Refactor Wikipedia-Parser to stable v1.0.7 logic
- Reverted to the earlier, reliable search strategy. - Restored infobox-first extraction. - Fall back to categories only when the infobox branch is missing. - Kept the timestamp and version output unchanged.
This commit is contained in:
@@ -14,7 +14,7 @@ from datetime import datetime
|
|||||||
from difflib import SequenceMatcher
|
from difflib import SequenceMatcher
|
||||||
|
|
||||||
# === CONFIGURATION ===
VERSION = "1.0.7"  # script version written to the output (reverted to the stable 1.0.7 logic)
LANG = "de"  # Wikipedia language edition used for all lookups
CREDENTIALS = "service_account.json"  # Google service-account key file — presumably relative to CWD; verify
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"  # target Google Sheet
@@ -40,68 +40,39 @@ print(f"Starte bei Zeile {start+1}")
|
|||||||
|
|
||||||
# Point the wikipedia client at the configured language edition (LANG = "de").
wikipedia.set_lang(LANG)
||||||
|
|
||||||
# === DOMAIN KEY ===
def extract_domain_key(url):
    """Return the brand part of a URL's host, e.g. "heimbach" for
    "https://www.heimbach.com/about".

    Strips the scheme, keeps only the host, drops a leading "www." label,
    and returns the first remaining dot-separated label.  Returns "" for
    empty/None input or a host without a dot (e.g. "localhost").

    Fix: the previous version returned "www" for www-prefixed hosts, which
    is useless as a match key (and contradicted the intent documented in
    get_wikipedia_data's old inline comment).
    """
    if not url:
        return ""
    host = url.replace("https://", "").replace("http://", "").split("/")[0]
    labels = host.split(".")
    # "www" is a generic prefix, never the brand — skip it.
    if labels and labels[0].lower() == "www":
        labels = labels[1:]
    return labels[0] if len(labels) > 1 else ""
|
|
||||||
|
|
||||||
# === SIMILARITY CHECK ===
def similarity(a, b):
    """Case-insensitive similarity of two strings as a ratio in [0, 1]."""
    matcher = SequenceMatcher(None, a.lower(), b.lower())
    return matcher.ratio()
|
|
||||||
|
|
||||||
# === READ FIELDS FROM THE INFOBOX ===
def extract_infobox_fields(soup):
    """Extract industry ("Branche") and revenue ("Umsatz") from a parsed
    Wikipedia article.

    Scans the first table whose class contains "infobox"; for each row with
    both a header and a data cell, matches the header text against the known
    German labels.  The revenue keeps only the first number found, with a
    decimal comma normalized to a dot.  Either value may come back as "".
    """
    industry, revenue = "", ""
    table = soup.find("table", class_=lambda c: c and "infobox" in c)
    if not table:
        return industry, revenue
    for row in table.find_all("tr"):
        header, cell = row.find("th"), row.find("td")
        if not header or not cell:
            continue
        label = header.text.lower().strip()
        if any(key in label for key in ["branche", "tätigkeitsfeld", "bereich"]):
            industry = cell.text.strip()
        if "umsatz" in label:
            raw_value = cell.text.strip()
            number = re.search(r"(\d+[.,]?\d*)", raw_value)
            if number:
                revenue = number.group(1).replace(",", ".")
    return industry, revenue
|
|
||||||
|
|
||||||
# === VALIDATION OF THE WIKIPEDIA ARTICLE ===
def is_valid_wiki_article(content, name, domain_key):
    """Heuristic plausibility check: does the article text mention the
    company?

    Accepts the article when at least one of these holds:
      * any whitespace-separated token of *name* occurs in the text, or
      * a non-empty *domain_key* occurs in the text.
    All comparisons are case-insensitive.
    """
    haystack = content.lower()
    name_hit = any(token in haystack for token in name.lower().split())
    domain_hit = bool(domain_key) and domain_key.lower() in haystack
    return name_hit or domain_hit
|
|
||||||
|
|
||||||
# === LOAD WIKIPEDIA DATA ===
def get_wikipedia_data(name, website_hint=""):
    """Look up a company on Wikipedia and return (url, branche, umsatz).

    Builds several search terms — the full name, its first two words, and
    (when *website_hint* is given) the first host label — then, for each
    search hit, validates the article with is_valid_wiki_article() and
    extracts industry/revenue infobox-first via extract_infobox_fields().
    Missing values are reported as "k.A."; when no usable article is found
    the result is ("", "k.A.", "k.A.").
    """
    begriffe = [name.strip(), " ".join(name.split()[:2])]
    if website_hint:
        parts = website_hint.replace("https://", "").replace("http://", "").split(".")
        if len(parts) > 1:
            begriffe.append(parts[0])  # e.g. "heimbach" from "heimbach.com"
    domain_key = extract_domain_key(website_hint)

    # De-duplicate (a name of <= 2 words yields the same term twice) and drop
    # empty terms so we do not burn API calls on useless searches.
    for suchbegriff in dict.fromkeys(t for t in begriffe if t):
        results = wikipedia.search(suchbegriff, results=5)
        for title in results:
            try:
                # auto_suggest=False: keep the exact title the search returned.
                page = wikipedia.page(title, auto_suggest=False)
                html_content = requests.get(page.url, timeout=10).text
                if not is_valid_wiki_article(html_content, name, domain_key):
                    continue
                soup = BeautifulSoup(html_content, 'html.parser')
                branche, umsatz = extract_infobox_fields(soup)
                if branche or umsatz:
                    return page.url, branche or "k.A.", umsatz or "k.A."
            except Exception:
                # Disambiguation/page errors or network hiccups: try the next
                # candidate.  (Was a bare `except:`, which also swallowed
                # KeyboardInterrupt and SystemExit.)
                continue
    return "", "k.A.", "k.A."
|
||||||
@@ -140,6 +111,7 @@ print("\n✅ Wikipedia-Auswertung abgeschlossen")
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# === SCHRITT 2: GPT-BEWERTUNG ===
|
# === SCHRITT 2: GPT-BEWERTUNG ===
|
||||||
def classify_company(row, wikipedia_url=""):
|
def classify_company(row, wikipedia_url=""):
|
||||||
user_prompt = {
|
user_prompt = {
|
||||||
|
|||||||
Reference in New Issue
Block a user