From 0d3e320f85f3f29eadb77d161c634ad791551df6 Mon Sep 17 00:00:00 2001 From: Floke Date: Mon, 31 Mar 2025 06:46:48 +0000 Subject: [PATCH] feat(wikipedia): Verbesserte Wikipedia-Erkennung und Infobox-Parsing (v1.0.5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Domain-Key-Extraktion zur besseren Treffererkennung - Scoring-Mechanismus zur Auswahl des besten Wikipedia-Artikels - Infobox-Parser auf XPath-Abfragen (lxml) umgestellt - Validierung durch Titel-, Inhalts-, Domain- und Ähnlichkeitsprüfung - Versionierung der Ergebnisse mit Spaltenausgabe --- brancheneinstufung.py | 45 +++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/brancheneinstufung.py b/brancheneinstufung.py index 42242d29..ac1f31f7 100644 --- a/brancheneinstufung.py +++ b/brancheneinstufung.py @@ -12,9 +12,10 @@ from bs4 import BeautifulSoup from oauth2client.service_account import ServiceAccountCredentials from datetime import datetime from difflib import SequenceMatcher +from lxml import html as lh # === KONFIGURATION === -VERSION = "1.0.2-wiki-only" +VERSION = "1.0.5-xpath" LANG = "de" CREDENTIALS = "service_account.json" SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo" @@ -48,28 +49,24 @@ def extract_domain_key(url): parts = clean_url.split(".") return parts[0] if len(parts) > 1 else "" -# === INFOBOX-PARSING === -def parse_infobox_with_fallback(soup): - infobox = soup.find("table", class_="infobox") +# === INFOBOX-PARSING MIT XPATH === +def parse_infobox_xpath(html_text): + doc = lh.fromstring(html_text) branche = "k.A." umsatz = "k.A." 
- - if infobox: - for row in infobox.find_all("tr"): - th = row.find("th") - td = row.find("td") - if not th or not td: - continue - label = th.get_text(strip=True).lower() - value = td.get_text(strip=True) - - if any(b in label for b in ["branche", "tätigkeitsfeld", "industriezweig", "wirtschaftszweig"]): - branche = value - if "umsatz" in label and "mio" in value.lower(): - match = re.search(r"(\d+[\d.,]*)\\s*Mio", value) + try: + branche_xpath = doc.xpath("//table[contains(@class, 'infobox')]//tr[th[contains(text(), 'Branche')]]/td/text()") + umsatz_xpath = doc.xpath("//table[contains(@class, 'infobox')]//tr[th[contains(translate(text(),'UMSATZ','umsatz'), 'umsatz')]]/td/text()") + if branche_xpath: + branche = branche_xpath[0].strip() + if umsatz_xpath: + umsatz_raw = umsatz_xpath[0].strip() + if "mio" in umsatz_raw.lower() or "millionen" in umsatz_raw.lower(): + match = re.search(r"(\d+[.,]?\d*)", umsatz_raw) if match: umsatz = match.group(1).replace(",", ".") - + except: + pass return branche, umsatz # === WIKIPEDIA DATEN === @@ -88,7 +85,7 @@ def validate_wikipedia_page(content, title, name, domain_key): title_check = any(frag in title.lower() for frag in name_fragments) content_check = any(frag in content.lower() for frag in name_fragments) domain_check = domain_key and domain_key.lower() in content.lower() - sim_check = similarity(name, title) > 0.6 + sim_check = similarity(name, title) > 0.5 return (title_check or content_check or domain_check or sim_check) def get_wikipedia_data(name, website_hint=""): @@ -109,11 +106,10 @@ def get_wikipedia_data(name, website_hint=""): for title in results: try: page = wikipedia.page(title, auto_suggest=False) - html = requests.get(page.url, timeout=10).text + html_text = requests.get(page.url, timeout=10).text if not validate_wikipedia_page(page.content, title, name, domain_key): continue - soup = BeautifulSoup(html, "html.parser") - branche, umsatz = parse_infobox_with_fallback(soup) + branche, umsatz = 
parse_infobox_xpath(html_text) score = similarity(name, title) if branche != "k.A.": score += 0.1 @@ -158,6 +154,9 @@ print("\n✅ Wikipedia-Auswertung abgeschlossen") + + + # === SCHRITT 2: GPT-BEWERTUNG === def classify_company(row, wikipedia_url=""): user_prompt = {