feat(wikipedia): Verbesserte Wikipedia-Erkennung und Infobox-Parsing (v1.0.4)
- Domain-Key-Extraktion zur besseren Treffererkennung
- Scoring-Mechanismus zur Auswahl des besten Wikipedia-Artikels
- Erweiterter Infobox-Parser mit Label-Synonymen
- Validierung durch Titel-, Inhalts-, Domain- und Ähnlichkeitsprüfung
- Versionierung der Ergebnisse mit Spaltenausgabe
This commit is contained in:
@@ -12,9 +12,10 @@ from bs4 import BeautifulSoup
|
|||||||
from oauth2client.service_account import ServiceAccountCredentials
|
from oauth2client.service_account import ServiceAccountCredentials
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from difflib import SequenceMatcher
|
from difflib import SequenceMatcher
|
||||||
|
from lxml import html as lh
|
||||||
|
|
||||||
# === KONFIGURATION ===
|
# === KONFIGURATION ===
|
||||||
VERSION = "1.0.2-wiki-only"
|
VERSION = "1.0.5-xpath"
|
||||||
LANG = "de"
|
LANG = "de"
|
||||||
CREDENTIALS = "service_account.json"
|
CREDENTIALS = "service_account.json"
|
||||||
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
|
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
|
||||||
@@ -48,28 +49,24 @@ def extract_domain_key(url):
|
|||||||
parts = clean_url.split(".")
|
parts = clean_url.split(".")
|
||||||
return parts[0] if len(parts) > 1 else ""
|
return parts[0] if len(parts) > 1 else ""
|
||||||
|
|
||||||
# === INFOBOX-PARSING ===
|
# === INFOBOX-PARSING MIT XPATH ===
|
||||||
def parse_infobox_with_fallback(soup):
|
def parse_infobox_xpath(html_text):
|
||||||
infobox = soup.find("table", class_="infobox")
|
doc = lh.fromstring(html_text)
|
||||||
branche = "k.A."
|
branche = "k.A."
|
||||||
umsatz = "k.A."
|
umsatz = "k.A."
|
||||||
|
try:
|
||||||
if infobox:
|
branche_xpath = doc.xpath("//table[contains(@class, 'infobox')]//tr[th[contains(text(), 'Branche')]]/td/text()")
|
||||||
for row in infobox.find_all("tr"):
|
umsatz_xpath = doc.xpath("//table[contains(@class, 'infobox')]//tr[th[contains(translate(text(),'UMSATZ','umsatz'), 'umsatz')]]/td/text()")
|
||||||
th = row.find("th")
|
if branche_xpath:
|
||||||
td = row.find("td")
|
branche = branche_xpath[0].strip()
|
||||||
if not th or not td:
|
if umsatz_xpath:
|
||||||
continue
|
umsatz_raw = umsatz_xpath[0].strip()
|
||||||
label = th.get_text(strip=True).lower()
|
if "mio" in umsatz_raw.lower() or "millionen" in umsatz_raw.lower():
|
||||||
value = td.get_text(strip=True)
|
match = re.search(r"(\d+[.,]?\d*)", umsatz_raw)
|
||||||
|
|
||||||
if any(b in label for b in ["branche", "tätigkeitsfeld", "industriezweig", "wirtschaftszweig"]):
|
|
||||||
branche = value
|
|
||||||
if "umsatz" in label and "mio" in value.lower():
|
|
||||||
match = re.search(r"(\d+[\d.,]*)\\s*Mio", value)
|
|
||||||
if match:
|
if match:
|
||||||
umsatz = match.group(1).replace(",", ".")
|
umsatz = match.group(1).replace(",", ".")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
return branche, umsatz
|
return branche, umsatz
|
||||||
|
|
||||||
# === WIKIPEDIA DATEN ===
|
# === WIKIPEDIA DATEN ===
|
||||||
@@ -88,7 +85,7 @@ def validate_wikipedia_page(content, title, name, domain_key):
|
|||||||
title_check = any(frag in title.lower() for frag in name_fragments)
|
title_check = any(frag in title.lower() for frag in name_fragments)
|
||||||
content_check = any(frag in content.lower() for frag in name_fragments)
|
content_check = any(frag in content.lower() for frag in name_fragments)
|
||||||
domain_check = domain_key and domain_key.lower() in content.lower()
|
domain_check = domain_key and domain_key.lower() in content.lower()
|
||||||
sim_check = similarity(name, title) > 0.6
|
sim_check = similarity(name, title) > 0.5
|
||||||
return (title_check or content_check or domain_check or sim_check)
|
return (title_check or content_check or domain_check or sim_check)
|
||||||
|
|
||||||
def get_wikipedia_data(name, website_hint=""):
|
def get_wikipedia_data(name, website_hint=""):
|
||||||
@@ -109,11 +106,10 @@ def get_wikipedia_data(name, website_hint=""):
|
|||||||
for title in results:
|
for title in results:
|
||||||
try:
|
try:
|
||||||
page = wikipedia.page(title, auto_suggest=False)
|
page = wikipedia.page(title, auto_suggest=False)
|
||||||
html = requests.get(page.url, timeout=10).text
|
html_text = requests.get(page.url, timeout=10).text
|
||||||
if not validate_wikipedia_page(page.content, title, name, domain_key):
|
if not validate_wikipedia_page(page.content, title, name, domain_key):
|
||||||
continue
|
continue
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
branche, umsatz = parse_infobox_xpath(html_text)
|
||||||
branche, umsatz = parse_infobox_with_fallback(soup)
|
|
||||||
score = similarity(name, title)
|
score = similarity(name, title)
|
||||||
if branche != "k.A.":
|
if branche != "k.A.":
|
||||||
score += 0.1
|
score += 0.1
|
||||||
@@ -158,6 +154,9 @@ print("\n✅ Wikipedia-Auswertung abgeschlossen")
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# === SCHRITT 2: GPT-BEWERTUNG ===
|
# === SCHRITT 2: GPT-BEWERTUNG ===
|
||||||
def classify_company(row, wikipedia_url=""):
|
def classify_company(row, wikipedia_url=""):
|
||||||
user_prompt = {
|
user_prompt = {
|
||||||
|
|||||||
Reference in New Issue
Block a user