fix(wikipedia): Robustere Extraktion durch angepasste Infobox-Auswertung mit Synonym-Feldern
- XPath entfernt, da BeautifulSoup in Kombination mit dem Infobox-Selektor zuverlässiger ist
- Branche wird nur noch aus der Infobox gelesen; Kategorien dienen nicht mehr als Fallback
- Verbesserte Ausgabe: Werte werden nur bei gefundenem Wikipedia-Link eingetragen
- Versionskennung 1.0.6 eingeführt (Spalte Q)
This commit is contained in:
@@ -12,10 +12,9 @@ from bs4 import BeautifulSoup
|
|||||||
from oauth2client.service_account import ServiceAccountCredentials
|
from oauth2client.service_account import ServiceAccountCredentials
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from difflib import SequenceMatcher
|
from difflib import SequenceMatcher
|
||||||
from lxml import html as lh
|
|
||||||
|
|
||||||
# === KONFIGURATION ===
# Script version tag; written to the sheet (column Q per the commit message)
# so output rows can be traced back to the code revision that produced them.
VERSION = "1.0.7"
# Wikipedia language edition used for search and page lookups.
LANG = "de"
# Google service-account key file for Sheets API access.
CREDENTIALS = "service_account.json"
# Target Google spreadsheet to read companies from and write results to.
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
@@ -53,6 +52,35 @@ def extract_domain_key(url):
|
|||||||
def similarity(a, b):
    """Return the case-insensitive fuzzy-match ratio of two strings, in [0.0, 1.0]."""
    lowered_a, lowered_b = a.lower(), b.lower()
    return SequenceMatcher(None, lowered_a, lowered_b).ratio()
||||||
|
|
||||||
|
# === READ FIELDS FROM THE WIKIPEDIA INFOBOX ===
def extract_infobox_fields(soup):
    """Extract industry ("Branche") and revenue ("Umsatz") from a Wikipedia infobox.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a (German) Wikipedia article page.

    Returns
    -------
    tuple[str, str]
        ``(branche, umsatz)`` — empty strings when the infobox or the
        respective field is missing.  ``umsatz`` is the first number found
        in the revenue cell, with a decimal comma normalised to a dot
        (e.g. "1,5 Mrd. EUR" -> "1.5").
    """
    branche = umsatz = ""
    # Match any table whose class attribute contains "infobox" — de.wikipedia
    # uses several class variants, so a substring check via callable is used
    # instead of an exact class lookup.
    infobox = soup.find("table", class_=lambda c: c and "infobox" in c)
    if infobox:
        for row in infobox.find_all("tr"):
            th, td = row.find("th"), row.find("td")
            if not th or not td:
                # Row has no label/value pair (e.g. header or image rows).
                continue
            th_text = th.text.lower().strip()
            # Synonym headers that all denote the company's industry.
            if any(key in th_text for key in ["branche", "tätigkeitsfeld", "bereich"]):
                branche = td.text.strip()
            if "umsatz" in th_text:
                umsatz_raw = td.text.strip()
                # Grab the first number, optionally with a decimal separator.
                match = re.search(r"(\d+[.,]?\d*)", umsatz_raw)
                if match:
                    umsatz = match.group(1).replace(",", ".")
    return branche, umsatz
||||||
|
|
||||||
|
# === VALIDATE THAT A WIKIPEDIA ARTICLE MATCHES THE COMPANY ===
def is_valid_wiki_article(content, name, domain_key):
    """Heuristically check whether a page's HTML belongs to company *name*.

    Scores one point if any word of the company name occurs in the page and
    one point if the website's domain key occurs; a single point suffices.

    Parameters
    ----------
    content : str
        Raw HTML (or text) of the candidate Wikipedia page.
    name : str
        Company name as listed in the sheet.
    domain_key : str
        Key derived from the company website; may be empty, in which case
        only the name check applies.

    Returns
    -------
    bool
        True when at least one of the two signals matches.
    """
    # Lowercase the (potentially large) page content ONCE — the original
    # re-lowercased it for every name part inside the generator.
    haystack = content.lower()
    name_parts = name.lower().split()
    score = 0
    if any(part in haystack for part in name_parts):
        score += 1
    if domain_key and domain_key.lower() in haystack:
        score += 1
    return score >= 1
|
|
||||||
# === WIKIPEDIA DATEN LADEN ===
|
# === WIKIPEDIA DATEN LADEN ===
|
||||||
def get_wikipedia_data(name, website_hint=""):
|
def get_wikipedia_data(name, website_hint=""):
|
||||||
begriffe = [name.strip(), " ".join(name.split()[:2])]
|
begriffe = [name.strip(), " ".join(name.split()[:2])]
|
||||||
@@ -60,35 +88,20 @@ def get_wikipedia_data(name, website_hint=""):
|
|||||||
parts = website_hint.replace("https://", "").replace("http://", "").split(".")
|
parts = website_hint.replace("https://", "").replace("http://", "").split(".")
|
||||||
if len(parts) > 1:
|
if len(parts) > 1:
|
||||||
begriffe.append(parts[0])
|
begriffe.append(parts[0])
|
||||||
|
domain_key = extract_domain_key(website_hint)
|
||||||
|
|
||||||
for suchbegriff in begriffe:
|
for suchbegriff in begriffe:
|
||||||
results = wikipedia.search(suchbegriff, results=3)
|
results = wikipedia.search(suchbegriff, results=5)
|
||||||
for title in results:
|
for title in results:
|
||||||
try:
|
try:
|
||||||
page = wikipedia.page(title)
|
page = wikipedia.page(title, auto_suggest=False)
|
||||||
if name.lower().split()[0] not in page.title.lower():
|
html_content = requests.get(page.url, timeout=10).text
|
||||||
|
if not is_valid_wiki_article(html_content, name, domain_key):
|
||||||
continue
|
continue
|
||||||
url = page.url
|
|
||||||
html_content = requests.get(url, timeout=10).text
|
|
||||||
soup = BeautifulSoup(html_content, 'html.parser')
|
soup = BeautifulSoup(html_content, 'html.parser')
|
||||||
infobox = soup.find("table", {"class": "infobox"})
|
branche, umsatz = extract_infobox_fields(soup)
|
||||||
branche = umsatz = ""
|
if branche or umsatz:
|
||||||
if infobox:
|
return page.url, branche or "k.A.", umsatz or "k.A."
|
||||||
for row in infobox.find_all("tr"):
|
|
||||||
th, td = row.find("th"), row.find("td")
|
|
||||||
if not th or not td:
|
|
||||||
continue
|
|
||||||
if "Branche" in th.text:
|
|
||||||
branche = td.text.strip()
|
|
||||||
if "Umsatz" in th.text:
|
|
||||||
umsatz_raw = td.text.strip()
|
|
||||||
match = re.search(r"(\d+[.,]?\d*)", umsatz_raw)
|
|
||||||
if match:
|
|
||||||
umsatz = match.group(1).replace(",", ".")
|
|
||||||
if not branche:
|
|
||||||
cats = page.categories
|
|
||||||
branche = cats[0] if cats else "k.A."
|
|
||||||
return url, branche or "k.A.", umsatz or "k.A."
|
|
||||||
except:
|
except:
|
||||||
continue
|
continue
|
||||||
return "", "k.A.", "k.A."
|
return "", "k.A.", "k.A."
|
||||||
@@ -124,6 +137,9 @@ print("\n✅ Wikipedia-Auswertung abgeschlossen")
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# === SCHRITT 2: GPT-BEWERTUNG ===
|
# === SCHRITT 2: GPT-BEWERTUNG ===
|
||||||
def classify_company(row, wikipedia_url=""):
|
def classify_company(row, wikipedia_url=""):
|
||||||
user_prompt = {
|
user_prompt = {
|
||||||
|
|||||||
Reference in New Issue
Block a user