🔁 Refactor Wikipedia-Parser to stable v1.0.7 logic

- reverted to earlier reliable search strategy
- restored infobox-first extraction
- ensured fallback to categories only if the infobox "Branche" (industry) field is missing
- maintained timestamp and version output
This commit is contained in:
2025-03-31 09:00:44 +00:00
parent 21415698e0
commit c27f2cdca2

View File

from datetime import datetime
from difflib import SequenceMatcher

# === KONFIGURATION ===
# Final state after the revert: logic goes back to v1.0.7, only the
# version string advances to 1.0.8.
VERSION = "1.0.8"
LANG = "de"  # Wikipedia language edition used for all lookups
CREDENTIALS = "service_account.json"  # Google service-account key file
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
# Configure the wikipedia client for the German-language edition before any lookup.
wikipedia.set_lang(LANG)
# === DOMAIN SCHLÜSSEL ===
def extract_domain_key(url):
    """Return the distinctive first host label of a website URL.

    Example: "https://www.heimbach.com/en" -> "heimbach".
    Returns "" for empty input or a host without a dot.
    """
    if not url:
        return ""
    # Keep only the host: drop scheme and any path components.
    host = url.replace("https://", "").replace("http://", "").split("/")[0]
    parts = host.split(".")
    # Bug fix: a leading "www" is noise — previously "www.heimbach.com"
    # yielded "www" instead of the intended "heimbach".
    if parts and parts[0].lower() == "www":
        parts = parts[1:]
    return parts[0] if len(parts) > 1 else ""
# === ÄHNLICHKEITSPRÜFUNG ===
def similarity(a, b):
    """Case-insensitive similarity ratio between two strings (0.0 .. 1.0)."""
    left, right = a.lower(), b.lower()
    return SequenceMatcher(None, left, right).ratio()
# === INFOS AUS INFOBOX LESEN ===
def extract_infobox_fields(soup):
    """Extract industry ("Branche") and revenue ("Umsatz") from a Wikipedia infobox.

    Returns a (branche, umsatz) tuple of strings; either entry is "" when the
    corresponding row is absent or no infobox table exists in *soup*.
    """
    branche, umsatz = "", ""
    table = soup.find("table", class_=lambda c: c and "infobox" in c)
    if table is None:
        return branche, umsatz
    industry_keys = ("branche", "tätigkeitsfeld", "bereich")
    for row in table.find_all("tr"):
        header, cell = row.find("th"), row.find("td")
        if not header or not cell:
            continue
        label = header.text.lower().strip()
        if any(key in label for key in industry_keys):
            branche = cell.text.strip()
        if "umsatz" in label:
            # Pull the first number out of e.g. "1,2 Mrd. EUR" and normalise
            # the German decimal comma to a dot.
            number = re.search(r"(\d+[.,]?\d*)", cell.text.strip())
            if number:
                umsatz = number.group(1).replace(",", ".")
    return branche, umsatz
# === VALIDIERUNG DES WIKIPEDIA-ARTIKELS ===
def is_valid_wiki_article(content, name, domain_key):
    """Heuristic check that *content* plausibly belongs to the company *name*.

    One point when any word of the name occurs in the content, one point when
    the (non-empty) domain key occurs; at least one point counts as a match.
    """
    haystack = content.lower()
    score = 0
    if any(word in haystack for word in name.lower().split()):
        score += 1
    if domain_key and domain_key.lower() in haystack:
        score += 1
    return score >= 1
# === WIKIPEDIA DATEN LADEN ===
def get_wikipedia_data(name, website_hint=""):
    """Look up a company on Wikipedia and return (url, branche, umsatz).

    Search terms tried in order: the full name, the first two words of the
    name, and — when a website hint is given — the first meaningful host
    label (e.g. "heimbach" from "www.heimbach.com").  Industry ("Branche")
    and revenue ("Umsatz") are read from the article's infobox; when no
    Branche row is found, the first article category is used instead.
    Returns ("", "k.A.", "k.A.") when nothing usable is found.
    """
    begriffe = [name.strip(), " ".join(name.split()[:2])]
    if website_hint:
        host = website_hint.replace("https://", "").replace("http://", "").split("/")[0]
        labels = host.split(".")
        if labels and labels[0].lower() == "www":
            labels = labels[1:]  # "www" is never a useful search term
        if len(labels) > 1:
            begriffe.append(labels[0])  # e.g. "heimbach" from "www.heimbach.com"
    # Hoisted so an empty name cannot raise IndexError inside the loop.
    name_words = name.lower().split()
    for suchbegriff in begriffe:
        results = wikipedia.search(suchbegriff, results=3)
        for title in results:
            try:
                page = wikipedia.page(title)
                # Require the first word of the company name in the article title.
                if name_words and name_words[0] not in page.title.lower():
                    continue
                url = page.url
                # Bounded wait — the previous revision had no timeout and could hang.
                html = requests.get(url, timeout=10).text
                soup = BeautifulSoup(html, 'html.parser')
                infobox = soup.find("table", {"class": "infobox"})
                branche = umsatz = ""
                if infobox:
                    for row in infobox.find_all("tr"):
                        th, td = row.find("th"), row.find("td")
                        if not th or not td:
                            continue
                        if "Branche" in th.text:
                            branche = td.text.strip()
                        if "Umsatz" in th.text:
                            umsatz = td.text.strip()
                if not branche:
                    # Fallback only when the infobox Branche row is missing.
                    cats = page.categories
                    branche = cats[0] if cats else "k.A."
                return url, branche or "k.A.", umsatz or "k.A."
            except Exception:
                # Disambiguation pages, HTTP errors etc. — try the next candidate.
                continue
    return "", "k.A.", "k.A."
@@ -140,6 +111,7 @@ print("\n✅ Wikipedia-Auswertung abgeschlossen")
# === SCHRITT 2: GPT-BEWERTUNG === # === SCHRITT 2: GPT-BEWERTUNG ===
def classify_company(row, wikipedia_url=""): def classify_company(row, wikipedia_url=""):
user_prompt = { user_prompt = {