🔁 Refactor Wikipedia-Parser to stable v1.0.7 logic

- reverted to earlier reliable search strategy
- restored infobox-first extraction
- ensured fallback to categories only if infobox branch is missing
- maintained timestamp and version output
This commit is contained in:
2025-03-31 09:00:44 +00:00
parent 21415698e0
commit c27f2cdca2

View File

@@ -14,7 +14,7 @@ from datetime import datetime
from difflib import SequenceMatcher
# === KONFIGURATION ===
# Script version reported in the sheet output.
# NOTE(review): the duplicated back-to-back assignment ("1.0.7" immediately
# overwritten by "1.0.8") was a merge/diff leftover; only the net value is kept.
VERSION = "1.0.8"
LANG = "de"
CREDENTIALS = "service_account.json"
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
@@ -40,68 +40,39 @@ print(f"Starte bei Zeile {start+1}")
wikipedia.set_lang(LANG)
# === DOMAIN SCHLÜSSEL ===
def extract_domain_key(url):
    """Return the distinctive hostname label of *url*, or "" if none.

    Strips the scheme and any path, then takes the first label of the
    hostname — skipping a leading generic "www" so that e.g.
    "https://www.heimbach.com/x" yields "heimbach", not the useless "www"
    (the key is later matched against article text, where "www" would
    produce false positives).

    Returns "" for empty input or a bare single-label host (no dot).
    """
    if not url:
        return ""
    # Drop scheme and path: keep only the hostname part.
    host = url.replace("https://", "").replace("http://", "").split("/")[0]
    parts = host.split(".")
    if len(parts) < 2:
        return ""
    # Skip the generic "www" label when a more specific one follows.
    if parts[0].lower() == "www" and len(parts) > 2:
        return parts[1]
    return parts[0]
# === ÄHNLICHKEITSPRÜFUNG ===
def similarity(a, b):
    """Case-insensitive similarity ratio of two strings, in [0.0, 1.0]."""
    left = a.lower()
    right = b.lower()
    return SequenceMatcher(None, left, right).ratio()
# === INFOS AUS INFOBOX LESEN ===
def extract_infobox_fields(soup):
    """Read industry ("Branche") and revenue ("Umsatz") from a Wikipedia infobox.

    *soup* is a parsed article page (BeautifulSoup). Returns a
    (branche, umsatz) pair of strings; either is "" when not found.
    The revenue is reduced to its first numeric token with a decimal
    point (German comma converted to a dot).
    """
    branche, umsatz = "", ""
    infobox = soup.find("table", class_=lambda c: c and "infobox" in c)
    if infobox is None:
        return branche, umsatz
    # Header labels that identify the industry row.
    industry_labels = ("branche", "tätigkeitsfeld", "bereich")
    for row in infobox.find_all("tr"):
        header = row.find("th")
        cell = row.find("td")
        if not header or not cell:
            continue
        label = header.text.lower().strip()
        if any(key in label for key in industry_labels):
            branche = cell.text.strip()
        if "umsatz" in label:
            match = re.search(r"(\d+[.,]?\d*)", cell.text.strip())
            if match:
                umsatz = match.group(1).replace(",", ".")
    return branche, umsatz
# === VALIDIERUNG DES WIKIPEDIA-ARTIKELS ===
def is_valid_wiki_article(content, name, domain_key):
    """Heuristic relevance check for a fetched article.

    Accepts the article when its text (case-insensitive) mentions at
    least one word of the company *name* or the *domain_key*.
    """
    haystack = content.lower()
    hits = 0
    # Any single word of the company name counts once.
    for part in name.lower().split():
        if part in haystack:
            hits += 1
            break
    # The website's domain key counts once more.
    if domain_key and domain_key.lower() in haystack:
        hits += 1
    return hits >= 1
# === WIKIPEDIA DATEN LADEN ===
def get_wikipedia_data(name, website_hint=""):
    """Look up a company on Wikipedia and extract URL, industry, revenue.

    Tries several search terms (full name, first two words, and — when a
    website hint is given — the distinctive domain label). For each
    candidate article whose title contains the first word of *name*, the
    infobox is scanned for "Branche" and "Umsatz"; when no industry is
    found there, the first article category is used as a fallback.

    NOTE(review): this block arrived as a merge-garbled diff with two
    interleaved implementations; it has been reconstructed to the
    "restored v1.0.7" variant described by the commit message
    (simple search, infobox-first, categories fallback).

    Returns (url, branche, umsatz); missing values are "" / "k.A.".
    """
    begriffe = [name.strip(), " ".join(name.split()[:2])]
    if website_hint:
        host = website_hint.replace("https://", "").replace("http://", "").split("/")[0]
        parts = host.split(".")
        if len(parts) > 1:
            # e.g. "heimbach" from "www.heimbach.com" — skip a leading "www",
            # which would otherwise be a useless search term.
            if parts[0].lower() == "www" and len(parts) > 2:
                begriffe.append(parts[1])
            else:
                begriffe.append(parts[0])
    for suchbegriff in begriffe:
        results = wikipedia.search(suchbegriff, results=3)
        for title in results:
            try:
                page = wikipedia.page(title)
                # Keep only articles whose title mentions the first word
                # of the company name — a cheap relevance filter.
                if name.lower().split()[0] not in page.title.lower():
                    continue
                url = page.url
                # Timeout added so one slow page cannot hang the whole run.
                html = requests.get(url, timeout=10).text
                soup = BeautifulSoup(html, "html.parser")
                infobox = soup.find("table", {"class": "infobox"})
                branche = umsatz = ""
                if infobox:
                    for row in infobox.find_all("tr"):
                        th, td = row.find("th"), row.find("td")
                        if not th or not td:
                            continue
                        if "Branche" in th.text:
                            branche = td.text.strip()
                        if "Umsatz" in th.text:
                            umsatz = td.text.strip()
                # Fallback: first category only when the infobox gave no industry.
                if not branche:
                    cats = page.categories
                    branche = cats[0] if cats else "k.A."
                return url, branche or "k.A.", umsatz or "k.A."
            except Exception:
                # Disambiguation pages, missing articles and network errors
                # all just mean: try the next candidate.
                continue
    return "", "k.A.", "k.A."
@@ -140,6 +111,7 @@ print("\n✅ Wikipedia-Auswertung abgeschlossen")
# === SCHRITT 2: GPT-BEWERTUNG ===
def classify_company(row, wikipedia_url=""):
user_prompt = {