Wiki Debugging
Wikipedia-Validierung über validate_wikipedia_page ergänzt (stellt sicher, dass Seiteninhalt oder Domain zum Firmennamen passen). Fallback-Parsing parse_infobox_with_fallback() eingebaut für robustere Extraktion von Branche/Umsatz. Branchenbegriff-Erkennung verbessert (Synonyme wie „Tätigkeitsfeld“, „Industriezweig“ etc.). Bedingte Auswertung verbessert: Wenn kein Wikipedia-Link → "k.A." für Branche/Umsatz. Selektor-basierte Extraktion entfernt, da sie fehleranfällig war → durch flexible Infobox-Logik ersetzt.
This commit is contained in:
@@ -47,19 +47,28 @@ def extract_domain_key(url):
|
|||||||
return parts[0] if len(parts) > 1 else ""
|
return parts[0] if len(parts) > 1 else ""
|
||||||
|
|
||||||
# === INFOBOX-PARSING ===
|
# === INFOBOX-PARSING ===
|
||||||
def parse_infobox_with_selector(soup):
|
def parse_infobox_with_fallback(soup):
    """Extract industry ("Branche") and revenue ("Umsatz") from a German
    Wikipedia company infobox.

    Walks every ``<tr>`` of the first ``table.infobox`` and matches the
    row header against known industry-label synonyms; revenue rows are
    only parsed when the value mentions millions ("Mio").

    Args:
        soup: BeautifulSoup document of the Wikipedia article page.

    Returns:
        tuple[str, str]: ``(branche, umsatz)``. Either element is
        ``"k.A."`` (keine Angabe) when it could not be extracted.
        ``umsatz`` is a plain decimal string in millions, e.g. "1234.5".
    """
    branche = "k.A."
    umsatz = "k.A."

    infobox = soup.find("table", class_="infobox")
    if not infobox:
        # No infobox on the page -> nothing to extract.
        return branche, umsatz

    for row in infobox.find_all("tr"):
        th = row.find("th")
        td = row.find("td")
        # Rows without a header/value pair (e.g. image rows) carry no data.
        if not th or not td:
            continue

        label = th.get_text(strip=True).lower()
        value = td.get_text(strip=True)

        # Industry: German infoboxes use several synonymous labels.
        if any(b in label for b in ["branche", "tätigkeitsfeld", "industriezweig", "wirtschaftszweig"]):
            branche = value

        # Revenue: only figures quoted in millions are handled.
        if "umsatz" in label and "mio" in value.lower():
            # IGNORECASE so the extraction agrees with the lowercase
            # "mio" guard above (original regex was case-sensitive and
            # silently missed e.g. "5 mio").
            match = re.search(r"(\d+[\d.,]*)\s*Mio", value, re.IGNORECASE)
            if match:
                raw = match.group(1)
                # German number format: "." is the thousands separator,
                # "," the decimal mark. Replacing only "," (as before)
                # turned "1.234,5" into the invalid "1.234.5"; strip the
                # thousands dots first, but only when a decimal comma is
                # present so dot-decimal values keep their old behavior.
                if "," in raw:
                    raw = raw.replace(".", "").replace(",", ".")
                umsatz = raw

    return branche, umsatz
|
|
||||||
|
|
||||||
# === WIKIPEDIA DATEN ===
|
# === WIKIPEDIA DATEN ===
|
||||||
WHITELIST_KATEGORIEN = [
|
WHITELIST_KATEGORIEN = [
|
||||||
@@ -69,6 +78,13 @@ WHITELIST_KATEGORIEN = [
|
|||||||
"logistik", "automobil"
|
"logistik", "automobil"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def validate_wikipedia_page(content, name, domain_key):
    """Check that a Wikipedia page plausibly belongs to the given company.

    A page is accepted when one of the first two words of the company
    name appears in the page text, or the company's website domain key
    occurs in it.

    Args:
        content: Full plain-text content of the Wikipedia page.
        name: Company name; only its first two whitespace-separated
            fragments are checked (trailing legal-form suffixes etc.
            are ignored).
        domain_key: Key derived from the company website; may be empty.

    Returns:
        bool: True when the page matches the company, else False.
        (The previous version could return "" instead of False when
        domain_key was empty; callers only test truthiness, so
        normalizing with bool() is backward-compatible.)
    """
    haystack = content.lower()  # lowercase once, not once per fragment
    name_fragments = name.lower().split()[:2]
    if any(frag in haystack for frag in name_fragments):
        return True
    # bool() normalizes the falsy-but-non-bool empty domain_key case.
    return bool(domain_key and domain_key.lower() in haystack)
|
||||||
|
|
||||||
def get_wikipedia_data(name, website_hint=""):
|
def get_wikipedia_data(name, website_hint=""):
|
||||||
domain_key = extract_domain_key(website_hint)
|
domain_key = extract_domain_key(website_hint)
|
||||||
search_terms = [name, domain_key] if domain_key else [name]
|
search_terms = [name, domain_key] if domain_key else [name]
|
||||||
@@ -82,9 +98,10 @@ def get_wikipedia_data(name, website_hint=""):
|
|||||||
try:
|
try:
|
||||||
page = wikipedia.page(title, auto_suggest=False)
|
page = wikipedia.page(title, auto_suggest=False)
|
||||||
html = requests.get(page.url, timeout=10).text
|
html = requests.get(page.url, timeout=10).text
|
||||||
if name.split()[0].lower() in page.content.lower() or (domain_key and domain_key.lower() in html.lower()):
|
if not validate_wikipedia_page(page.content, name, domain_key):
|
||||||
|
continue
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
branche, umsatz = parse_infobox_with_selector(soup)
|
branche, umsatz = parse_infobox_with_fallback(soup)
|
||||||
if not branche or branche == "k.A.":
|
if not branche or branche == "k.A.":
|
||||||
for category in page.categories:
|
for category in page.categories:
|
||||||
if any(kw in category.lower() for kw in WHITELIST_KATEGORIEN):
|
if any(kw in category.lower() for kw in WHITELIST_KATEGORIEN):
|
||||||
@@ -121,6 +138,7 @@ for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
|
|||||||
print("\n✅ Wikipedia-Auswertung abgeschlossen")
|
print("\n✅ Wikipedia-Auswertung abgeschlossen")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# === SCHRITT 2: GPT-BEWERTUNG ===
|
# === SCHRITT 2: GPT-BEWERTUNG ===
|
||||||
def classify_company(row, wikipedia_url=""):
|
def classify_company(row, wikipedia_url=""):
|
||||||
user_prompt = {
|
user_prompt = {
|
||||||
|
|||||||
Reference in New Issue
Block a user