Verbesserte Wikipedia-Suche:
- Titelprüfung ergänzt: Namensfragmente müssen im Seitentitel vorkommen
- Validierungsmethode erweitert: Kombiniert Titel- und Inhaltsprüfung
- Präzisere Ergebnis-Auswahl durch Matching-Score
- Blacklist bewusst NICHT verwendet
- Leere oder fehlerhafte Wikipedia-Ergebnisse werden sicher erkannt und mit "k.A." ausgegeben
This commit is contained in:
@@ -78,12 +78,12 @@ WHITELIST_KATEGORIEN = [
|
|||||||
"logistik", "automobil"
|
"logistik", "automobil"
|
||||||
]
|
]
|
||||||
|
|
||||||
def validate_wikipedia_page(content, name, domain_key):
|
def validate_wikipedia_page(content, title, name, domain_key):
|
||||||
name_fragments = name.lower().split()[:2]
|
name_fragments = name.lower().split()[:2]
|
||||||
return (
|
title_check = any(frag in title.lower() for frag in name_fragments)
|
||||||
any(frag in content.lower() for frag in name_fragments) or
|
content_check = any(frag in content.lower() for frag in name_fragments)
|
||||||
(domain_key and domain_key.lower() in content.lower())
|
domain_check = domain_key and domain_key.lower() in content.lower()
|
||||||
)
|
return (title_check or content_check or domain_check)
|
||||||
|
|
||||||
def get_wikipedia_data(name, website_hint=""):
|
def get_wikipedia_data(name, website_hint=""):
|
||||||
domain_key = extract_domain_key(website_hint)
|
domain_key = extract_domain_key(website_hint)
|
||||||
@@ -98,7 +98,7 @@ def get_wikipedia_data(name, website_hint=""):
|
|||||||
try:
|
try:
|
||||||
page = wikipedia.page(title, auto_suggest=False)
|
page = wikipedia.page(title, auto_suggest=False)
|
||||||
html = requests.get(page.url, timeout=10).text
|
html = requests.get(page.url, timeout=10).text
|
||||||
if not validate_wikipedia_page(page.content, name, domain_key):
|
if not validate_wikipedia_page(page.content, title, name, domain_key):
|
||||||
continue
|
continue
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
branche, umsatz = parse_infobox_with_fallback(soup)
|
branche, umsatz = parse_infobox_with_fallback(soup)
|
||||||
@@ -139,6 +139,7 @@ print("\n✅ Wikipedia-Auswertung abgeschlossen")
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# === SCHRITT 2: GPT-BEWERTUNG ===
|
# === SCHRITT 2: GPT-BEWERTUNG ===
|
||||||
def classify_company(row, wikipedia_url=""):
|
def classify_company(row, wikipedia_url=""):
|
||||||
user_prompt = {
|
user_prompt = {
|
||||||
|
|||||||
Reference in New Issue
Block a user