Die Wikipedia-Suchfunktion wurde überarbeitet
Statt wikipedia.page() direkt mit dem Suchbegriff aufzurufen, wird jetzt zuerst wikipedia.search() verwendet. Für jeden Suchbegriff werden die ersten 3 Treffer geprüft und einzeln per wikipedia.page() geladen. Akzeptiert werden nur Artikel, deren Titel das erste Wort des Unternehmensnamens enthält. Dadurch wird z. B. „Heimbach-Gruppe“ korrekt erkannt, auch wenn nach „Heimbach GmbH“ gesucht wurde.
This commit is contained in:
@@ -99,32 +99,37 @@ system_prompt = {
|
|||||||
def get_wikipedia_data(name, website_hint=""):
|
def get_wikipedia_data(name, website_hint=""):
|
||||||
begriffe = [name.strip(), " ".join(name.split()[:2])]
|
begriffe = [name.strip(), " ".join(name.split()[:2])]
|
||||||
if website_hint:
|
if website_hint:
|
||||||
begriffe.append(website_hint.split(".")[1])
|
parts = website_hint.replace("https://", "").replace("http://", "").split(".")
|
||||||
|
if len(parts) > 1:
|
||||||
|
begriffe.append(parts[0]) # Achtung: parts[0] ist der erste Host-Teil, bei "www.heimbach.com" also "www"; "heimbach" ergibt sich nur bei "heimbach.com"
|
||||||
|
|
||||||
for suchbegriff in begriffe:
|
for suchbegriff in begriffe:
|
||||||
try:
|
results = wikipedia.search(suchbegriff, results=3)
|
||||||
page = wikipedia.page(suchbegriff, auto_suggest=False)
|
for title in results:
|
||||||
if name.lower().split()[0] not in page.title.lower():
|
try:
|
||||||
|
page = wikipedia.page(title)
|
||||||
|
if name.lower().split()[0] not in page.title.lower():
|
||||||
|
continue
|
||||||
|
url = page.url
|
||||||
|
html = requests.get(url).text
|
||||||
|
soup = BeautifulSoup(html, 'html.parser')
|
||||||
|
infobox = soup.find("table", {"class": "infobox"})
|
||||||
|
branche = umsatz = ""
|
||||||
|
if infobox:
|
||||||
|
for row in infobox.find_all("tr"):
|
||||||
|
th, td = row.find("th"), row.find("td")
|
||||||
|
if not th or not td:
|
||||||
|
continue
|
||||||
|
if "Branche" in th.text:
|
||||||
|
branche = td.text.strip()
|
||||||
|
if "Umsatz" in th.text:
|
||||||
|
umsatz = td.text.strip()
|
||||||
|
if not branche:
|
||||||
|
cats = page.categories
|
||||||
|
branche = cats[0] if cats else "k.A."
|
||||||
|
return url, branche or "k.A.", umsatz or "k.A."
|
||||||
|
except:
|
||||||
continue
|
continue
|
||||||
url = page.url
|
|
||||||
html = requests.get(url).text
|
|
||||||
soup = BeautifulSoup(html, 'html.parser')
|
|
||||||
infobox = soup.find("table", {"class": "infobox"})
|
|
||||||
branche = umsatz = ""
|
|
||||||
if infobox:
|
|
||||||
for row in infobox.find_all("tr"):
|
|
||||||
th, td = row.find("th"), row.find("td")
|
|
||||||
if not th or not td:
|
|
||||||
continue
|
|
||||||
if "Branche" in th.text:
|
|
||||||
branche = td.text.strip()
|
|
||||||
if "Umsatz" in th.text:
|
|
||||||
umsatz = td.text.strip()
|
|
||||||
if not branche:
|
|
||||||
cats = page.categories
|
|
||||||
branche = cats[0] if cats else "k.A."
|
|
||||||
return url, branche or "k.A.", umsatz or "k.A."
|
|
||||||
except:
|
|
||||||
continue
|
|
||||||
return "", "k.A.", "k.A."
|
return "", "k.A.", "k.A."
|
||||||
|
|
||||||
# === GPT BEWERTUNG ===
|
# === GPT BEWERTUNG ===
|
||||||
|
|||||||
Reference in New Issue
Block a user