def get_wikipedia_data(name, website_hint=""):
    """Look up a company on Wikipedia and extract its industry and revenue.

    Search terms are built from the stripped company name, its first two
    words and, when ``website_hint`` is given, the company label of that
    website. For every term the top three search hits are inspected; the
    first page whose title contains the first word of ``name``
    (case-insensitive) is used.

    Args:
        name: Company name to look up.
        website_hint: Optional company website, with or without scheme,
            ``www.`` prefix or path (e.g. ``"https://www.heimbach.com/de"``).

    Returns:
        A tuple ``(url, branche, umsatz)``. ``branche`` is the text of the
        infobox row whose header contains "Branche", or the first page
        category when there is no such row. ``umsatz`` is the text of the
        row whose header contains "Umsatz". Missing values are reported as
        ``"k.A."``. If no page matches (or ``name`` is blank) the result is
        ``("", "k.A.", "k.A.")``.
    """
    no_result = ("", "k.A.", "k.A.")

    name_words = (name or "").split()
    if not name_words:
        # Without a name no page title can be validated, so skip all lookups.
        return no_result
    first_word = name_words[0].lower()

    candidates = [name.strip(), " ".join(name_words[:2])]
    if website_hint:
        candidates.append(_website_label(website_hint))
    # Drop empty terms and duplicates (short names yield the same term twice)
    # while keeping the original priority order.
    search_terms = [term for term in dict.fromkeys(candidates) if term]

    for term in search_terms:
        try:
            titles = wikipedia.search(term, results=3)
        except Exception:
            # Best effort: a failed search must not abort the remaining terms.
            continue
        for title in titles:
            try:
                # Titles come straight from the search results, so they are
                # already exact; auto-suggest could redirect to another page.
                page = wikipedia.page(title, auto_suggest=False)
                if first_word not in page.title.lower():
                    continue
                url = page.url
                # A timeout keeps a stalled connection from hanging the run.
                html = requests.get(url, timeout=10).text
                branche, umsatz = _parse_infobox(html)
                if not branche:
                    cats = page.categories
                    branche = cats[0] if cats else "k.A."
                return url, branche or "k.A.", umsatz or "k.A."
            except Exception:
                # Best effort: disambiguation pages, missing pages and network
                # errors simply move on to the next candidate.
                continue
    return no_result


def _website_label(website_hint):
    """Return the company label of a website, e.g. "heimbach" for "www.heimbach.com".

    Returns an empty string when the hint contains no dotted host name.
    """
    host = website_hint.strip()
    for scheme in ("https://", "http://"):
        if host.lower().startswith(scheme):
            host = host[len(scheme):]
            break
    host = host.split("/", 1)[0]
    labels = [label for label in host.split(".") if label]
    if labels and labels[0].lower() == "www":
        labels = labels[1:]
    # Require at least "label.tld" so a bare word is not treated as a domain.
    return labels[0] if len(labels) > 1 else ""


def _parse_infobox(html):
    """Return ``(branche, umsatz)`` from the first infobox table in *html*.

    Either value is an empty string when the corresponding row is absent.
    """
    branche = umsatz = ""
    soup = BeautifulSoup(html, 'html.parser')
    infobox = soup.find("table", {"class": "infobox"})
    if not infobox:
        return branche, umsatz
    for row in infobox.find_all("tr"):
        th, td = row.find("th"), row.find("td")
        if not th or not td:
            continue
        if "Branche" in th.text:
            branche = td.text.strip()
        if "Umsatz" in th.text:
            umsatz = td.text.strip()
    return branche, umsatz
# === GPT BEWERTUNG ===