Die Wikipedia-Suchfunktion wurde überarbeitet

Statt nur wikipedia.page() wird jetzt wikipedia.search() verwendet. Es werden die besten 3 Treffer geprüft. Nur Artikel, deren Titel den Unternehmensnamen enthalten, werden akzeptiert. Dadurch wird z. B. „Heimbach-Gruppe“ korrekt erkannt, auch wenn „Heimbach GmbH“ gesucht wurde.
2025-03-30 06:31:21 +00:00
parent 762aae0230
commit 8dffaedb51
1 changed files with 29 additions and 24 deletions
--- a/brancheneinstufung.py
+++ b/brancheneinstufung.py
@@ -99,32 +99,37 @@ system_prompt = {
 def get_wikipedia_data(name, website_hint=""):
     begriffe = [name.strip(), " ".join(name.split()[:2])]
     if website_hint:
-        begriffe.append(website_hint.split(".")[1])
+        parts = website_hint.replace("https://", "").replace("http://", "").split(".")
+        if len(parts) > 1:
+            begriffe.append(parts[0])  # z. B. "heimbach" aus "www.heimbach.com"
     for suchbegriff in begriffe:
-        try:
+        results = wikipedia.search(suchbegriff, results=3)
-            page = wikipedia.page(suchbegriff, auto_suggest=False)
+        for title in results:
-            if name.lower().split()[0] not in page.title.lower():
+            try:
+                page = wikipedia.page(title)
+                if name.lower().split()[0] not in page.title.lower():
+                    continue
+                url = page.url
+                html = requests.get(url).text
+                soup = BeautifulSoup(html, 'html.parser')
+                infobox = soup.find("table", {"class": "infobox"})
+                branche = umsatz = ""
+                if infobox:
+                    for row in infobox.find_all("tr"):
+                        th, td = row.find("th"), row.find("td")
+                        if not th or not td:
+                            continue
+                        if "Branche" in th.text:
+                            branche = td.text.strip()
+                        if "Umsatz" in th.text:
+                            umsatz = td.text.strip()
+                if not branche:
+                    cats = page.categories
+                    branche = cats[0] if cats else "k.A."
+                return url, branche or "k.A.", umsatz or "k.A."
+            except:
+                continue
-            url = page.url
-            html = requests.get(url).text
-            soup = BeautifulSoup(html, 'html.parser')
-            infobox = soup.find("table", {"class": "infobox"})
-            branche = umsatz = ""
-            if infobox:
-                for row in infobox.find_all("tr"):
-                    th, td = row.find("th"), row.find("td")
-                    if not th or not td:
-                        continue
-                    if "Branche" in th.text:
-                        branche = td.text.strip()
-                    if "Umsatz" in th.text:
-                        umsatz = td.text.strip()
-            if not branche:
-                cats = page.categories
-                branche = cats[0] if cats else "k.A."
-            return url, branche or "k.A.", umsatz or "k.A."
-        except:
-            continue
     return "", "k.A.", "k.A."
 # === GPT BEWERTUNG ===