diff --git a/brancheneinstufung.py b/brancheneinstufung.py index 3642d116..78dadb89 100644 --- a/brancheneinstufung.py +++ b/brancheneinstufung.py @@ -3,6 +3,7 @@ import os import time import csv +import re import pandas as pd import gspread import openai @@ -101,8 +102,7 @@ def get_wikipedia_data(name, website_hint=""): if website_hint: parts = website_hint.replace("https://", "").replace("http://", "").split(".") if len(parts) > 1: - begriffe.append(parts[0]) # z. B. "heimbach" aus "www.heimbach.com" - + begriffe.append(parts[0]) for suchbegriff in begriffe: results = wikipedia.search(suchbegriff, results=3) for title in results: @@ -113,7 +113,7 @@ def get_wikipedia_data(name, website_hint=""): url = page.url html = requests.get(url).text soup = BeautifulSoup(html, 'html.parser') - infobox = soup.find("table", {"class": "infobox"}) + infobox = soup.find("table", class_=["infobox", "infobox vcard"]) branche = umsatz = "" if infobox: for row in infobox.find_all("tr"): @@ -123,7 +123,11 @@ def get_wikipedia_data(name, website_hint=""): if "Branche" in th.text: branche = td.text.strip() if "Umsatz" in th.text: - umsatz = td.text.strip() + umsatz_raw = td.text.strip() + if "Mio" in umsatz_raw: + match = re.search(r"(\d+[,.]?\d*)", umsatz_raw) + if match: + umsatz = match.group(1).replace(",", ".") if not branche: cats = page.categories branche = cats[0] if cats else "k.A."