diff --git a/brancheneinstufung.py b/brancheneinstufung.py
index 3f41baa1..afe643ce 100644
--- a/brancheneinstufung.py
+++ b/brancheneinstufung.py
@@ -12,10 +12,9 @@ from bs4 import BeautifulSoup
 from oauth2client.service_account import ServiceAccountCredentials
 from datetime import datetime
 from difflib import SequenceMatcher
-from lxml import html as lh
 
 # === KONFIGURATION ===
-VERSION = "1.0.9-wiki-refined"
+VERSION = "1.0.7"
 LANG = "de"
 CREDENTIALS = "service_account.json"
 SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
@@ -53,6 +52,35 @@ def extract_domain_key(url):
 def similarity(a, b):
     return SequenceMatcher(None, a.lower(), b.lower()).ratio()
 
+# === INFOS AUS INFOBOX LESEN ===
+def extract_infobox_fields(soup):
+    branche = umsatz = ""
+    infobox = soup.find("table", class_=lambda c: c and "infobox" in c)
+    if infobox:
+        for row in infobox.find_all("tr"):
+            th, td = row.find("th"), row.find("td")
+            if not th or not td:
+                continue
+            th_text = th.text.lower().strip()
+            if any(key in th_text for key in ["branche", "tätigkeitsfeld", "bereich"]):
+                branche = td.text.strip()
+            if "umsatz" in th_text:
+                umsatz_raw = td.text.strip()
+                match = re.search(r"(\d+[.,]?\d*)", umsatz_raw)
+                if match:
+                    umsatz = match.group(1).replace(",", ".")
+    return branche, umsatz
+
+# === VALIDIERUNG DES WIKIPEDIA-ARTIKELS ===
+def is_valid_wiki_article(content, name, domain_key):
+    name_parts = name.lower().split()
+    score = 0
+    if any(part in content.lower() for part in name_parts):
+        score += 1
+    if domain_key and domain_key.lower() in content.lower():
+        score += 1
+    return score >= 1
+
 # === WIKIPEDIA DATEN LADEN ===
 def get_wikipedia_data(name, website_hint=""):
     begriffe = [name.strip(), " ".join(name.split()[:2])]
@@ -60,35 +88,20 @@ def get_wikipedia_data(name, website_hint=""):
         parts = website_hint.replace("https://", "").replace("http://", "").split(".")
         if len(parts) > 1:
             begriffe.append(parts[0])
+    domain_key = extract_domain_key(website_hint)
     for suchbegriff in begriffe:
-        results = wikipedia.search(suchbegriff, results=3)
+        results = wikipedia.search(suchbegriff, results=5)
         for title in results:
             try:
-                page = wikipedia.page(title)
-                if name.lower().split()[0] not in page.title.lower():
+                page = wikipedia.page(title, auto_suggest=False)
+                html_content = requests.get(page.url, timeout=10).text
+                if not is_valid_wiki_article(html_content, name, domain_key):
                     continue
-                url = page.url
-                html_content = requests.get(url, timeout=10).text
                 soup = BeautifulSoup(html_content, 'html.parser')
-                infobox = soup.find("table", {"class": "infobox"})
-                branche = umsatz = ""
-                if infobox:
-                    for row in infobox.find_all("tr"):
-                        th, td = row.find("th"), row.find("td")
-                        if not th or not td:
-                            continue
-                        if "Branche" in th.text:
-                            branche = td.text.strip()
-                        if "Umsatz" in th.text:
-                            umsatz_raw = td.text.strip()
-                            match = re.search(r"(\d+[.,]?\d*)", umsatz_raw)
-                            if match:
-                                umsatz = match.group(1).replace(",", ".")
-                if not branche:
-                    cats = page.categories
-                    branche = cats[0] if cats else "k.A."
-                return url, branche or "k.A.", umsatz or "k.A."
+                branche, umsatz = extract_infobox_fields(soup)
+                if branche or umsatz:
+                    return page.url, branche or "k.A.", umsatz or "k.A."
             except:
                 continue
     return "", "k.A.", "k.A."
 
@@ -124,6 +137,9 @@
 print("\n✅ Wikipedia-Auswertung abgeschlossen")
 
 
+
+
+
 # === SCHRITT 2: GPT-BEWERTUNG ===
 def classify_company(row, wikipedia_url=""):
     user_prompt = {