# === WIKIPEDIA DATEN LADEN ===
def get_wikipedia_data(name, website_hint=""):
    """Look up a company on Wikipedia and extract industry and revenue.

    Builds several search terms from the company name and (optionally) its
    website domain, validates each search hit against the company name, and
    scrapes the article's infobox for the "Branche" (industry) and "Umsatz"
    (revenue) rows. Falls back to the article's first category when no
    industry row is present.

    Parameters:
        name: company name, e.g. "Heimbach GmbH & Co. KG".
        website_hint: optional website URL; its registrable domain label
            (e.g. "heimbach" from "https://www.heimbach.com/") is added as
            an extra search term.

    Returns:
        Tuple ``(url, branche, umsatz)``. Missing values are returned as
        the string "k.A."; when no valid article is found, ``url`` is "".
    """
    search_terms = [name.strip(), " ".join(name.split()[:2])]
    if website_hint:
        # Derive the domain label: strip scheme AND path, then drop a
        # leading "www" label. Fix: the previous code split the raw hint on
        # "." and took parts[0], which returned "www" for most URLs and
        # broke on paths ("heimbach.com/about" -> ["heimbach", "com/about"]).
        host = website_hint.replace("https://", "").replace("http://", "").split("/")[0]
        labels = [p for p in host.split(".") if p]
        if labels and labels[0].lower() == "www":
            labels = labels[1:]
        if len(labels) > 1:
            search_terms.append(labels[0])  # e.g. "heimbach" from "www.heimbach.com"

    # Precompute once; also guards the empty-name case, which previously
    # raised IndexError on name.lower().split()[0] and was silently
    # swallowed by the except clause (skipping every result).
    name_parts = name.lower().split()

    for term in search_terms:
        for title in wikipedia.search(term, results=3):
            try:
                # auto_suggest=False prevents the wikipedia library from
                # silently redirecting to an unrelated "suggested" article.
                page = wikipedia.page(title, auto_suggest=False)
                # Cheap relevance check: the first word of the company name
                # must appear in the article title.
                if not name_parts or name_parts[0] not in page.title.lower():
                    continue
                url = page.url
                # Timeout so one hanging request cannot stall the batch run.
                html = requests.get(url, timeout=10).text
                soup = BeautifulSoup(html, 'html.parser')
                infobox = soup.find("table", {"class": "infobox"})
                branche = umsatz = ""
                if infobox:
                    for row in infobox.find_all("tr"):
                        th, td = row.find("th"), row.find("td")
                        if not th or not td:
                            continue
                        if "Branche" in th.text:
                            branche = td.text.strip()
                        if "Umsatz" in th.text:
                            umsatz = td.text.strip()
                if not branche:
                    # Fallback: first article category as a rough industry hint.
                    cats = page.categories
                    branche = cats[0] if cats else "k.A."
                return url, branche or "k.A.", umsatz or "k.A."
            except Exception:
                # Disambiguation pages, missing articles, network errors:
                # skip this hit and try the next candidate. Fix: was a bare
                # `except:`, which also swallowed KeyboardInterrupt/SystemExit.
                continue
    return "", "k.A.", "k.A."