diff --git a/brancheneinstufung.py b/brancheneinstufung.py index 6e9bf2b3..3f41baa1 100644 --- a/brancheneinstufung.py +++ b/brancheneinstufung.py @@ -15,7 +15,7 @@ from difflib import SequenceMatcher from lxml import html as lh # === KONFIGURATION === -VERSION = "1.0.8-wiki-api" +VERSION = "1.0.9-wiki-refined" LANG = "de" CREDENTIALS = "service_account.json" SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo" @@ -49,93 +49,60 @@ def extract_domain_key(url): parts = clean_url.split(".") return parts[0] if len(parts) > 1 else "" -# === INFOBOX-PARSING MIT XPATH === -def parse_infobox_xpath(html_text): - doc = lh.fromstring(html_text) - branche = "k.A." - umsatz = "k.A." - try: - branche_xpath = doc.xpath("//table[contains(@class, 'infobox')]//tr[th[contains(normalize-space(), 'Branche') or contains(normalize-space(), 'Tätigkeitsfeld')]]/td/text()") - umsatz_xpath = doc.xpath("//table[contains(@class, 'infobox')]//tr[th[contains(translate(normalize-space(), 'UMSATZ', 'umsatz'), 'umsatz')]]/td/text()") - if branche_xpath: - branche = branche_xpath[0].strip() - if umsatz_xpath: - umsatz_raw = umsatz_xpath[0].strip() - if "mio" in umsatz_raw.lower() or "millionen" in umsatz_raw.lower(): - match = re.search(r"(\d+[.,]?\d*)", umsatz_raw) - if match: - umsatz = match.group(1).replace(",", ".") - except: - pass - return branche, umsatz - -# === WIKIPEDIA DATEN === -WHITELIST_KATEGORIEN = [ - "unternehmen", "hersteller", "produktion", "industrie", - "maschinenbau", "technik", "dienstleistung", "chemie", - "pharma", "elektro", "medizin", "bau", "energie", - "logistik", "automobil" -] - +# === ÄHNLICHKEITSPRÜFUNG === def similarity(a, b): return SequenceMatcher(None, a.lower(), b.lower()).ratio() -def validate_wikipedia_page(content, title, name, domain_key): - name_fragments = name.lower().split()[:2] - title_check = any(frag in title.lower() for frag in name_fragments) - content_check = any(frag in content.lower() for frag in name_fragments) - domain_check = domain_key and domain_key.lower() in content.lower() - sim_check = similarity(name, title) > 0.5 - return (title_check or content_check or domain_check or sim_check) - +# === WIKIPEDIA DATEN LADEN === def get_wikipedia_data(name, website_hint=""): begriffe = [name.strip(), " ".join(name.split()[:2])] - domain_key = extract_domain_key(website_hint) - if domain_key: - begriffe.append(domain_key) - - best_score = 0 - best_result = ("", "k.A.", "k.A.") + if website_hint: + parts = website_hint.replace("https://", "").replace("http://", "").split(".") + if len(parts) > 1: + begriffe.append(parts[0]) for suchbegriff in begriffe: - if not suchbegriff: - continue - for attempt in range(MAX_RETRIES): + results = wikipedia.search(suchbegriff, results=3) + for title in results: try: - results = wikipedia.search(suchbegriff, results=5) - for title in results: - try: - page = wikipedia.page(title, auto_suggest=False) - html_text = requests.get(page.url, timeout=10).text - if not validate_wikipedia_page(page.content, title, name, domain_key): + page = wikipedia.page(title) + if name.lower().split()[0] not in page.title.lower(): + continue + url = page.url + html_content = requests.get(url, timeout=10).text + soup = BeautifulSoup(html_content, 'html.parser') + infobox = soup.find("table", {"class": "infobox"}) + branche = umsatz = "" + if infobox: + for row in infobox.find_all("tr"): + th, td = row.find("th"), row.find("td") + if not th or not td: continue - branche, umsatz = parse_infobox_xpath(html_text) - score = similarity(name, title) - if branche != "k.A.": - score += 0.1 - if domain_key and domain_key in page.content.lower(): - score += 0.1 - if score > best_score: - best_score = score - best_result = (page.url, branche or "k.A.", umsatz or "k.A.") - except: - continue - except Exception as e: - print(f"⚠️ Wikipedia-Fehler ({suchbegriff}, Versuch {attempt+1}): {str(e)[:100]}") - time.sleep(RETRY_DELAY) + if "Branche" in th.text: + branche = td.text.strip() + if "Umsatz" in th.text: + umsatz_raw = td.text.strip() + match = re.search(r"(\d+[.,]?\d*)", umsatz_raw) + if match: + umsatz = match.group(1).replace(",", ".") + if not branche: + cats = page.categories + branche = cats[0] if cats else "k.A." + return url, branche or "k.A.", umsatz or "k.A." + except: + continue + return "", "k.A.", "k.A." - return best_result - -# === SCHRITT 1: WIKIPEDIA VERARBEITUNG === +# === VERARBEITUNG === for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))): row = sheet_values[i] print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {row[0]}") - url, wiki_branche, umsatz = get_wikipedia_data(row[0], row[1]) - wiki_final = wiki_branche if url else "k.A." + url, branche, umsatz = get_wikipedia_data(row[0], row[1]) + branche_final = branche if url else "k.A." umsatz_final = umsatz if url else "k.A." values = [ - wiki_final, - "k.A.", # LinkedIn-Branche leer + branche_final, + "k.A.", umsatz_final, "k.A.", "k.A.", "k.A.", url,