From 09885848ec0a20b3f1cbe88b2df579dee05f130f Mon Sep 17 00:00:00 2001 From: Floke Date: Mon, 31 Mar 2025 06:12:53 +0000 Subject: [PATCH] Verbessert: Wikipedia-Suchlogik erneut optimiert (Version 1.0.2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Wieder eingeführt: Kombination aus vollständigem Firmennamen, ersten zwei Wörtern und Domain-Fragment zur Suche - Erhöhte Trefferwahrscheinlichkeit durch Titel-, Inhalts-, Domain- und Ähnlichkeitsprüfung - Info aus Infobox (Branche & Umsatz) wird bevorzugt; Kategorien nur als Fallback - Version in Spalte Q geschrieben zur Nachverfolgbarkeit --- brancheneinstufung.py | 48 ++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/brancheneinstufung.py b/brancheneinstufung.py index 1f501901..42242d29 100644 --- a/brancheneinstufung.py +++ b/brancheneinstufung.py @@ -11,9 +11,10 @@ import csv from bs4 import BeautifulSoup from oauth2client.service_account import ServiceAccountCredentials from datetime import datetime +from difflib import SequenceMatcher # === KONFIGURATION === -VERSION = "1.0.0-wiki-only" +VERSION = "1.0.2-wiki-only" LANG = "de" CREDENTIALS = "service_account.json" SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo" @@ -79,22 +80,32 @@ WHITELIST_KATEGORIEN = [ "logistik", "automobil" ] +def similarity(a, b): + return SequenceMatcher(None, a.lower(), b.lower()).ratio() + def validate_wikipedia_page(content, title, name, domain_key): name_fragments = name.lower().split()[:2] title_check = any(frag in title.lower() for frag in name_fragments) content_check = any(frag in content.lower() for frag in name_fragments) domain_check = domain_key and domain_key.lower() in content.lower() - return (title_check or content_check or domain_check) + sim_check = similarity(name, title) > 0.6 + return (title_check or content_check or domain_check or sim_check) def get_wikipedia_data(name, website_hint=""): + begriffe = [name.strip(), " ".join(name.split()[:2])] domain_key = extract_domain_key(website_hint) - search_terms = [name, domain_key] if domain_key else [name] - for term in search_terms: - if not term: + if domain_key: + begriffe.append(domain_key) + + best_score = 0 + best_result = ("", "k.A.", "k.A.") + + for suchbegriff in begriffe: + if not suchbegriff: continue for attempt in range(MAX_RETRIES): try: - results = wikipedia.search(term, results=3) + results = wikipedia.search(suchbegriff, results=5) for title in results: try: page = wikipedia.page(title, auto_suggest=False) @@ -103,18 +114,21 @@ def get_wikipedia_data(name, website_hint=""): continue soup = BeautifulSoup(html, "html.parser") branche, umsatz = parse_infobox_with_fallback(soup) - if (not branche or branche == "k.A.") and page.categories: - for category in page.categories: - if any(kw in category.lower() for kw in WHITELIST_KATEGORIEN): - branche = category - break - return page.url, branche or "k.A.", umsatz or "k.A." + score = similarity(name, title) + if branche != "k.A.": + score += 0.1 + if domain_key and domain_key in page.content.lower(): + score += 0.1 + if score > best_score: + best_score = score + best_result = (page.url, branche or "k.A.", umsatz or "k.A.") except: continue except Exception as e: - print(f"⚠️ Wikipedia-Fehler ({term}, Versuch {attempt+1}): {str(e)[:100]}") + print(f"⚠️ Wikipedia-Fehler ({suchbegriff}, Versuch {attempt+1}): {str(e)[:100]}") time.sleep(RETRY_DELAY) - return "", "k.A.", "k.A." + + return best_result # === SCHRITT 1: WIKIPEDIA VERARBEITUNG === for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))): @@ -130,10 +144,9 @@ for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))): "k.A.", "k.A.", "k.A.", url, datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - "k.A.", "k.A." + "k.A.", "k.A.", + VERSION ] - # Neue Spalte mit Version am Ende - values.append(VERSION) sheet.update(range_name=f"G{i+1}:Q{i+1}", values=[values]) print(f"✅ Aktualisiert: {values[:3]}...") time.sleep(RETRY_DELAY) @@ -144,6 +157,7 @@ print("\n✅ Wikipedia-Auswertung abgeschlossen") + # === SCHRITT 2: GPT-BEWERTUNG === def classify_company(row, wikipedia_url=""): user_prompt = {