From e4b4d3afc90300db49be711584ff6a22af0bd527 Mon Sep 17 00:00:00 2001
From: Floke <floke.com@gmail.com>
Date: Mon, 31 Mar 2025 09:45:39 +0000
Subject: [PATCH] 1.0.10 Wiki
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wikipedia-Suche prüft jetzt explizit auf Namensähnlichkeit via SequenceMatcher.

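Treffer werden übersprungen, wenn der Domain-Schlüssel der Website nicht im
HTML der Wikipedia-Seite vorkommt oder die Ähnlichkeit zwischen Seitentitel
und Firmenname unter SIMILARITY_THRESHOLD (0.6) liegt.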

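Extraktion von Branche/Umsatz via lxml-XPath vereinfacht: Das innere
try/except entfällt, Fehler fängt weiterhin der äußere except-Block ab;
das Seiten-HTML wird jetzt vor den Filtern geladen und wiederverwendet.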

Versionierung korrekt in Spalte Q ausgegeben.
---
 brancheneinstufung.py | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/brancheneinstufung.py b/brancheneinstufung.py
index 255bf21f..eb4e381d 100644
--- a/brancheneinstufung.py
+++ b/brancheneinstufung.py
@@ -15,7 +15,7 @@ from datetime import datetime
 from difflib import SequenceMatcher
 
 # === KONFIGURATION ===
-VERSION = "1.0.9"
+VERSION = "1.0.10"
 LANG = "de"
 CREDENTIALS = "service_account.json"
 SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
@@ -23,6 +23,7 @@ DURCHLÄUFE = int(input("Wieviele Zeilen sollen überprüft werden? "))
 MAX_RETRIES = 3
 RETRY_DELAY = 5
 LOG_CSV = "gpt_antworten_log.csv"
+SIMILARITY_THRESHOLD = 0.6
 
 # === OpenAI API-KEY LADEN ===
 with open("api_key.txt", "r") as f:
@@ -41,32 +42,35 @@ print(f"Starte bei Zeile {start+1}")
 
 wikipedia.set_lang(LANG)
 
+def similar(a, b):
+    return SequenceMatcher(None, a.lower(), b.lower()).ratio()
+
 def get_wikipedia_data(name, website_hint=""):
     begriffe = [name.strip(), " ".join(name.split()[:2])]
+    domain_key = ""
     if website_hint:
         parts = website_hint.replace("https://", "").replace("http://", "").split(".")
         if len(parts) > 1:
-            begriffe.append(parts[0])
+            domain_key = parts[0]
+            begriffe.append(domain_key)
 
     for suchbegriff in begriffe:
-        results = wikipedia.search(suchbegriff, results=3)
+        results = wikipedia.search(suchbegriff, results=5)
         for title in results:
             try:
-                page = wikipedia.page(title)
-                if name.lower().split()[0] not in page.title.lower():
+                page = wikipedia.page(title, auto_suggest=False)
+                html_raw = requests.get(page.url).text
+                if domain_key and domain_key not in html_raw.lower():
+                    continue
+                if similar(page.title, name) < SIMILARITY_THRESHOLD:
                     continue
-                url = page.url
-                html_raw = requests.get(url).text
-                dom = lh.fromstring(html_raw)
 
-                try:
-                    branche = dom.xpath("//th[contains(text(),'Branche') or contains(text(),'Tätigkeitsfeld')]/following-sibling::td[1]/text()")
-                    umsatz = dom.xpath("//th[contains(text(),'Umsatz')]/following-sibling::td[1]/text()")
-                    branche_clean = branche[0].strip() if branche else "k.A."
-                    umsatz_clean = umsatz[0].strip() if umsatz else "k.A."
-                except:
-                    branche_clean, umsatz_clean = "k.A.", "k.A."
-                return url, branche_clean, umsatz_clean
+                dom = lh.fromstring(html_raw)
+                branche = dom.xpath("//th[contains(text(),'Branche') or contains(text(),'Tätigkeitsfeld')]/following-sibling::td[1]/text()")
+                umsatz = dom.xpath("//th[contains(text(),'Umsatz')]/following-sibling::td[1]/text()")
+                branche_clean = branche[0].strip() if branche else "k.A."
+                umsatz_clean = umsatz[0].strip() if umsatz else "k.A."
+                return page.url, branche_clean, umsatz_clean
             except:
                 continue
     return "", "k.A.", "k.A."
@@ -106,6 +110,7 @@ print("\n✅ Wikipedia-Auswertung abgeschlossen")
 
 
 
+
 # === SCHRITT 2: GPT-BEWERTUNG ===
 def classify_company(row, wikipedia_url=""):
     user_prompt = {