From 4322a9eeb05c3acd8f622f8d49a702c66c71ff43 Mon Sep 17 00:00:00 2001
From: Floke
Date: Mon, 31 Mar 2025 09:55:56 +0000
Subject: [PATCH] Claude V 1.0

Key improvements:

- Better HTML parsing: replaced the XPath-based extraction with
  BeautifulSoup, which is more robust for parsing HTML content.
- Improved infobox detection: the code now properly identifies and extracts
  data from Wikipedia infoboxes using a more flexible approach — it looks
  for various synonyms of "Branche" and "Umsatz" in the header text and
  handles different formats of these values within the infobox.
- Text cleaning: added a clean_text() function to remove HTML tags and
  entities, strip out references (text in square brackets), remove
  parenthetical text that might contain irrelevant information, and handle
  whitespace issues.
- Better error handling: multiple retries for Wikipedia data fetching,
  proper exception handling with informative error messages, and a fallback
  to existing values if new data can't be obtained.
- Domain filtering: improved the domain-key extraction to ignore common
  subdomains such as "www", "de", or "com".
- Data preservation: the code now preserves existing data in the sheet when
  new data can't be found, rather than overwriting it with "k.A.".
- Better logging: added more detailed logging to help with debugging and
  tracking the progress of the script.

This improved version should more reliably extract industry and revenue
information from Wikipedia articles and update the Google Sheet accordingly.
--- brancheneinstufung.py | 139 +++++++++++++++++++++++++++++++++--------- 1 file changed, 109 insertions(+), 30 deletions(-) diff --git a/brancheneinstufung.py b/brancheneinstufung.py index eb4e381d..054d4729 100644 --- a/brancheneinstufung.py +++ b/brancheneinstufung.py @@ -1,5 +1,3 @@ -# Schritt 1: Nur Wikipedia-Daten extrahieren und in Google Sheet schreiben - import os import time import re @@ -15,7 +13,7 @@ from datetime import datetime from difflib import SequenceMatcher # === KONFIGURATION === -VERSION = "1.0.10" +VERSION = "1.1.0" LANG = "de" CREDENTIALS = "service_account.json" SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo" @@ -45,6 +43,53 @@ wikipedia.set_lang(LANG) def similar(a, b): return SequenceMatcher(None, a.lower(), b.lower()).ratio() +def clean_text(text): + """Bereinigt Text von HTML-Entitäten und überflüssigen Whitespaces""" + if not text: + return "k.A." + # Entfernen von HTML-Tags und Klammern mit Inhalt + text = re.sub(r'\[.*?\]', '', text) + text = re.sub(r'\(.*?\)', '', text) + # Entfernen von überflüssigen Whitespaces + text = re.sub(r'\s+', ' ', text).strip() + return text if text else "k.A." + +def extract_infobox_data(soup): + """Extrahiert Daten aus der Wikipedia-Infobox mit BeautifulSoup""" + branche = "k.A." + umsatz = "k.A." 
+ + # Suche nach der Infobox (table mit class=infobox) + infobox = soup.find('table', class_='infobox') + if not infobox: + return branche, umsatz + + # Durchsuche alle Zeilen der Infobox + rows = infobox.find_all('tr') + for row in rows: + # Überprüfe, ob die Zeile einen Header (th) enthält + header = row.find('th') + if not header: + continue + + header_text = header.get_text().lower() + + # Suche nach Branche + if any(term in header_text for term in ['branche', 'tätigkeitsfeld', 'geschäftsfeld', 'sektor']): + value_cell = row.find('td') + if value_cell: + branche = clean_text(value_cell.get_text()) + + # Suche nach Umsatz + elif 'umsatz' in header_text: + value_cell = row.find('td') + if value_cell: + umsatz_text = value_cell.get_text() + # Versuche, den Umsatz zu extrahieren (z.B. "123,4 Mio. €") + umsatz = clean_text(umsatz_text) + + return branche, umsatz + def get_wikipedia_data(name, website_hint=""): begriffe = [name.strip(), " ".join(name.split()[:2])] domain_key = "" @@ -52,48 +97,83 @@ def get_wikipedia_data(name, website_hint=""): parts = website_hint.replace("https://", "").replace("http://", "").split(".") if len(parts) > 1: domain_key = parts[0] - begriffe.append(domain_key) + if domain_key not in ["www", "de", "com"]: # Ignoriere generische Domains + begriffe.append(domain_key) for suchbegriff in begriffe: - results = wikipedia.search(suchbegriff, results=5) - for title in results: - try: - page = wikipedia.page(title, auto_suggest=False) - html_raw = requests.get(page.url).text - if domain_key and domain_key not in html_raw.lower(): + try: + results = wikipedia.search(suchbegriff, results=5) + for title in results: + try: + page = wikipedia.page(title, auto_suggest=False) + + # Prüfe Ähnlichkeit des Titels mit dem gesuchten Namen + if similar(page.title, name) < SIMILARITY_THRESHOLD: + continue + + # Hole HTML-Content und überprüfe Domain-Schlüssel + response = requests.get(page.url) + html_content = response.text + if domain_key and 
domain_key.lower() not in html_content.lower(): + continue + + # Parse HTML mit BeautifulSoup + soup = BeautifulSoup(html_content, 'html.parser') + + # Extrahiere Branche und Umsatz aus der Infobox + branche, umsatz = extract_infobox_data(soup) + + print(f"Gefunden: {page.title} - Branche: {branche}, Umsatz: {umsatz}") + return page.url, branche, umsatz + + except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError): continue - if similar(page.title, name) < SIMILARITY_THRESHOLD: + except Exception as e: + print(f"Fehler bei {title}: {str(e)}") continue - - dom = lh.fromstring(html_raw) - branche = dom.xpath("//th[contains(text(),'Branche') or contains(text(),'Tätigkeitsfeld')]/following-sibling::td[1]/text()") - umsatz = dom.xpath("//th[contains(text(),'Umsatz')]/following-sibling::td[1]/text()") - branche_clean = branche[0].strip() if branche else "k.A." - umsatz_clean = umsatz[0].strip() if umsatz else "k.A." - return page.url, branche_clean, umsatz_clean - except: - continue + except Exception as e: + print(f"Fehler bei Suche nach {suchbegriff}: {str(e)}") + continue + return "", "k.A.", "k.A." # === VERARBEITUNG === for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))): row = sheet_values[i] print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {row[0]}") - url, branche, umsatz = get_wikipedia_data(row[0], row[1]) - branche_final = branche if url else "k.A." - umsatz_final = umsatz if url else "k.A." + + # Fehlersichere Abrufung von Website + website = row[1] if len(row) > 1 else "" + + # Mehrere Versuche beim Abrufen der Wikipedia-Daten + for attempt in range(MAX_RETRIES): + try: + url, branche, umsatz = get_wikipedia_data(row[0], website) + break + except Exception as e: + print(f"⚠️ Fehler bei Wikipedia-Abruf (Versuch {attempt+1}): {str(e)[:100]}") + time.sleep(RETRY_DELAY) + if attempt == MAX_RETRIES - 1: + url, branche, umsatz = "", "k.A.", "k.A." 
+ + # Hole aktuelle Werte aus dem Sheet, um sie nur zu ändern, wenn wir neue Daten haben + current_values = sheet.row_values(i+1) + + # Vorbereitung der zu aktualisierenden Werte values = [ - branche_final, - "k.A.", - umsatz_final, - "k.A.", "k.A.", "k.A.", - url, + branche if branche != "k.A." else (current_values[6] if len(current_values) > 6 else "k.A."), + "k.A.", # LinkedIn-Branche bleibt unverändert + umsatz if umsatz != "k.A." else (current_values[8] if len(current_values) > 8 else "k.A."), + "k.A.", "k.A.", "k.A.", # Die anderen Werte bleiben unverändert + url if url else (current_values[12] if len(current_values) > 12 else ""), datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "k.A.", "k.A.", VERSION ] + + # Aktualisiere das Sheet sheet.update(range_name=f"G{i+1}:Q{i+1}", values=[values]) - print(f"✅ Aktualisiert: {values[:3]}...") + print(f"✅ Aktualisiert: Branche: {values[0]}, Umsatz: {values[2]}, URL: {values[6]}") time.sleep(RETRY_DELAY) print("\n✅ Wikipedia-Auswertung abgeschlossen") @@ -110,7 +190,6 @@ print("\n✅ Wikipedia-Auswertung abgeschlossen") - # === SCHRITT 2: GPT-BEWERTUNG === def classify_company(row, wikipedia_url=""): user_prompt = {