Claude V 1.0

Key Improvements

Better HTML Parsing: I've replaced the XPath-based extraction with BeautifulSoup, which is more robust for parsing HTML content.
Improved Infobox Detection: The code now properly identifies and extracts data from Wikipedia infoboxes using a more flexible approach:

It looks for several synonyms of "Branche" (and the literal header "Umsatz") in the infobox header text
It handles different formats of these values within the infobox


Text Cleaning: Added a clean_text() function to:

Remove HTML tags and entities
Strip out references (text in square brackets)
Remove parenthetical text that might contain irrelevant information
Handle whitespace issues


Better Error Handling: The code now includes more robust error handling:

Multiple retries for Wikipedia data fetching
Proper exception handling with informative error messages
Fallback to existing values if new data can't be obtained


Domain Filtering: Improved the domain key extraction to ignore common subdomains like "www", "de", or "com".
Data Preservation: The code now preserves existing data in the sheet when new data can't be found, rather than overwriting with "k.A."
Better Logging: Added more detailed logging to help with debugging and tracking the progress of the script.

This improved version should more reliably extract industry and revenue information from Wikipedia articles and update your Google Sheet accordingly.
This commit is contained in:
2025-03-31 09:55:56 +00:00
parent f1ca42d98e
commit 4322a9eeb0

View File

@@ -1,5 +1,3 @@
# Schritt 1: Nur Wikipedia-Daten extrahieren und in Google Sheet schreiben
import os
import time
import re
@@ -15,7 +13,7 @@ from datetime import datetime
from difflib import SequenceMatcher
# === KONFIGURATION ===
VERSION = "1.0.10"
VERSION = "1.1.0"
LANG = "de"
CREDENTIALS = "service_account.json"
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
@@ -45,6 +43,53 @@ wikipedia.set_lang(LANG)
def similar(a, b):
    """Return a case-insensitive similarity ratio in [0, 1] for two strings."""
    left, right = a.lower(), b.lower()
    return SequenceMatcher(None, left, right).ratio()
def clean_text(text):
    """Normalize scraped infobox text.

    Strips bracketed references ("[1]") and parenthetical asides, collapses
    runs of whitespace, and falls back to "k.A." (keine Angabe) when nothing
    usable remains.
    """
    if not text:
        return "k.A."
    # Drop reference markers and parenthetical content.
    for pattern in (r'\[.*?\]', r'\(.*?\)'):
        text = re.sub(pattern, '', text)
    # Collapse whitespace runs into single spaces.
    cleaned = re.sub(r'\s+', ' ', text).strip()
    return cleaned or "k.A."
def extract_infobox_data(soup):
    """Extract industry ("Branche") and revenue ("Umsatz") from a Wikipedia infobox.

    Scans the first `table.infobox` for rows whose <th> header matches known
    industry labels or "umsatz"; values are cleaned via clean_text(). Returns
    the pair (branche, umsatz), each defaulting to "k.A." when not found.
    """
    branche_labels = ('branche', 'tätigkeitsfeld', 'geschäftsfeld', 'sektor')
    branche, umsatz = "k.A.", "k.A."
    infobox = soup.find('table', class_='infobox')
    if not infobox:
        return branche, umsatz
    for row in infobox.find_all('tr'):
        header = row.find('th')
        if not header:
            continue
        label = header.get_text().lower()
        if any(term in label for term in branche_labels):
            cell = row.find('td')
            if cell:
                branche = clean_text(cell.get_text())
        elif 'umsatz' in label:
            cell = row.find('td')
            if cell:
                # e.g. "123,4 Mio. €" — cleaned but otherwise kept verbatim.
                umsatz = clean_text(cell.get_text())
    return branche, umsatz
def get_wikipedia_data(name, website_hint=""):
    """Search Wikipedia for a company and extract its URL, industry and revenue.

    Search terms: the full name, its first two words, and the website's domain
    key (unless it is a generic label like "www"/"de"/"com"). Candidate pages
    must have a title sufficiently similar to `name` and, when a domain key is
    known, mention it somewhere in the page HTML.

    Returns (page_url, branche, umsatz); ("", "k.A.", "k.A.") when no match.
    """
    begriffe = [name.strip(), " ".join(name.split()[:2])]
    domain_key = ""
    parts = website_hint.replace("https://", "").replace("http://", "").split(".")
    if len(parts) > 1:
        domain_key = parts[0]
        # Ignore generic (sub)domains that would match almost any page.
        if domain_key not in ["www", "de", "com"]:
            begriffe.append(domain_key)
    for suchbegriff in begriffe:
        try:
            results = wikipedia.search(suchbegriff, results=5)
            for title in results:
                try:
                    page = wikipedia.page(title, auto_suggest=False)
                    # Reject hits whose title is too dissimilar to the company name.
                    if similar(page.title, name) < SIMILARITY_THRESHOLD:
                        continue
                    # Timeout added: a hung HTTP request would otherwise block forever.
                    response = requests.get(page.url, timeout=30)
                    html_content = response.text
                    # Require the company's domain key to appear on the page.
                    if domain_key and domain_key.lower() not in html_content.lower():
                        continue
                    soup = BeautifulSoup(html_content, 'html.parser')
                    branche, umsatz = extract_infobox_data(soup)
                    print(f"Gefunden: {page.title} - Branche: {branche}, Umsatz: {umsatz}")
                    return page.url, branche, umsatz
                except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError):
                    # Ambiguous or missing page: try the next search result.
                    continue
                except Exception as e:
                    print(f"Fehler bei {title}: {str(e)}")
                    continue
        except Exception as e:
            print(f"Fehler bei Suche nach {suchbegriff}: {str(e)}")
            continue
    return "", "k.A.", "k.A."
# === VERARBEITUNG ===
# For each sheet row: look up Wikipedia data (with retries), then write the
# results into columns G:Q, preserving existing cell values whenever no new
# data could be found.
for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
    row = sheet_values[i]
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {row[0]}")
    # The website column may be missing on short rows.
    website = row[1] if len(row) > 1 else ""
    # Pre-initialize so the triple is defined even if every attempt fails.
    url, branche, umsatz = "", "k.A.", "k.A."
    for attempt in range(MAX_RETRIES):
        try:
            url, branche, umsatz = get_wikipedia_data(row[0], website)
            break
        except Exception as e:
            print(f"⚠️ Fehler bei Wikipedia-Abruf (Versuch {attempt+1}): {str(e)[:100]}")
            time.sleep(RETRY_DELAY)
    # Fetch the row's current values so existing data can be kept as fallback.
    current_values = sheet.row_values(i+1)
    values = [
        branche if branche != "k.A." else (current_values[6] if len(current_values) > 6 else "k.A."),
        "k.A.",  # LinkedIn-Branche bleibt unverändert
        umsatz if umsatz != "k.A." else (current_values[8] if len(current_values) > 8 else "k.A."),
        "k.A.", "k.A.", "k.A.",
        url if url else (current_values[12] if len(current_values) > 12 else ""),
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "k.A.", "k.A.",
        VERSION
    ]
    # Write the 11 values into columns G..Q of this row.
    sheet.update(range_name=f"G{i+1}:Q{i+1}", values=[values])
    print(f"✅ Aktualisiert: Branche: {values[0]}, Umsatz: {values[2]}, URL: {values[6]}")
    time.sleep(RETRY_DELAY)
print("\n✅ Wikipedia-Auswertung abgeschlossen")
@@ -110,7 +190,6 @@ print("\n✅ Wikipedia-Auswertung abgeschlossen")
# === SCHRITT 2: GPT-BEWERTUNG ===
def classify_company(row, wikipedia_url=""):
user_prompt = {