Key Improvements

- **Better HTML parsing:** Replaced the XPath-based extraction with BeautifulSoup, which is more robust for parsing HTML content.
- **Improved infobox detection:** The code now properly identifies and extracts data from Wikipedia infoboxes using a more flexible approach: it looks for various synonyms of "Branche" and "Umsatz" in the header text and handles different formats of these values within the infobox.
- **Text cleaning:** Added a `clean_text()` function to remove HTML tags and entities, strip out references (text in square brackets), remove parenthetical text that might contain irrelevant information, and handle whitespace issues.
- **Better error handling:** The code now includes multiple retries for Wikipedia data fetching, proper exception handling with informative error messages, and a fallback to existing values if new data can't be obtained.
- **Domain filtering:** Improved the domain key extraction to ignore common subdomains such as "www", "de", or "com".
- **Data preservation:** The code now preserves existing data in the sheet when new data can't be found, rather than overwriting it with "k.A.".
- **Better logging:** Added more detailed logging to help with debugging and tracking the progress of the script.

This improved version should more reliably extract industry and revenue information from Wikipedia articles and update your Google Sheet accordingly.
261 lines · 9.6 KiB · Python
import csv
import os
import re
import time
from datetime import datetime
from difflib import SequenceMatcher

import gspread
import openai
import requests
import wikipedia
from bs4 import BeautifulSoup
from lxml import html as lh  # unused since the BeautifulSoup rewrite; kept for compatibility
from oauth2client.service_account import ServiceAccountCredentials
|
||
|
||
# === KONFIGURATION ===
|
||
VERSION = "1.1.0"
|
||
LANG = "de"
|
||
CREDENTIALS = "service_account.json"
|
||
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
|
||
DURCHLÄUFE = int(input("Wieviele Zeilen sollen überprüft werden? "))
|
||
MAX_RETRIES = 3
|
||
RETRY_DELAY = 5
|
||
LOG_CSV = "gpt_antworten_log.csv"
|
||
SIMILARITY_THRESHOLD = 0.6
|
||
|
||
# === OpenAI API-KEY LADEN ===
|
||
with open("api_key.txt", "r") as f:
|
||
openai.api_key = f.read().strip()
|
||
|
||
# === GOOGLE SHEET VERBINDUNG ===
|
||
scope = ["https://www.googleapis.com/auth/spreadsheets"]
|
||
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, scope)
|
||
sheet = gspread.authorize(creds).open_by_url(SHEET_URL).sheet1
|
||
sheet_values = sheet.get_all_values()
|
||
|
||
# === STARTINDEX SUCHEN (Spalte N = Index 13) ===
|
||
filled_n = [row[13] if len(row) > 13 else '' for row in sheet_values[1:]]
|
||
start = next((i + 1 for i, v in enumerate(filled_n, start=1) if not str(v).strip()), len(filled_n) + 1)
|
||
print(f"Starte bei Zeile {start+1}")
|
||
|
||
wikipedia.set_lang(LANG)
|
||
|
||
def similar(a, b):
|
||
return SequenceMatcher(None, a.lower(), b.lower()).ratio()
|
||
|
||
def clean_text(text):
|
||
"""Bereinigt Text von HTML-Entitäten und überflüssigen Whitespaces"""
|
||
if not text:
|
||
return "k.A."
|
||
# Entfernen von HTML-Tags und Klammern mit Inhalt
|
||
text = re.sub(r'\[.*?\]', '', text)
|
||
text = re.sub(r'\(.*?\)', '', text)
|
||
# Entfernen von überflüssigen Whitespaces
|
||
text = re.sub(r'\s+', ' ', text).strip()
|
||
return text if text else "k.A."
|
||
|
||
def extract_infobox_data(soup):
|
||
"""Extrahiert Daten aus der Wikipedia-Infobox mit BeautifulSoup"""
|
||
branche = "k.A."
|
||
umsatz = "k.A."
|
||
|
||
# Suche nach der Infobox (table mit class=infobox)
|
||
infobox = soup.find('table', class_='infobox')
|
||
if not infobox:
|
||
return branche, umsatz
|
||
|
||
# Durchsuche alle Zeilen der Infobox
|
||
rows = infobox.find_all('tr')
|
||
for row in rows:
|
||
# Überprüfe, ob die Zeile einen Header (th) enthält
|
||
header = row.find('th')
|
||
if not header:
|
||
continue
|
||
|
||
header_text = header.get_text().lower()
|
||
|
||
# Suche nach Branche
|
||
if any(term in header_text for term in ['branche', 'tätigkeitsfeld', 'geschäftsfeld', 'sektor']):
|
||
value_cell = row.find('td')
|
||
if value_cell:
|
||
branche = clean_text(value_cell.get_text())
|
||
|
||
# Suche nach Umsatz
|
||
elif 'umsatz' in header_text:
|
||
value_cell = row.find('td')
|
||
if value_cell:
|
||
umsatz_text = value_cell.get_text()
|
||
# Versuche, den Umsatz zu extrahieren (z.B. "123,4 Mio. €")
|
||
umsatz = clean_text(umsatz_text)
|
||
|
||
return branche, umsatz
|
||
|
||
def get_wikipedia_data(name, website_hint=""):
|
||
begriffe = [name.strip(), " ".join(name.split()[:2])]
|
||
domain_key = ""
|
||
if website_hint:
|
||
parts = website_hint.replace("https://", "").replace("http://", "").split(".")
|
||
if len(parts) > 1:
|
||
domain_key = parts[0]
|
||
if domain_key not in ["www", "de", "com"]: # Ignoriere generische Domains
|
||
begriffe.append(domain_key)
|
||
|
||
for suchbegriff in begriffe:
|
||
try:
|
||
results = wikipedia.search(suchbegriff, results=5)
|
||
for title in results:
|
||
try:
|
||
page = wikipedia.page(title, auto_suggest=False)
|
||
|
||
# Prüfe Ähnlichkeit des Titels mit dem gesuchten Namen
|
||
if similar(page.title, name) < SIMILARITY_THRESHOLD:
|
||
continue
|
||
|
||
# Hole HTML-Content und überprüfe Domain-Schlüssel
|
||
response = requests.get(page.url)
|
||
html_content = response.text
|
||
if domain_key and domain_key.lower() not in html_content.lower():
|
||
continue
|
||
|
||
# Parse HTML mit BeautifulSoup
|
||
soup = BeautifulSoup(html_content, 'html.parser')
|
||
|
||
# Extrahiere Branche und Umsatz aus der Infobox
|
||
branche, umsatz = extract_infobox_data(soup)
|
||
|
||
print(f"Gefunden: {page.title} - Branche: {branche}, Umsatz: {umsatz}")
|
||
return page.url, branche, umsatz
|
||
|
||
except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError):
|
||
continue
|
||
except Exception as e:
|
||
print(f"Fehler bei {title}: {str(e)}")
|
||
continue
|
||
except Exception as e:
|
||
print(f"Fehler bei Suche nach {suchbegriff}: {str(e)}")
|
||
continue
|
||
|
||
return "", "k.A.", "k.A."
|
||
|
||
# === VERARBEITUNG ===
|
||
for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
|
||
row = sheet_values[i]
|
||
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {row[0]}")
|
||
|
||
# Fehlersichere Abrufung von Website
|
||
website = row[1] if len(row) > 1 else ""
|
||
|
||
# Mehrere Versuche beim Abrufen der Wikipedia-Daten
|
||
for attempt in range(MAX_RETRIES):
|
||
try:
|
||
url, branche, umsatz = get_wikipedia_data(row[0], website)
|
||
break
|
||
except Exception as e:
|
||
print(f"⚠️ Fehler bei Wikipedia-Abruf (Versuch {attempt+1}): {str(e)[:100]}")
|
||
time.sleep(RETRY_DELAY)
|
||
if attempt == MAX_RETRIES - 1:
|
||
url, branche, umsatz = "", "k.A.", "k.A."
|
||
|
||
# Hole aktuelle Werte aus dem Sheet, um sie nur zu ändern, wenn wir neue Daten haben
|
||
current_values = sheet.row_values(i+1)
|
||
|
||
# Vorbereitung der zu aktualisierenden Werte
|
||
values = [
|
||
branche if branche != "k.A." else (current_values[6] if len(current_values) > 6 else "k.A."),
|
||
"k.A.", # LinkedIn-Branche bleibt unverändert
|
||
umsatz if umsatz != "k.A." else (current_values[8] if len(current_values) > 8 else "k.A."),
|
||
"k.A.", "k.A.", "k.A.", # Die anderen Werte bleiben unverändert
|
||
url if url else (current_values[12] if len(current_values) > 12 else ""),
|
||
datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||
"k.A.", "k.A.",
|
||
VERSION
|
||
]
|
||
|
||
# Aktualisiere das Sheet
|
||
sheet.update(range_name=f"G{i+1}:Q{i+1}", values=[values])
|
||
print(f"✅ Aktualisiert: Branche: {values[0]}, Umsatz: {values[2]}, URL: {values[6]}")
|
||
time.sleep(RETRY_DELAY)
|
||
|
||
print("\n✅ Wikipedia-Auswertung abgeschlossen")
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
# === SCHRITT 2: GPT-BEWERTUNG ===
|
||
def classify_company(row, wikipedia_url=""):
|
||
user_prompt = {
|
||
"role": "user",
|
||
"content": f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]}\nWikipedia-Link: {wikipedia_url}"
|
||
}
|
||
for attempt in range(MAX_RETRIES):
|
||
try:
|
||
response = openai.chat.completions.create(
|
||
model="gpt-3.5-turbo",
|
||
messages=[
|
||
{
|
||
"role": "system",
|
||
"content": (
|
||
"Du bist ein Experte für Brancheneinstufung und FSM-Potenzialbewertung.\n"
|
||
"Bitte beziehe dich ausschließlich auf das konkret genannte Unternehmen.\n"
|
||
"FSM steht für Field Service Management. Ziel ist es, Unternehmen mit >50 Technikern im Außendienst zu identifizieren.\n\n"
|
||
"Struktur: Firmenname; Website; Ort; Aktuelle Einstufung; Beschreibung der Branche Extern\n\n"
|
||
"Gib deine Antwort im CSV-Format zurück (1 Zeile, 8 Spalten):\n"
|
||
"Wikipedia-Branche;LinkedIn-Branche;Umsatz (Mio €);Empfohlene Neueinstufung;Begründung;FSM-Relevanz;Techniker-Einschätzung;Techniker-Begründung"
|
||
)
|
||
},
|
||
user_prompt
|
||
],
|
||
temperature=0,
|
||
timeout=15
|
||
)
|
||
full_text = response.choices[0].message.content.strip()
|
||
break
|
||
except Exception as e:
|
||
print(f"⚠️ GPT-Fehler (Versuch {attempt+1}): {str(e)[:100]}")
|
||
time.sleep(RETRY_DELAY)
|
||
else:
|
||
print("❌ GPT 3x fehlgeschlagen – Standardwerte")
|
||
full_text = "k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A."
|
||
|
||
lines = full_text.splitlines()
|
||
csv_line = next((l for l in lines if ";" in l), "")
|
||
parts = [v.strip() for v in csv_line.split(";")] if csv_line else ["k.A."] * 8
|
||
|
||
with open(LOG_CSV, "a", newline="", encoding="utf-8") as log:
|
||
writer = csv.writer(log, delimiter=";")
|
||
writer.writerow([datetime.now().strftime("%Y-%m-%d %H:%M:%S"), row[0], *parts, full_text])
|
||
|
||
return parts
|
||
|
||
# === SCHRITT 2 DURCHFÜHREN ===
|
||
for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
|
||
row = sheet_values[i]
|
||
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] GPT-Bewertung für Zeile {i+1}: {row[0]}")
|
||
wiki_url = row[12] if len(row) > 12 else ""
|
||
wiki, linkedin, umsatz_chat, new_cat, reason, fsm, techniker, techniker_reason = classify_company(row, wikipedia_url=wiki_url)
|
||
values = [
|
||
wiki,
|
||
linkedin,
|
||
umsatz_chat,
|
||
new_cat,
|
||
reason,
|
||
fsm,
|
||
wiki_url,
|
||
datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||
techniker,
|
||
techniker_reason
|
||
]
|
||
sheet.update(range_name=f"G{i+1}:P{i+1}", values=[values])
|
||
time.sleep(RETRY_DELAY)
|
||
|
||
print("\n✅ GPT-Bewertung abgeschlossen")
|