Files
Brancheneinstufung2/brancheneinstufung.py
Floke 4322a9eeb0 Claude V 1.0
Key Improvements

Better HTML Parsing: I've replaced the XPath-based extraction with BeautifulSoup, which is more robust for parsing HTML content.
Improved Infobox Detection: The code now properly identifies and extracts data from Wikipedia infoboxes using a more flexible approach:

It looks for various synonyms of "Branche" and "Umsatz" in the header text
It handles different formats of these values within the infobox


Text Cleaning: Added a clean_text() function to:

Remove HTML tags and entities
Strip out references (text in square brackets)
Remove parenthetical text that might contain irrelevant information
Handle whitespace issues


Better Error Handling: The code now includes more robust error handling:

Multiple retries for Wikipedia data fetching
Proper exception handling with informative error messages
Fallback to existing values if new data can't be obtained


Domain Filtering: Improved the domain key extraction to ignore generic domain labels such as "www" (a common subdomain) and "de" or "com" (top-level domains).
Data Preservation: The code now preserves existing data in the sheet when new data can't be found, rather than overwriting with "k.A."
Better Logging: Added more detailed logging to help with debugging and tracking the progress of the script.

This improved version should more reliably extract industry and revenue information from Wikipedia articles and update your Google Sheet accordingly.
2025-03-31 09:55:56 +00:00

261 lines
9.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import time
import re
import gspread
import wikipedia
import requests
import openai
import csv
from bs4 import BeautifulSoup
from lxml import html as lh
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime
from difflib import SequenceMatcher
# === CONFIGURATION ===
VERSION = "1.1.0"
LANG = "de"  # Wikipedia language edition used for lookups
CREDENTIALS = "service_account.json"  # Google service-account key file
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
# Number of sheet rows to process in this run (asked interactively).
DURCHLÄUFE = int(input("Wieviele Zeilen sollen überprüft werden? "))
MAX_RETRIES = 3
RETRY_DELAY = 5  # seconds to wait between retries and between sheet writes
LOG_CSV = "gpt_antworten_log.csv"  # audit log of raw GPT answers
SIMILARITY_THRESHOLD = 0.6  # minimum title similarity to accept a Wikipedia hit
# === LOAD OPENAI API KEY ===
with open("api_key.txt", "r") as f:
    openai.api_key = f.read().strip()
# === GOOGLE SHEET CONNECTION ===
scope = ["https://www.googleapis.com/auth/spreadsheets"]
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, scope)
sheet = gspread.authorize(creds).open_by_url(SHEET_URL).sheet1
sheet_values = sheet.get_all_values()
# === FIND START INDEX (column N = index 13) ===
# First data row (header skipped) whose column N is still empty; if all
# are filled, start past the end so the loops below do nothing.
filled_n = [row[13] if len(row) > 13 else '' for row in sheet_values[1:]]
start = next((i + 1 for i, v in enumerate(filled_n, start=1) if not str(v).strip()), len(filled_n) + 1)
print(f"Starte bei Zeile {start+1}")
wikipedia.set_lang(LANG)
def similar(a, b):
    """Return a case-insensitive similarity ratio between two strings (0.0-1.0)."""
    left, right = a.lower(), b.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()
def clean_text(text):
    """Normalize a scraped infobox value.

    Strips bracketed references ("[1]"), parenthesised asides, and redundant
    whitespace. Returns "k.A." ("keine Angabe" / not available) for empty,
    missing, or fully-stripped input.
    """
    if not text:
        return "k.A."
    # Drop reference markers and parenthetical remarks, then collapse whitespace.
    stripped = re.sub(r'\[.*?\]', '', text)
    stripped = re.sub(r'\(.*?\)', '', stripped)
    stripped = re.sub(r'\s+', ' ', stripped).strip()
    return stripped or "k.A."
def extract_infobox_data(soup):
    """Extract industry ("Branche") and revenue ("Umsatz") from a Wikipedia infobox.

    Scans the rows of the first <table class="infobox"> in *soup*; a row is
    used when its <th> text contains one of the known header synonyms.
    Returns ("k.A.", "k.A.") when no infobox or no matching rows are found.
    """
    branche = "k.A."
    umsatz = "k.A."
    infobox = soup.find('table', class_='infobox')
    if not infobox:
        return branche, umsatz
    branche_terms = ('branche', 'tätigkeitsfeld', 'geschäftsfeld', 'sektor')
    for zeile in infobox.find_all('tr'):
        header = zeile.find('th')
        if not header:
            # Rows without a header cell carry no label we can match on.
            continue
        label = header.get_text().lower()
        if any(term in label for term in branche_terms):
            cell = zeile.find('td')
            if cell:
                branche = clean_text(cell.get_text())
        elif 'umsatz' in label:
            cell = zeile.find('td')
            if cell:
                # Value may look like "123,4 Mio. €"; clean_text normalizes it.
                umsatz = clean_text(cell.get_text())
    return branche, umsatz
def get_wikipedia_data(name, website_hint=""):
    """Look up a company on Wikipedia and return (article_url, branche, umsatz).

    Search terms are the full name, its first two words, and — when
    *website_hint* is given — the leading, non-generic part of the domain.
    A candidate article is accepted only if its title is sufficiently
    similar to *name* and, when a domain key is known, the article HTML
    mentions that key. Returns ("", "k.A.", "k.A.") when nothing matches.
    """
    begriffe = [name.strip(), " ".join(name.split()[:2])]
    domain_key = ""
    if website_hint:
        parts = website_hint.replace("https://", "").replace("http://", "").split(".")
        if len(parts) > 1:
            domain_key = parts[0]
            if domain_key not in ["www", "de", "com"]:  # ignore generic domain labels
                begriffe.append(domain_key)
    # FIX: drop duplicate search terms while keeping order (short names make
    # the full name and its two-word prefix identical).
    begriffe = list(dict.fromkeys(begriffe))
    for suchbegriff in begriffe:
        try:
            results = wikipedia.search(suchbegriff, results=5)
            for title in results:
                try:
                    page = wikipedia.page(title, auto_suggest=False)
                    # Reject articles whose title barely matches the company name.
                    if similar(page.title, name) < SIMILARITY_THRESHOLD:
                        continue
                    # FIX: bounded timeout so a stalled connection cannot
                    # hang the whole run (requests.get blocks forever by default).
                    response = requests.get(page.url, timeout=15)
                    html_content = response.text
                    if domain_key and domain_key.lower() not in html_content.lower():
                        continue
                    # Parse the article and pull data from its infobox.
                    soup = BeautifulSoup(html_content, 'html.parser')
                    branche, umsatz = extract_infobox_data(soup)
                    print(f"Gefunden: {page.title} - Branche: {branche}, Umsatz: {umsatz}")
                    return page.url, branche, umsatz
                except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError):
                    continue
                except Exception as e:
                    print(f"Fehler bei {title}: {str(e)}")
                    continue
        except Exception as e:
            print(f"Fehler bei Suche nach {suchbegriff}: {str(e)}")
            continue
    return "", "k.A.", "k.A."
# === PROCESSING (step 1: Wikipedia lookup per sheet row) ===
for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
    row = sheet_values[i]
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {row[0]}")
    # Safe access to the website column (rows may be shorter than expected).
    website = row[1] if len(row) > 1 else ""
    # Several attempts at fetching the Wikipedia data.
    for attempt in range(MAX_RETRIES):
        try:
            url, branche, umsatz = get_wikipedia_data(row[0], website)
            break
        except Exception as e:
            print(f"⚠️ Fehler bei Wikipedia-Abruf (Versuch {attempt+1}): {str(e)[:100]}")
            time.sleep(RETRY_DELAY)
            if attempt == MAX_RETRIES - 1:
                # Last attempt failed: fall back to placeholder values.
                url, branche, umsatz = "", "k.A.", "k.A."
    # Read the current sheet row so existing values are only replaced
    # when new data was actually found.
    current_values = sheet.row_values(i+1)
    # Values for columns G..Q of this row (11 cells).
    values = [
        branche if branche != "k.A." else (current_values[6] if len(current_values) > 6 else "k.A."),
        "k.A.",  # LinkedIn industry stays unchanged here
        umsatz if umsatz != "k.A." else (current_values[8] if len(current_values) > 8 else "k.A."),
        "k.A.", "k.A.", "k.A.",  # the remaining columns stay unchanged
        url if url else (current_values[12] if len(current_values) > 12 else ""),
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "k.A.", "k.A.",
        VERSION
    ]
    # Write the row back to the sheet.
    sheet.update(range_name=f"G{i+1}:Q{i+1}", values=[values])
    print(f"✅ Aktualisiert: Branche: {values[0]}, Umsatz: {values[2]}, URL: {values[6]}")
    time.sleep(RETRY_DELAY)
print("\n✅ Wikipedia-Auswertung abgeschlossen")
# === STEP 2: GPT CLASSIFICATION ===
def classify_company(row, wikipedia_url=""):
    """Ask GPT to classify one company row; return exactly 8 CSV fields.

    Fields: Wikipedia-Branche, LinkedIn-Branche, Umsatz, Neueinstufung,
    Begründung, FSM-Relevanz, Techniker-Einschätzung, Techniker-Begründung.
    Falls back to "k.A." placeholders after MAX_RETRIES failed API calls,
    and pads/truncates malformed GPT answers so callers can always unpack
    eight values. Every raw answer is appended to LOG_CSV.
    """
    user_prompt = {
        "role": "user",
        "content": f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]}\nWikipedia-Link: {wikipedia_url}"
    }
    for attempt in range(MAX_RETRIES):
        try:
            response = openai.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content": (
                            "Du bist ein Experte für Brancheneinstufung und FSM-Potenzialbewertung.\n"
                            "Bitte beziehe dich ausschließlich auf das konkret genannte Unternehmen.\n"
                            "FSM steht für Field Service Management. Ziel ist es, Unternehmen mit >50 Technikern im Außendienst zu identifizieren.\n\n"
                            "Struktur: Firmenname; Website; Ort; Aktuelle Einstufung; Beschreibung der Branche Extern\n\n"
                            "Gib deine Antwort im CSV-Format zurück (1 Zeile, 8 Spalten):\n"
                            "Wikipedia-Branche;LinkedIn-Branche;Umsatz (Mio €);Empfohlene Neueinstufung;Begründung;FSM-Relevanz;Techniker-Einschätzung;Techniker-Begründung"
                        )
                    },
                    user_prompt
                ],
                temperature=0,
                timeout=15
            )
            full_text = response.choices[0].message.content.strip()
            break
        except Exception as e:
            print(f"⚠️ GPT-Fehler (Versuch {attempt+1}): {str(e)[:100]}")
            time.sleep(RETRY_DELAY)
    else:
        # All retries failed: use placeholder values.
        print("❌ GPT 3x fehlgeschlagen Standardwerte")
        full_text = "k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A."
    # Pick the first line of the answer that looks like CSV (has semicolons).
    lines = full_text.splitlines()
    csv_line = next((l for l in lines if ";" in l), "")
    parts = [v.strip() for v in csv_line.split(";")] if csv_line else ["k.A."] * 8
    # FIX: GPT sometimes returns more or fewer than 8 columns; pad/truncate
    # so the caller's 8-way unpacking can never raise ValueError.
    parts = (parts + ["k.A."] * 8)[:8]
    # Append the raw answer to the audit log.
    with open(LOG_CSV, "a", newline="", encoding="utf-8") as log:
        writer = csv.writer(log, delimiter=";")
        writer.writerow([datetime.now().strftime("%Y-%m-%d %H:%M:%S"), row[0], *parts, full_text])
    return parts
# === RUN STEP 2 (GPT classification per sheet row) ===
for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
    row = sheet_values[i]
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] GPT-Bewertung für Zeile {i+1}: {row[0]}")
    # Wikipedia URL from column M (index 12).
    # NOTE(review): sheet_values was read once at startup, before step 1 ran,
    # so this sees the pre-run sheet state — TODO confirm that is intended.
    wiki_url = row[12] if len(row) > 12 else ""
    wiki, linkedin, umsatz_chat, new_cat, reason, fsm, techniker, techniker_reason = classify_company(row, wikipedia_url=wiki_url)
    # Values for columns G..P of this row (10 cells).
    values = [
        wiki,
        linkedin,
        umsatz_chat,
        new_cat,
        reason,
        fsm,
        wiki_url,
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        techniker,
        techniker_reason
    ]
    sheet.update(range_name=f"G{i+1}:P{i+1}", values=[values])
    time.sleep(RETRY_DELAY)
print("\n✅ GPT-Bewertung abgeschlossen")