Files
Brancheneinstufung2/brancheneinstufung.py
Floke 21d9eb41ef Wiki Debugging
Wikipedia-Validierung über validate_wikipedia_page ergänzt (stellt sicher, dass Seiteninhalt oder Domain zum Firmennamen passen).

Fallback-Parsing parse_infobox_with_fallback() eingebaut für robustere Extraktion von Branche/Umsatz.

Branchenbegriff-Erkennung verbessert (Synonyme wie „Tätigkeitsfeld“, „Industriezweig“ etc.).

Bedingte Auswertung verbessert: Wenn kein Wikipedia-Link → "k.A." für Branche/Umsatz.

Selektor-basierte Extraktion entfernt, da sie fehleranfällig war → durch flexible Infobox-Logik ersetzt.
2025-03-30 18:25:03 +00:00

210 lines
7.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Schritt 1: Nur Wikipedia-Daten extrahieren und in Google Sheet schreiben
import os
import time
import re
import gspread
import wikipedia
import requests
import openai
import csv
from bs4 import BeautifulSoup
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime
# === KONFIGURATION ===
# Wikipedia language edition used for all lookups.
LANG = "de"
# Service-account key file used to authorise against Google Sheets.
CREDENTIALS = "service_account.json"
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
# Number of sheet rows to process in this run (asked interactively at start-up).
DURCHLÄUFE = int(input("Wieviele Zeilen sollen überprüft werden? "))
# Attempts per remote call and pause (seconds, passed to time.sleep) between them.
MAX_RETRIES = 3
RETRY_DELAY = 5
# CSV file that receives one log line per GPT answer.
LOG_CSV = "gpt_antworten_log.csv"

# === Load OpenAI API key ===
with open("api_key.txt", "r") as f:
    openai.api_key = f.read().strip()

# === Google Sheet connection ===
scope = ["https://www.googleapis.com/auth/spreadsheets"]
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, scope)
sheet = gspread.authorize(creds).open_by_url(SHEET_URL).sheet1
sheet_values = sheet.get_all_values()

# === Find start index (column N = index 13) ===
# Column N holds the timestamp written by a previous run; the first row where
# it is empty is the first row that still has to be processed.
filled_n = [row[13] if len(row) > 13 else '' for row in sheet_values[1:]]
# `enumerate(..., start=1)` already yields the index into `sheet_values`
# (element 0 is the header row).  The processing loops use `sheet_values[i]`
# and write to sheet row `i + 1`, so `start` must be that index itself;
# adding 1 here would skip the first unprocessed row.
start = next(
    (i for i, v in enumerate(filled_n, start=1) if not str(v).strip()),
    len(filled_n) + 1,
)
print(f"Starte bei Zeile {start+1}")
wikipedia.set_lang(LANG)
# === DOMAIN SCHLÜSSEL ===
def extract_domain_key(url):
    """Return the first host label of *url* as a short search/validation key.

    The scheme (``http://`` / ``https://``), any path and a leading ``www.``
    are removed before the first dot-separated label is taken, e.g.
    ``"https://www.example.com/about"`` -> ``"example"``.

    Args:
        url: Website address as stored in the sheet; may be empty or ``None``.

    Returns:
        The first label of the host name, or ``""`` when *url* is empty or
        the host contains no dot.
    """
    if not url:
        return ""
    host = url.strip().replace("https://", "").replace("http://", "").split("/")[0]
    # Without this, every "www.*" address would yield the useless key "www",
    # which matches almost any page text.
    if host.lower().startswith("www."):
        host = host[4:]
    parts = host.split(".")
    return parts[0] if len(parts) > 1 else ""
# === INFOBOX-PARSING ===
def parse_infobox_with_fallback(soup):
    """Extract industry and revenue (in millions) from a page's infobox table.

    Looks up the first ``<table>`` with class ``infobox`` and scans its rows.
    A row whose header contains one of the industry synonyms supplies the
    industry; a row whose header contains "umsatz" and whose value holds a
    number followed by "Mio" supplies the revenue.  When several rows match,
    the last one wins.

    Args:
        soup: Parsed page offering ``find``/``find_all``/``get_text`` in the
            style of BeautifulSoup.

    Returns:
        Tuple ``(branche, umsatz)``; each element is ``"k.A."`` when nothing
        was found.  ``umsatz`` is a string with ``.`` as decimal separator
        and without thousands separators (``"1.234,5 Mio."`` -> ``"1234.5"``).
    """
    branche = "k.A."
    umsatz = "k.A."
    infobox = soup.find("table", class_="infobox")
    if not infobox:
        return branche, umsatz

    branche_labels = ("branche", "tätigkeitsfeld", "industriezweig", "wirtschaftszweig")
    for row in infobox.find_all("tr"):
        th = row.find("th")
        td = row.find("td")
        if not th or not td:
            continue
        label = th.get_text(strip=True).lower()
        value = td.get_text(strip=True)
        if any(b in label for b in branche_labels):
            branche = value
        if "umsatz" in label:
            # Case-insensitive so that "mio." and "Mio." are treated alike.
            match = re.search(r"(\d[\d.,]*)\s*Mio", value, re.IGNORECASE)
            if match:
                number = match.group(1).rstrip(".,")
                if "," in number:
                    # German notation: '.' groups thousands, ',' is the decimal mark.
                    number = number.replace(".", "").replace(",", ".")
                elif re.fullmatch(r"\d{1,3}(?:\.\d{3})+", number):
                    # Dots used purely as thousands separators, e.g. "2.500".
                    number = number.replace(".", "")
                umsatz = number
    return branche, umsatz
# === WIKIPEDIA DATEN ===
# Lower-case keywords; a Wikipedia category containing any of them is accepted
# as an industry fallback when the infobox yields none.
WHITELIST_KATEGORIEN = [
    "unternehmen",
    "hersteller",
    "produktion",
    "industrie",
    "maschinenbau",
    "technik",
    "dienstleistung",
    "chemie",
    "pharma",
    "elektro",
    "medizin",
    "bau",
    "energie",
    "logistik",
    "automobil",
]
def validate_wikipedia_page(content, name, domain_key):
    """Check whether a Wikipedia page plausibly belongs to the given company.

    The page is accepted when one of the first two whitespace-separated words
    of *name*, or the *domain_key*, occurs in *content* (case-insensitive
    substring match).

    Args:
        content: Plain text of the Wikipedia page.
        name: Company name as stored in the sheet.
        domain_key: Short key derived from the company website; may be empty
            or ``None``.

    Returns:
        ``True`` if the page matches, otherwise ``False`` (always a bool).
    """
    haystack = content.lower()  # lower-case once instead of once per fragment
    name_fragments = name.lower().split()[:2]
    if any(frag in haystack for frag in name_fragments):
        return True
    return bool(domain_key) and domain_key.lower() in haystack
def get_wikipedia_data(name, website_hint=""):
    """Find a matching Wikipedia page and extract industry and revenue.

    Searches Wikipedia for *name* and, if a domain key can be derived from
    *website_hint*, for that key as well.  For each search the top three hits
    are checked with ``validate_wikipedia_page``; the first accepted page is
    parsed with ``parse_infobox_with_fallback``.  If the infobox gives no
    industry, the first page category containing a keyword from
    ``WHITELIST_KATEGORIEN`` is used instead.

    Args:
        name: Company name to search for.
        website_hint: Company website used to derive an additional search
            term and validation key.

    Returns:
        Tuple ``(url, branche, umsatz)``.  ``url`` is ``""`` and both other
        values are ``"k.A."`` when no matching page was found.
    """
    domain_key = extract_domain_key(website_hint)
    search_terms = [name, domain_key] if domain_key else [name]
    for term in search_terms:
        if not term:
            continue
        for attempt in range(MAX_RETRIES):
            try:
                results = wikipedia.search(term, results=3)
            except Exception as e:
                # Only a failed search is worth retrying.
                print(f"⚠️ Wikipedia-Fehler ({term}, Versuch {attempt+1}): {str(e)[:100]}")
                time.sleep(RETRY_DELAY)
                continue
            for title in results:
                try:
                    page = wikipedia.page(title, auto_suggest=False)
                    # Validate before downloading the HTML so rejected pages
                    # do not cost an extra request.
                    if not validate_wikipedia_page(page.content, name, domain_key):
                        continue
                    html = requests.get(page.url, timeout=10).text
                    soup = BeautifulSoup(html, "html.parser")
                    branche, umsatz = parse_infobox_with_fallback(soup)
                    if not branche or branche == "k.A.":
                        for category in page.categories:
                            if any(kw in category.lower() for kw in WHITELIST_KATEGORIEN):
                                branche = category
                                break
                    return page.url, branche or "k.A.", umsatz or "k.A."
                except Exception:
                    # Best effort: a page that cannot be loaded or parsed is
                    # skipped.  `Exception` (not a bare except) keeps
                    # KeyboardInterrupt/SystemExit working.
                    continue
            # The search itself succeeded; repeating it would return the same
            # hits, so move on to the next search term.
            break
    return "", "k.A.", "k.A."
# === SCHRITT 1: WIKIPEDIA VERARBEITUNG ===
# Process up to DURCHLÄUFE rows beginning at `start`: look up Wikipedia data
# for each company and write it to columns G..P of the corresponding sheet row.
for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
    row = sheet_values[i]
    # Guard against rows shorter than two columns.
    name = row[0] if row else ""
    website = row[1] if len(row) > 1 else ""
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {name}")
    url, wiki_branche, umsatz = get_wikipedia_data(name, website)
    # Without a Wikipedia link, industry and revenue are reported as unknown.
    wiki_final = wiki_branche if url else "k.A."
    umsatz_final = umsatz if url else "k.A."
    values = [
        wiki_final,
        "k.A.",  # LinkedIn industry not determined in this step
        umsatz_final,
        "k.A.", "k.A.", "k.A.",
        url,
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "k.A.", "k.A."
    ]
    sheet.update(range_name=f"G{i+1}:P{i+1}", values=[values])
    # Mirror the written cells (G..P = indices 6..15) in the in-memory copy so
    # later steps that read `sheet_values` (e.g. the Wikipedia URL in column
    # M / index 12) see the fresh data instead of the snapshot taken at start.
    row.extend([""] * (16 - len(row)))
    row[6:16] = values
    print(f"✅ Aktualisiert: {values[:3]}...")
    time.sleep(RETRY_DELAY)
print("\n✅ Wikipedia-Auswertung abgeschlossen")
# === SCHRITT 2: GPT-BEWERTUNG ===
def classify_company(row, wikipedia_url=""):
    """Ask the OpenAI chat model to classify one company; return 8 fields.

    The user message consists of ``row[0]``, ``row[1]``, ``row[2]``,
    ``row[4]`` and ``row[5]`` joined by ``;`` (described to the model as
    company name, website, location, current classification and external
    industry description) plus the Wikipedia link.  The first line of the
    answer that contains a ``;`` is split into fields.  Every call is appended
    to the ``LOG_CSV`` file together with the raw answer.

    Args:
        row: Sheet row as a list of strings; needs at least six columns.
        wikipedia_url: Wikipedia link passed along to the model.

    Returns:
        List of exactly eight strings in the order Wikipedia industry,
        LinkedIn industry, revenue, recommended classification, reason, FSM
        relevance, technician estimate, technician reason.  Missing fields
        are filled with ``"k.A."``; surplus fields are dropped.
    """
    expected_fields = 8
    user_prompt = {
        "role": "user",
        "content": f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]}\nWikipedia-Link: {wikipedia_url}"
    }
    for attempt in range(MAX_RETRIES):
        try:
            response = openai.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content": (
                            "Du bist ein Experte für Brancheneinstufung und FSM-Potenzialbewertung.\n"
                            "Bitte beziehe dich ausschließlich auf das konkret genannte Unternehmen.\n"
                            "FSM steht für Field Service Management. Ziel ist es, Unternehmen mit >50 Technikern im Außendienst zu identifizieren.\n\n"
                            "Struktur: Firmenname; Website; Ort; Aktuelle Einstufung; Beschreibung der Branche Extern\n\n"
                            "Gib deine Antwort im CSV-Format zurück (1 Zeile, 8 Spalten):\n"
                            "Wikipedia-Branche;LinkedIn-Branche;Umsatz (Mio €);Empfohlene Neueinstufung;Begründung;FSM-Relevanz;Techniker-Einschätzung;Techniker-Begründung"
                        )
                    },
                    user_prompt
                ],
                temperature=0,
                timeout=15
            )
            # `content` can be None; treat that as an empty answer.
            full_text = (response.choices[0].message.content or "").strip()
            break
        except Exception as e:
            print(f"⚠️ GPT-Fehler (Versuch {attempt+1}): {str(e)[:100]}")
            time.sleep(RETRY_DELAY)
    else:
        # All attempts failed: fall back to placeholder values.
        print("❌ GPT 3x fehlgeschlagen Standardwerte")
        full_text = "k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A."
    lines = full_text.splitlines()
    csv_line = next((l for l in lines if ";" in l), "")
    parts = [v.strip() for v in csv_line.split(";")] if csv_line else []
    # The caller unpacks exactly eight values, so pad or truncate the answer
    # instead of letting a malformed reply abort the whole run.
    parts = (parts + ["k.A."] * expected_fields)[:expected_fields]
    with open(LOG_CSV, "a", newline="", encoding="utf-8") as log:
        writer = csv.writer(log, delimiter=";")
        writer.writerow([datetime.now().strftime("%Y-%m-%d %H:%M:%S"), row[0], *parts, full_text])
    return parts
# === SCHRITT 2 DURCHFÜHREN ===
# Run the GPT classification for the same row window as step 1 and write the
# eight returned fields, the Wikipedia URL and a timestamp to columns G..P.
last_row = min(start + DURCHLÄUFE, len(sheet_values))
for i in range(start, last_row):
    row = sheet_values[i]
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] GPT-Bewertung für Zeile {i+1}: {row[0]}")
    # Column M (index 12) carries the Wikipedia URL, if the row is that long.
    wiki_url = "" if len(row) <= 12 else row[12]
    result = classify_company(row, wikipedia_url=wiki_url)
    (
        wiki,
        linkedin,
        umsatz_chat,
        new_cat,
        reason,
        fsm,
        techniker,
        techniker_reason,
    ) = result
    values = [wiki, linkedin, umsatz_chat, new_cat, reason, fsm]
    values.append(wiki_url)
    values.append(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    values.extend([techniker, techniker_reason])
    sheet.update(range_name=f"G{i+1}:P{i+1}", values=[values])
    # Pause between rows, as in step 1.
    time.sleep(RETRY_DELAY)
print("\n✅ GPT-Bewertung abgeschlossen")