# NOTE: page-metadata artifacts ("188 lines / 6.6 KiB / Python") removed; source starts below.
import os
|
|
import time
|
|
import csv
|
|
import re
|
|
import gspread
|
|
import openai
|
|
import wikipedia
|
|
from bs4 import BeautifulSoup
|
|
import requests
|
|
from oauth2client.service_account import ServiceAccountCredentials
|
|
from datetime import datetime
|
|
|
|
# === CONFIGURATION ===
# File and credential locations.
EXCEL = "Bestandsfirmen.xlsx"
CREDENTIALS = "service_account.json"
LOG_CSV = "gpt_antworten_log.csv"

# Target spreadsheet and Wikipedia language.
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
LANG = "de"

# Retry policy for network/API calls.
MAX_RETRIES = 3
RETRY_DELAY = 5

# Number of rows to process in this run; asked from the operator at startup.
DURCHLÄUFE = int(input("Wieviele Zeilen sollen überprüft werden? "))
# === OpenAI INIT ===
# The API key is kept out of the source tree in a plain text file.
with open("api_key.txt", "r") as key_file:
    openai.api_key = key_file.read().strip()

# === GOOGLE SHEETS ===
# Authorize the service account, open the first worksheet and pull the
# whole grid into memory once (all later reads work on this snapshot).
scope = ["https://www.googleapis.com/auth/spreadsheets"]
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, scope)
client = gspread.authorize(creds)
sheet = client.open_by_url(SHEET_URL).sheet1
sheet_values = sheet.get_all_values()
# === WIKIPEDIA CONFIG ===
wikipedia.set_lang(LANG)

# Keyword whitelist used to accept a Wikipedia category as a fallback
# industry classification; matched case-insensitively as substrings.
WHITELIST_KATEGORIEN = [
    "unternehmen",
    "hersteller",
    "produktion",
    "industrie",
    "maschinenbau",
    "technik",
    "dienstleistung",
    "chemie",
    "pharma",
    "elektro",
    "medizin",
    "bau",
    "energie",
    "logistik",
    "automobil",
    "handel",
    "textil",
    "klima",
]
# === SYSTEM PROMPT ===
branches = [...]  # TODO: insert the industry list here

# First chat turn sent with every request; pins GPT to a fixed
# semicolon-separated answer format so the reply can be split into columns.
system_prompt = {
    "role": "system",
    "content": (
        "Du bist ein Experte für Brancheneinstufung. Beantworte ausschließlich "
        "basierend auf den gegebenen Unternehmensdaten. Format: "
        "Wikipedia-Branche;LinkedIn-Branche;Umsatz (Mio €);Empfohlene Neueinstufung;"
        "Begründung;FSM-Relevanz;Techniker-Einschätzung;Techniker-Begründung"
    ),
}
def extract_domain(url):
    """Return the company's domain key (second-level domain label) from a URL.

    Examples: "https://www.example.de/impressum" -> "example",
    "example.com" -> "example".

    Args:
        url: Website address, with or without a scheme.

    Returns:
        The first dotted label of the host after stripping a leading "www.".
    """
    if not url.startswith("http"):
        url = f"https://{url}"
    host = url.split("//")[-1].split("/")[0]
    # Bug fix: the old code took the first dotted label unconditionally,
    # which yielded the meaningless key "www" for the common
    # "www.example.tld" form and made the later substring validation in
    # validate_wikipedia_content match almost any article.
    if host.lower().startswith("www."):
        host = host[4:]
    return host.split(".")[0]
def validate_wikipedia_content(content, name, domain):
    """Heuristically check that a Wikipedia article is about the company.

    Accepts the article when either of the first two words of the company
    name, or the website domain key, occurs anywhere in the article text
    (case-insensitive substring match).

    Args:
        content: Full article plain text.
        name: Company name as listed in the sheet.
        domain: Domain key from extract_domain(), may be "".

    Returns:
        True when the article plausibly belongs to the company.
    """
    # Bug fix: the original `return (` expression was never closed — a
    # SyntaxError that prevented the module from being imported at all.
    content_lower = content.lower()
    name_fragments = name.lower().split()[:2]
    return bool(
        any(frag in content_lower for frag in name_fragments)
        or (domain and domain.lower() in content_lower)
    )
def parse_infobox(soup):
    """Pull industry ("Branche") and revenue ("Umsatz") from a Wikipedia infobox.

    Iterates the article's table rows and matches known German header
    labels. Revenue is only accepted when quoted in millions ("Mio") and is
    normalised to use a decimal dot.

    Args:
        soup: BeautifulSoup document of the article HTML.

    Returns:
        Tuple (branche, umsatz) of strings; either may be "" when missing.
    """
    branche = ""
    umsatz = ""

    for table_row in soup.find_all("tr"):
        header_cell = table_row.find("th")
        value_cell = table_row.find("td")
        # Truthiness (not an `is None` check) is deliberate: empty bs4 tags
        # are falsy and such rows carry no usable data.
        if not header_cell or not value_cell:
            continue

        header = header_cell.get_text(strip=True).lower()
        value = value_cell.get_text(separator=" ", strip=True)

        # Industry row: several header spellings occur in the wild.
        for marker in ("branche", "industrie", "tätigkeitsfeld"):
            if marker in header:
                branche = value
                break

        # Revenue row: accept only figures stated in millions and extract
        # the leading number, converting a decimal comma to a dot.
        if "umsatz" in header and "mio" in value.lower():
            match = re.search(r"(\d{1,3}(?:[.,]\d{3})*(?:[.,]\d+)?)", value)
            if match is not None:
                umsatz = match.group(1).replace(",", ".")

    return branche, umsatz
def get_wikipedia_data(name, website):
    """Look up a company on Wikipedia and return (url, branche, umsatz).

    Searches for the company name, validates each candidate article against
    the company name/domain, and parses the article infobox. When the
    infobox carries no industry row, falls back to the first article
    category matching WHITELIST_KATEGORIEN.

    Args:
        name: Company name from the sheet.
        website: Company website (may be empty).

    Returns:
        Tuple (page_url, branche, umsatz); ("", "k.A.", "k.A.") when no
        usable article was found after MAX_RETRIES attempts.
    """
    domain = extract_domain(website) if website else ""

    for attempt in range(MAX_RETRIES):
        try:
            results = wikipedia.search(name, results=3)
            for title in results:
                try:
                    page = wikipedia.page(title, auto_suggest=False)
                    # Skip articles that do not look like this company.
                    if not validate_wikipedia_content(page.content, name, domain):
                        continue

                    # Bug fix: requests.get() without a timeout can hang the
                    # whole batch run indefinitely on a stalled connection.
                    html = requests.get(page.url, timeout=15).text
                    soup = BeautifulSoup(html, "html.parser")
                    branche, umsatz = parse_infobox(soup)

                    # Fallback: derive the industry from article categories.
                    if not branche:
                        for cat in page.categories:
                            if any(kw in cat.lower() for kw in WHITELIST_KATEGORIEN):
                                branche = cat
                                break

                    return page.url, branche or "k.A.", umsatz or "k.A."

                # Bug fix: DisambiguationError previously escaped to the
                # outer handler and burned a whole retry (plus a sleep);
                # skip the ambiguous title instead, like PageError.
                except (wikipedia.exceptions.PageError,
                        wikipedia.exceptions.DisambiguationError):
                    continue

        except Exception as e:
            # Best-effort: log a truncated error and retry after a pause.
            print(f"⚠️ Wikipedia-Fehler ({name}): {str(e)[:100]}")
            time.sleep(RETRY_DELAY)

    return "", "k.A.", "k.A."
def query_gpt(row, wiki_url):
    """Ask GPT to classify one company row; return the raw semicolon reply.

    Retries up to MAX_RETRIES times with a pause between attempts and falls
    back to an all-"k.A." record so downstream column parsing never breaks.

    Args:
        row: Sheet row (list of cell strings); columns 0,1,2,4,5 are sent.
        wiki_url: Wikipedia article URL, possibly "".

    Returns:
        The model's answer string, or the "k.A." placeholder record.
    """
    user_content = f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]}\nWikipedia: {wiki_url}"

    for attempt in range(MAX_RETRIES):
        try:
            response = openai.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[system_prompt, {"role": "user", "content": user_content}],
                temperature=0,
                timeout=15,
            )
        except Exception as e:
            # Log a truncated error and wait before the next attempt.
            print(f"⚠️ GPT-Fehler (Versuch {attempt+1}): {str(e)[:100]}")
            time.sleep(RETRY_DELAY)
        else:
            return response.choices[0].message.content.strip()

    print("❌ GPT-Abfrage fehlgeschlagen")
    return "k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A."
# === MAIN LOGIC ===
# Resume at the first data row whose column N (index 13) is still empty;
# row 0 is the header, hence the enumerate offset.
start_index = next((i for i, row in enumerate(sheet_values[1:], start=1) if not row[13].strip()), 1)

for i in range(start_index, min(start_index + DURCHLÄUFE, len(sheet_values))):
    row = sheet_values[i]
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {row[0]}")

    # Wikipedia lookup: article URL, industry, revenue.
    wiki_url, wiki_branche, wiki_umsatz = get_wikipedia_data(row[0], row[1])

    # GPT classification, normalised to exactly 8 fields.
    gpt_response = query_gpt(row, wiki_url)
    gpt_data = [x.strip('"') for x in gpt_response.split(";")][:8]
    # Bug fix: a short GPT reply (fewer than 8 semicolon fields) used to
    # crash with IndexError below — pad with the "k.A." placeholder.
    gpt_data += ["k.A."] * (8 - len(gpt_data))

    # Only trust Wikipedia figures when an article was actually found.
    final_branche = wiki_branche if wiki_url else "k.A."
    final_umsatz = wiki_umsatz if wiki_url else "k.A."

    # Target columns G..P of the current sheet row.
    update_values = [
        final_branche,                                  # G: Wikipedia industry
        gpt_data[1],                                    # H: LinkedIn industry
        final_umsatz,                                   # I: revenue (Mio €)
        gpt_data[3],                                    # J: recommended reclassification
        gpt_data[4],                                    # K: reasoning
        gpt_data[5],                                    # L: FSM relevance
        wiki_url,                                       # M: Wikipedia URL
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),   # N: processed timestamp
        gpt_data[6],                                    # O: technician assessment
        gpt_data[7],                                    # P: technician reasoning
    ]

    sheet.update(
        range_name=f"G{i+1}:P{i+1}",
        values=[update_values]
    )
    print(f"✅ Aktualisiert: {update_values[:3]}...")
    # Throttle to stay inside the Sheets API rate limit.
    time.sleep(RETRY_DELAY)

print("\n✅ Prozess erfolgreich abgeschlossen")