Files
Brancheneinstufung2/brancheneinstufung.py
Floke 80a70fcf3b Code in zwei Bereiche aufgeteilt
Aufteilung des Codes in zwei unabhängige Verarbeitungsschritte.

Wikipedia-Branche und Umsatz werden nur geschrieben, wenn Wikipedia-URL vorhanden ist.

GPT-Aufruf überarbeitet (inkl. Timeout und Retry-Logik).

gpt_antworten_log.csv wird mit Zeitstempel aktualisiert.
2025-03-30 17:42:46 +00:00

193 lines
7.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Schritt 1: Nur Wikipedia-Daten extrahieren und in Google Sheet schreiben
import os
import time
import re
import gspread
import wikipedia
import requests
import openai
import csv
from bs4 import BeautifulSoup
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime
# === KONFIGURATION ===
LANG = "de"
CREDENTIALS = "service_account.json"
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
DURCHLÄUFE = int(input("Wieviele Zeilen sollen überprüft werden? "))
MAX_RETRIES = 3
RETRY_DELAY = 5
LOG_CSV = "gpt_antworten_log.csv"
# === OpenAI API-KEY LADEN ===
with open("api_key.txt", "r") as f:
openai.api_key = f.read().strip()
# === GOOGLE SHEET VERBINDUNG ===
scope = ["https://www.googleapis.com/auth/spreadsheets"]
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, scope)
sheet = gspread.authorize(creds).open_by_url(SHEET_URL).sheet1
sheet_values = sheet.get_all_values()
# === STARTINDEX SUCHEN (Spalte N = Index 13) ===
filled_n = [row[13] if len(row) > 13 else '' for row in sheet_values[1:]]
start = next((i + 1 for i, v in enumerate(filled_n, start=1) if not str(v).strip()), len(filled_n) + 1)
print(f"Starte bei Zeile {start+1}")
wikipedia.set_lang(LANG)
# === DOMAIN SCHLÜSSEL ===
def extract_domain_key(url):
if not url:
return ""
clean_url = url.replace("https://", "").replace("http://", "").split("/")[0]
parts = clean_url.split(".")
return parts[0] if len(parts) > 1 else ""
# === INFOBOX-PARSING ===
def parse_infobox(soup):
infobox = soup.find("table", class_=["infobox", "infobox vcard"])
branche = umsatz = ""
if infobox:
for row in infobox.find_all("tr"):
th, td = row.find("th"), row.find("td")
if not th or not td:
continue
if "branche" in th.text.lower():
branche = td.get_text(separator=" ", strip=True)
if "umsatz" in th.text.lower():
umsatz_text = td.get_text(strip=True)
if "Mio" in umsatz_text:
match = re.search(r"(\d+[\d.,]*)\s*Mio", umsatz_text)
if match:
umsatz = match.group(1).replace(",", ".")
return branche, umsatz
# === WIKIPEDIA DATEN ===
WHITELIST_KATEGORIEN = [
"unternehmen", "hersteller", "produktion", "industrie",
"maschinenbau", "technik", "dienstleistung", "chemie",
"pharma", "elektro", "medizin", "bau", "energie",
"logistik", "automobil"
]
def get_wikipedia_data(name, website_hint=""):
domain_key = extract_domain_key(website_hint)
search_terms = [name, domain_key] if domain_key else [name]
for term in search_terms:
if not term:
continue
for attempt in range(MAX_RETRIES):
try:
results = wikipedia.search(term, results=3)
for title in results:
try:
page = wikipedia.page(title, auto_suggest=False)
html = requests.get(page.url, timeout=10).text
if name.split()[0].lower() in page.content.lower() or (domain_key and domain_key.lower() in html.lower()):
soup = BeautifulSoup(html, "html.parser")
branche, umsatz = parse_infobox(soup)
if not branche:
for category in page.categories:
if any(kw in category.lower() for kw in WHITELIST_KATEGORIEN):
branche = category
break
return page.url, branche or "k.A.", umsatz or "k.A."
except:
continue
except Exception as e:
print(f"⚠️ Wikipedia-Fehler ({term}, Versuch {attempt+1}): {str(e)[:100]}")
time.sleep(RETRY_DELAY)
return "", "k.A.", "k.A."
# === SCHRITT 1: WIKIPEDIA VERARBEITUNG ===
for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
row = sheet_values[i]
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {row[0]}")
url, wiki_branche, umsatz = get_wikipedia_data(row[0], row[1])
wiki_final = wiki_branche if url else "k.A."
umsatz_final = umsatz if url else "k.A."
values = [
wiki_final,
"k.A.", # LinkedIn-Branche leer
umsatz_final,
"k.A.", "k.A.", "k.A.",
url,
datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"k.A.", "k.A."
]
sheet.update(range_name=f"G{i+1}:P{i+1}", values=[values])
print(f"✅ Aktualisiert: {values[:3]}...")
time.sleep(RETRY_DELAY)
# === SCHRITT 2: GPT-BEWERTUNG ===
def classify_company(row, wikipedia_url=""):
user_prompt = {
"role": "user",
"content": f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]}\nWikipedia-Link: {wikipedia_url}"
}
for attempt in range(MAX_RETRIES):
try:
response = openai.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{
"role": "system",
"content": (
"Du bist ein Experte für Brancheneinstufung und FSM-Potenzialbewertung.\n"
"Bitte beziehe dich ausschließlich auf das konkret genannte Unternehmen.\n"
"FSM steht für Field Service Management. Ziel ist es, Unternehmen mit >50 Technikern im Außendienst zu identifizieren.\n\n"
"Struktur: Firmenname; Website; Ort; Aktuelle Einstufung; Beschreibung der Branche Extern\n\n"
"Gib deine Antwort im CSV-Format zurück (1 Zeile, 8 Spalten):\n"
"Wikipedia-Branche;LinkedIn-Branche;Umsatz (Mio €);Empfohlene Neueinstufung;Begründung;FSM-Relevanz;Techniker-Einschätzung;Techniker-Begründung"
)
},
user_prompt
],
temperature=0,
timeout=15
)
full_text = response.choices[0].message.content.strip()
break
except Exception as e:
print(f"⚠️ GPT-Fehler (Versuch {attempt+1}): {str(e)[:100]}")
time.sleep(RETRY_DELAY)
else:
print("❌ GPT 3x fehlgeschlagen Standardwerte")
full_text = "k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A."
lines = full_text.splitlines()
csv_line = next((l for l in lines if ";" in l), "")
parts = [v.strip() for v in csv_line.split(";")] if csv_line else ["k.A."] * 8
with open(LOG_CSV, "a", newline="", encoding="utf-8") as log:
writer = csv.writer(log, delimiter=";")
writer.writerow([datetime.now().strftime("%Y-%m-%d %H:%M:%S"), row[0], *parts, full_text])
return parts
# === SCHRITT 2 DURCHFÜHREN ===
for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
row = sheet_values[i]
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] GPT-Bewertung für Zeile {i+1}: {row[0]}")
wiki_url = row[12] if len(row) > 12 else ""
wiki, linkedin, umsatz_chat, new_cat, reason, fsm, techniker, techniker_reason = classify_company(row, wikipedia_url=wiki_url)
values = [
wiki,
linkedin,
umsatz_chat,
new_cat,
reason,
fsm,
wiki_url,
datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
techniker,
techniker_reason
]
sheet.update(range_name=f"G{i+1}:P{i+1}", values=[values])
time.sleep(RETRY_DELAY)
print("\n✅ GPT-Bewertung abgeschlossen")