Files
Brancheneinstufung2/brancheneinstufung.py
2025-03-31 11:33:49 +00:00

191 lines
6.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Neue Version (1.0.11) mit optimierter Wikipedia-Suchlogik, Original-Auswertung bleibt erhalten
import os
import time
import re
import gspread
import wikipedia
import requests
import openai
import csv
from bs4 import BeautifulSoup
from lxml import html as lh
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime
from difflib import SequenceMatcher
# === KONFIGURATION ===
VERSION = "1.0.11"
LANG = "de"
CREDENTIALS = "service_account.json"
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
DURCHLÄUFE = int(input("Wieviele Zeilen sollen überprüft werden? "))
MAX_RETRIES = 3
RETRY_DELAY = 5
LOG_CSV = "gpt_antworten_log.csv"
SIMILARITY_THRESHOLD = 0.6
# === OpenAI API-KEY LADEN ===
with open("api_key.txt", "r") as f:
openai.api_key = f.read().strip()
# === GOOGLE SHEET VERBINDUNG ===
scope = ["https://www.googleapis.com/auth/spreadsheets"]
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, scope)
sheet = gspread.authorize(creds).open_by_url(SHEET_URL).sheet1
sheet_values = sheet.get_all_values()
# === STARTINDEX SUCHEN (Spalte N = Index 13) ===
filled_n = [row[13] if len(row) > 13 else '' for row in sheet_values[1:]]
start = next((i + 1 for i, v in enumerate(filled_n, start=1) if not str(v).strip()), len(filled_n) + 1)
print(f"Starte bei Zeile {start+1}")
wikipedia.set_lang(LANG)
def similar(a, b):
return SequenceMatcher(None, a.lower(), b.lower()).ratio()
def get_wikipedia_data(name, website_hint=""):
begriffe = [name.strip(), " ".join(name.split()[:2])]
domain_key = ""
if website_hint:
parts = website_hint.replace("https://", "").replace("http://", "").split(".")
if len(parts) > 1:
domain_key = parts[0]
begriffe.append(domain_key)
for suchbegriff in begriffe:
try:
results = wikipedia.search(suchbegriff, results=5)
except:
continue
for title in results:
try:
page = wikipedia.page(title, auto_suggest=False)
html_raw = requests.get(page.url).text
if domain_key and domain_key not in html_raw.lower():
continue
if similar(page.title, name) < SIMILARITY_THRESHOLD:
continue
soup = BeautifulSoup(html_raw, 'html.parser')
infobox = soup.find("table", class_="infobox")
branche = umsatz = ""
if infobox:
for row in infobox.find_all("tr"):
th, td = row.find("th"), row.find("td")
if not th or not td:
continue
if "branche" in th.text.lower():
branche = td.text.strip()
if "umsatz" in th.text.lower():
umsatz_raw = td.text.strip()
umsatz = re.sub(r"\[[^\]]*\]", "", umsatz_raw)
if not branche:
cats = page.categories
branche = cats[0] if cats else "k.A."
return page.url, branche or "k.A.", umsatz or "k.A."
except:
continue
return "", "k.A.", "k.A."
# === VERARBEITUNG ===
for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
row = sheet_values[i]
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {row[0]}")
url, branche, umsatz = get_wikipedia_data(row[0], row[1])
branche_final = branche if url else "k.A."
umsatz_final = umsatz if url else "k.A."
values = [
branche_final,
"k.A.",
umsatz_final,
"k.A.", "k.A.", "k.A.",
url,
datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"k.A.", "k.A.",
VERSION
]
sheet.update(range_name=f"G{i+1}:Q{i+1}", values=[values])
print(f"✅ Aktualisiert: {values[:3]}...")
time.sleep(RETRY_DELAY)
print("\n✅ Wikipedia-Auswertung abgeschlossen")
# === SCHRITT 2: GPT-BEWERTUNG ===
def classify_company(row, wikipedia_url=""):
user_prompt = {
"role": "user",
"content": f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]}\nWikipedia-Link: {wikipedia_url}"
}
for attempt in range(MAX_RETRIES):
try:
response = openai.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{
"role": "system",
"content": (
"Du bist ein Experte für Brancheneinstufung und FSM-Potenzialbewertung.\n"
"Bitte beziehe dich ausschließlich auf das konkret genannte Unternehmen.\n"
"FSM steht für Field Service Management. Ziel ist es, Unternehmen mit >50 Technikern im Außendienst zu identifizieren.\n\n"
"Struktur: Firmenname; Website; Ort; Aktuelle Einstufung; Beschreibung der Branche Extern\n\n"
"Gib deine Antwort im CSV-Format zurück (1 Zeile, 8 Spalten):\n"
"Wikipedia-Branche;LinkedIn-Branche;Umsatz (Mio €);Empfohlene Neueinstufung;Begründung;FSM-Relevanz;Techniker-Einschätzung;Techniker-Begründung"
)
},
user_prompt
],
temperature=0,
timeout=15
)
full_text = response.choices[0].message.content.strip()
break
except Exception as e:
print(f"⚠️ GPT-Fehler (Versuch {attempt+1}): {str(e)[:100]}")
time.sleep(RETRY_DELAY)
else:
print("❌ GPT 3x fehlgeschlagen Standardwerte")
full_text = "k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A."
lines = full_text.splitlines()
csv_line = next((l for l in lines if ";" in l), "")
parts = [v.strip() for v in csv_line.split(";")] if csv_line else ["k.A."] * 8
with open(LOG_CSV, "a", newline="", encoding="utf-8") as log:
writer = csv.writer(log, delimiter=";")
writer.writerow([datetime.now().strftime("%Y-%m-%d %H:%M:%S"), row[0], *parts, full_text])
return parts
# === SCHRITT 2 DURCHFÜHREN ===
for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
row = sheet_values[i]
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] GPT-Bewertung für Zeile {i+1}: {row[0]}")
wiki_url = row[12] if len(row) > 12 else ""
wiki, linkedin, umsatz_chat, new_cat, reason, fsm, techniker, techniker_reason = classify_company(row, wikipedia_url=wiki_url)
values = [
wiki,
linkedin,
umsatz_chat,
new_cat,
reason,
fsm,
wiki_url,
datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
techniker,
techniker_reason
]
sheet.update(range_name=f"G{i+1}:P{i+1}", values=[values])
time.sleep(RETRY_DELAY)
print("\n✅ GPT-Bewertung abgeschlossen")