Files
Brancheneinstufung2/brancheneinstufung.py
Floke 09885848ec Verbessert: Wikipedia-Suchlogik erneut optimiert (Version 1.0.2)
- Wieder eingeführt: Kombination aus vollständigem Firmennamen, ersten zwei Wörtern und Domain-Fragment zur Suche
- Erhöhte Trefferwahrscheinlichkeit durch Titel-, Inhalts-, Domain- und Ähnlichkeitsprüfung
- Info aus Infobox (Branche & Umsatz) wird bevorzugt; Kategorien nur als Fallback
- Version in Spalte Q geschrieben zur Nachverfolgbarkeit
2025-03-31 06:12:53 +00:00

229 lines
8.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Schritt 1: Nur Wikipedia-Daten extrahieren und in Google Sheet schreiben
import os
import time
import re
import gspread
import wikipedia
import requests
import openai
import csv
from bs4 import BeautifulSoup
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime
from difflib import SequenceMatcher
# === CONFIGURATION ===
VERSION = "1.0.2-wiki-only"          # written to column Q for traceability
LANG = "de"                          # German Wikipedia
CREDENTIALS = "service_account.json" # Google service-account key file
SHEET_URL = "https://docs.google.com/spreadsheets/d/1u_gHr9JUfmV1-iviRzbSe3575QEp7KLhK5jFV_gJcgo"
# Number of sheet rows to process this run (asked interactively on stdin).
DURCHLÄUFE = int(input("Wieviele Zeilen sollen überprüft werden? "))
MAX_RETRIES = 3   # retry count for Wikipedia / GPT calls
RETRY_DELAY = 5   # seconds between retries and between sheet writes
LOG_CSV = "gpt_antworten_log.csv"  # append-only log of raw GPT answers
# === LOAD OpenAI API KEY ===
with open("api_key.txt", "r") as f:
    openai.api_key = f.read().strip()
# === GOOGLE SHEET CONNECTION ===
scope = ["https://www.googleapis.com/auth/spreadsheets"]
creds = ServiceAccountCredentials.from_json_keyfile_name(CREDENTIALS, scope)
sheet = gspread.authorize(creds).open_by_url(SHEET_URL).sheet1
# Snapshot of the whole sheet; all later loops index into this copy.
sheet_values = sheet.get_all_values()
# === FIND START INDEX (column N = index 13): first data row with empty N ===
filled_n = [row[13] if len(row) > 13 else '' for row in sheet_values[1:]]
# NOTE(review): with enumerate(start=1) the first data row (sheet_values[1])
# yields i=1, so start=i+1=2 makes the loops begin at sheet_values[2] -- the
# row AFTER the first empty-N row. Looks like an off-by-one; confirm intent
# before changing.
start = next((i + 1 for i, v in enumerate(filled_n, start=1) if not str(v).strip()), len(filled_n) + 1)
print(f"Starte bei Zeile {start+1}")
wikipedia.set_lang(LANG)
# === DOMAIN SCHLÜSSEL ===
def extract_domain_key(url):
    """Return the first meaningful host-name label of *url*, or "".

    Example: "https://www.acme.de/kontakt" -> "acme".  The fragment is used
    both as an extra Wikipedia search term and as a scoring signal, so the
    generic "www" label is skipped (BUG FIX: previously "www" itself was
    returned for www-prefixed URLs, producing useless searches).

    Returns "" for empty input or a bare host without a dot.
    """
    if not url:
        return ""
    host = url.replace("https://", "").replace("http://", "").split("/")[0]
    parts = host.split(".")
    # Drop a leading "www" -- it carries no company information.
    if parts and parts[0].lower() == "www":
        parts = parts[1:]
    return parts[0] if len(parts) > 1 else ""
# === INFOBOX-PARSING ===
def parse_infobox_with_fallback(soup):
    """Extract industry ("Branche") and revenue ("Umsatz", in millions) from
    the infobox of a German-Wikipedia article.

    Parameters:
        soup -- BeautifulSoup parse of the rendered article HTML.

    Returns:
        (branche, umsatz) -- both strings, each "k.A." when not found.
        umsatz is the numeric part only, with "," normalized to ".".
    """
    branche = "k.A."
    umsatz = "k.A."
    infobox = soup.find("table", class_="infobox")
    if infobox is None:
        return branche, umsatz
    for row in infobox.find_all("tr"):
        th = row.find("th")
        td = row.find("td")
        if not th or not td:
            continue
        label = th.get_text(strip=True).lower()
        value = td.get_text(strip=True)
        if any(b in label for b in ["branche", "tätigkeitsfeld", "industriezweig", "wirtschaftszweig"]):
            branche = value
        if "umsatz" in label and "mio" in value.lower():
            # BUG FIX: the pattern was r"(\d+[\d.,]*)\\s*Mio" -- inside a raw
            # string "\\s" matches a literal backslash, so real values like
            # "123,4 Mio. EUR" never matched.  IGNORECASE added to stay
            # consistent with the lowercase "mio" guard above.
            match = re.search(r"(\d+[\d.,]*)\s*Mio", value, re.IGNORECASE)
            if match:
                umsatz = match.group(1).replace(",", ".")
    return branche, umsatz
# === WIKIPEDIA DATA ===
# Keywords for whitelisting company-related Wikipedia categories.
# NOTE(review): not referenced anywhere in the visible code -- presumably a
# leftover of the category fallback mentioned in the changelog; confirm
# before removing.
WHITELIST_KATEGORIEN = [
    "unternehmen", "hersteller", "produktion", "industrie",
    "maschinenbau", "technik", "dienstleistung", "chemie",
    "pharma", "elektro", "medizin", "bau", "energie",
    "logistik", "automobil"
]
def similarity(a, b):
    """Case-insensitive fuzzy-match ratio between two strings (0.0 .. 1.0)."""
    left, right = a.lower(), b.lower()
    return SequenceMatcher(None, left, right).ratio()
def validate_wikipedia_page(content, title, name, domain_key):
    """Heuristically decide whether a Wikipedia hit plausibly describes the
    company *name*.

    Accepts the page when any of these holds: one of the first two words of
    the name appears in the title or the article text, the website's domain
    fragment appears in the article text, or the title is >60% similar to
    the full name.
    """
    fragments = name.lower().split()[:2]
    lowered_title = title.lower()
    lowered_content = content.lower()
    in_title = any(frag in lowered_title for frag in fragments)
    in_content = any(frag in lowered_content for frag in fragments)
    via_domain = domain_key and domain_key.lower() in lowered_content
    close_title = similarity(name, title) > 0.6
    return in_title or in_content or via_domain or close_title
def get_wikipedia_data(name, website_hint=""):
    """Search German Wikipedia for the company *name* and return data from
    the best-matching article.

    Search terms: the full name, its first two words, and (if available)
    the domain fragment of *website_hint*.  Each candidate page is validated
    via validate_wikipedia_page(); candidates are scored by title similarity
    plus bonuses for a found industry and a domain mention.

    Returns:
        (url, branche, umsatz) -- ("", "k.A.", "k.A.") when nothing validates.
    """
    begriffe = [name.strip(), " ".join(name.split()[:2])]
    domain_key = extract_domain_key(website_hint)
    if domain_key:
        begriffe.append(domain_key)
    best_score = 0
    best_result = ("", "k.A.", "k.A.")
    for suchbegriff in begriffe:
        if not suchbegriff:
            continue
        for attempt in range(MAX_RETRIES):
            try:
                results = wikipedia.search(suchbegriff, results=5)
            except Exception as e:
                print(f"⚠️ Wikipedia-Fehler ({suchbegriff}, Versuch {attempt+1}): {str(e)[:100]}")
                time.sleep(RETRY_DELAY)
                continue
            for title in results:
                # BUG FIX: was a bare "except:", which also swallowed
                # KeyboardInterrupt/SystemExit.
                try:
                    page = wikipedia.page(title, auto_suggest=False)
                    html = requests.get(page.url, timeout=10).text
                    if not validate_wikipedia_page(page.content, title, name, domain_key):
                        continue
                    soup = BeautifulSoup(html, "html.parser")
                    branche, umsatz = parse_infobox_with_fallback(soup)
                    score = similarity(name, title)
                    if branche != "k.A.":
                        score += 0.1
                    if domain_key and domain_key in page.content.lower():
                        score += 0.1
                    if score > best_score:
                        best_score = score
                        best_result = (page.url, branche or "k.A.", umsatz or "k.A.")
                except Exception:
                    # Disambiguation pages, HTTP errors, missing pages: skip.
                    continue
            # BUG FIX: without this break a *successful* search was repeated
            # MAX_RETRIES times, tripling every network round-trip.
            break
    return best_result
# === STEP 1: WIKIPEDIA PROCESSING ===
for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
    row = sheet_values[i]
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Verarbeite Zeile {i+1}: {row[0]}")
    # row[0] = company name (column A), row[1] = website hint (column B).
    url, wiki_branche, umsatz = get_wikipedia_data(row[0], row[1])
    # Without a validated article URL, discard the extracted values.
    wiki_final = wiki_branche if url else "k.A."
    umsatz_final = umsatz if url else "k.A."
    values = [
        wiki_final,
        "k.A.",  # LinkedIn industry stays empty in this step
        umsatz_final,
        "k.A.", "k.A.", "k.A.",
        url,
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "k.A.", "k.A.",
        VERSION  # column Q: script version for traceability
    ]
    # Writes the 11 cells G..Q of this sheet row (i is 0-based, hence i+1).
    sheet.update(range_name=f"G{i+1}:Q{i+1}", values=[values])
    print(f"✅ Aktualisiert: {values[:3]}...")
    time.sleep(RETRY_DELAY)  # simple rate limiting for the Sheets API
print("\n✅ Wikipedia-Auswertung abgeschlossen")
# === SCHRITT 2: GPT-BEWERTUNG ===
def classify_company(row, wikipedia_url=""):
    """Ask GPT for an industry / FSM-potential classification of one company.

    Parameters:
        row -- sheet row; reads columns A (name), B (website), C (city),
               E (current classification), F (external industry description).
        wikipedia_url -- previously found article URL, passed as context.

    Returns:
        A list of exactly 8 strings: [wikipedia industry, linkedin industry,
        revenue (Mio EUR), recommended reclassification, reasoning,
        FSM relevance, technician estimate, technician reasoning].

    Side effects: appends timestamp, company name, parsed fields and the raw
    GPT answer to LOG_CSV; after MAX_RETRIES failures falls back to "k.A."
    defaults instead of raising.
    """
    user_prompt = {
        "role": "user",
        "content": f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]}\nWikipedia-Link: {wikipedia_url}"
    }
    for attempt in range(MAX_RETRIES):
        try:
            response = openai.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content": (
                            "Du bist ein Experte für Brancheneinstufung und FSM-Potenzialbewertung.\n"
                            "Bitte beziehe dich ausschließlich auf das konkret genannte Unternehmen.\n"
                            "FSM steht für Field Service Management. Ziel ist es, Unternehmen mit >50 Technikern im Außendienst zu identifizieren.\n\n"
                            "Struktur: Firmenname; Website; Ort; Aktuelle Einstufung; Beschreibung der Branche Extern\n\n"
                            "Gib deine Antwort im CSV-Format zurück (1 Zeile, 8 Spalten):\n"
                            "Wikipedia-Branche;LinkedIn-Branche;Umsatz (Mio €);Empfohlene Neueinstufung;Begründung;FSM-Relevanz;Techniker-Einschätzung;Techniker-Begründung"
                        )
                    },
                    user_prompt
                ],
                temperature=0,
                timeout=15
            )
            full_text = response.choices[0].message.content.strip()
            break
        except Exception as e:
            print(f"⚠️ GPT-Fehler (Versuch {attempt+1}): {str(e)[:100]}")
            time.sleep(RETRY_DELAY)
    else:
        # for/else: reached only when all retries failed.
        print("❌ GPT 3x fehlgeschlagen Standardwerte")
        full_text = "k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A.;k.A."
    # The first line containing ";" is taken as the CSV answer.
    csv_line = next((l for l in full_text.splitlines() if ";" in l), "")
    parts = [v.strip() for v in csv_line.split(";")] if csv_line else ["k.A."] * 8
    # BUG FIX: the caller unpacks exactly 8 values; a malformed GPT answer
    # with more or fewer fields previously crashed the loop with a
    # ValueError.  Pad with "k.A." and truncate to exactly 8.
    parts = (parts + ["k.A."] * 8)[:8]
    with open(LOG_CSV, "a", newline="", encoding="utf-8") as log:
        writer = csv.writer(log, delimiter=";")
        writer.writerow([datetime.now().strftime("%Y-%m-%d %H:%M:%S"), row[0], *parts, full_text])
    return parts
# === RUN STEP 2 ===
for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))):
    row = sheet_values[i]
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] GPT-Bewertung für Zeile {i+1}: {row[0]}")
    # Column M (index 12) holds the Wikipedia URL.
    # NOTE(review): sheet_values was fetched before step 1 wrote its results,
    # so this reads the pre-run cell value, not the URL just written by
    # step 1 -- verify whether the sheet should be re-fetched here.
    wiki_url = row[12] if len(row) > 12 else ""
    wiki, linkedin, umsatz_chat, new_cat, reason, fsm, techniker, techniker_reason = classify_company(row, wikipedia_url=wiki_url)
    values = [
        wiki,
        linkedin,
        umsatz_chat,
        new_cat,
        reason,
        fsm,
        wiki_url,
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        techniker,
        techniker_reason
    ]
    # Writes columns G..P (10 cells); column Q (version) keeps step 1's value.
    sheet.update(range_name=f"G{i+1}:P{i+1}", values=[values])
    time.sleep(RETRY_DELAY)  # simple rate limiting for the Sheets API
print("\n✅ GPT-Bewertung abgeschlossen")