diff --git a/brancheneinstufung.py b/brancheneinstufung.py index 7706d7a1..bda631b3 100644 --- a/brancheneinstufung.py +++ b/brancheneinstufung.py @@ -96,6 +96,32 @@ system_prompt = { ) } +# === GPT BEWERTUNG === +def classify_company(row, wikipedia_url=""): + user_prompt = { + "role": "user", + "content": ( + f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]}\n" + f"Wikipedia-Link: {wikipedia_url}" + ) + } + response = openai.chat.completions.create( + model="gpt-3.5-turbo", + messages=[system_prompt, user_prompt], + temperature=0 + ) + full_text = response.choices[0].message.content.strip() + lines = full_text.splitlines() + csv_line = next((l for l in lines if ";" in l and not l.lower().startswith("wikipedia-branche")), "") + parts = [v.strip().strip('"') for v in csv_line.split(";")] if csv_line else [] + if len(parts) != 8: + print("⚠️ Antwort unvollständig → Setze alles auf 'k.A.'") + parts = ["k.A."] * 8 + with open(LOG_CSV, "a", newline="", encoding="utf-8") as log: + writer = csv.writer(log, delimiter=";") + writer.writerow([row[0], *parts, full_text]) + return parts + # === WIKIPEDIA DATEN LADEN === # Positivliste für Wikipedia-Kategorien, die auf Unternehmen hinweisen können WHITELIST_KATEGORIEN = [ @@ -120,7 +146,6 @@ def get_wikipedia_data(name, website_hint=""): for title in results: try: page = wikipedia.page(title) - # Titelprüfung verbessern if any(x in page.title.lower() for x in ["krankenkasse", "versicherung"]): continue url = page.url @@ -131,7 +156,6 @@ def get_wikipedia_data(name, website_hint=""): continue if name.lower().split()[0] not in page.title.lower(): continue - url = page.url soup = BeautifulSoup(html, 'html.parser') infobox = soup.find("table", class_=["infobox", "infobox vcard"]) if not infobox: @@ -165,33 +189,6 @@ def get_wikipedia_data(name, website_hint=""): continue return "", "k.A.", "k.A." -# === GPT BEWERTUNG === -def classify_company(row, wikipedia_url=""): - user_prompt = { - "role": "user", - "content": ( - f"{row[0]};{row[1]};{row[2]};{row[4]};{row[5]} -" - f"Wikipedia-Link: {wikipedia_url}" - ) - } - response = openai.chat.completions.create( - model="gpt-3.5-turbo", - messages=[system_prompt, user_prompt], - temperature=0 - ) - full_text = response.choices[0].message.content.strip() - lines = full_text.splitlines() - csv_line = next((l for l in lines if ";" in l and not l.lower().startswith("wikipedia-branche")), "") - parts = [v.strip().strip('"') for v in csv_line.split(";")] if csv_line else [] - if len(parts) != 8: - print("⚠️ Antwort unvollständig → Setze alles auf 'k.A.'") - parts = ["k.A."] * 8 - with open(LOG_CSV, "a", newline="", encoding="utf-8") as log: - writer = csv.writer(log, delimiter=";") - writer.writerow([row[0], *parts, full_text]) - return parts - # === VERARBEITUNG === for i in range(start, min(start + DURCHLÄUFE, len(sheet_values))): row = sheet_values[i]