bugfix
This commit is contained in:
@@ -1556,107 +1556,103 @@ def get_website_raw(url, max_length=1000, verify_cert=False):
|
|||||||
# (Code dafür bleibt wie in der Antwort von 16:24 Uhr)
|
# (Code dafür bleibt wie in der Antwort von 16:24 Uhr)
|
||||||
@retry_on_failure
def summarize_batch_openai(tasks_data):
    """Summarize a list of raw website texts with a single OpenAI chat call.

    Enforcing the model's token limit is left to the API; this function only
    logs a token estimate and a warning when the estimate is high.

    Args:
        tasks_data (list): Dictionaries of the form
            {'row_num': int, 'raw_text': str}.

    Returns:
        dict: Maps every row number found in ``tasks_data`` to its summary,
            e.g. {2122: "Zusammenfassung A", 2123: "Zusammenfassung B"}.
            Rows without a usable summary are mapped to a "k.A. (...)"
            placeholder string.
    """
    if not tasks_data:
        return {}

    # Raw-text values that mark "no usable content" rather than real text.
    placeholder_texts = (
        "k.A.",
        "k.A. (Nur Cookie-Banner erkannt)",
        "k.A. (Fehler)",
    )

    # Keep only tasks with usable text. The text is normalised to a stripped
    # string *before* comparing, so whitespace-padded placeholders are
    # rejected and non-string values cannot break the slicing below.
    valid_tasks = []
    for task in tasks_data:
        raw_value = task.get("raw_text")
        text = str(raw_value).strip() if raw_value else ""
        if text and text not in placeholder_texts:
            valid_tasks.append((task['row_num'], text))

    if not valid_tasks:
        debug_print("Keine gültigen Rohtexte für Batch-Zusammenfassung gefunden.")
        return {t['row_num']: "k.A. (Kein gültiger Rohtext)" for t in tasks_data}

    # Row numbers that actually end up in the prompt.
    row_numbers_in_batch = [row_num for row_num, _ in valid_tasks]
    debug_print(f"Starte Batch-Zusammenfassung für {len(valid_tasks)} gültige Texte (Zeilen: {row_numbers_in_batch})...")

    # --- Build the aggregated prompt ---
    prompt_parts = [
        "Du bist ein KI-Assistent...",  # (remainder of the prompt as before)
        "RESULTAT <Zeilennummer>: <Zusammenfassung für diese Zeilennummer>",
        "\n--- Texte zur Zusammenfassung ---",
    ]
    # Truncation should already happen in get_website_raw; each text is
    # capped at 1500 characters here as a safety net.
    text_block = "".join(
        f"\n--- TEXT Zeile {row_num} ---\n{text[:1500]}\n--- ENDE TEXT Zeile {row_num} ---\n"
        for row_num, text in valid_tasks
    )
    prompt_parts.append(text_block)
    prompt_parts.append("--- Ende der Texte ---")
    prompt_parts.append("Bitte gib NUR die 'RESULTAT <Zeilennummer>: ...' Zeilen zurück.")
    final_prompt = "\n".join(prompt_parts)

    # Token counting is informational only; a failure here must never block
    # the API call, hence the deliberately broad best-effort handler.
    try:
        prompt_tokens = token_count(final_prompt)
        debug_print(f"Geschätzte Prompt-Tokens für Batch: {prompt_tokens} (Limit ca. 4096 für gpt-3.5-turbo)")
        if prompt_tokens > 3500:
            debug_print("WARNUNG: Geschätzte Prompt-Tokens hoch, API könnte Fehler werfen.")
    except Exception as e_tc:
        debug_print(f"Fehler beim Token-Zählen: {e_tc}")

    # --- OpenAI API call ---
    chat_response = call_openai_chat(final_prompt, temperature=0.2)

    # --- Parse the answer ---
    # Every row in the batch starts with a placeholder, so rows the model
    # skipped (or a missing response) still produce an entry.
    summaries = {row_num: "k.A. (Keine Antwort geparst)" for row_num in row_numbers_in_batch}
    if chat_response:
        parsed_rows = set()  # distinct rows, so repeated lines are not double-counted
        for line in chat_response.strip().split('\n'):
            match = re.match(r"RESULTAT (\d+): (.*)", line.strip())
            if not match:
                continue
            row_num = int(match.group(1))
            if row_num in summaries:
                summaries[row_num] = match.group(2).strip()
                parsed_rows.add(row_num)
        parsed_count = len(parsed_rows)
        debug_print(f"Batch-Zusammenfassung: {parsed_count} von {len(row_numbers_in_batch)} erfolgreich geparst.")
        if parsed_count < len(row_numbers_in_batch):
            debug_print(f"WARNUNG: Nicht alle Zusammenfassungen geparst. Antwort: {chat_response[:500]}...")
    else:
        # No usable response: all summaries keep their placeholder value.
        debug_print("Fehler: Keine gültige Antwort von OpenAI für Batch-Zusammenfassung erhalten.")

    # Rows that were filtered out before the prompt was built still need an
    # entry so the caller gets a result for every input row.
    for task in tasks_data:
        summaries.setdefault(task['row_num'], "k.A. (Ungültiger Rohtext o.ä.)")

    return summaries
|
||||||
|
|
||||||
|
|
||||||
@retry_on_failure
def scrape_website_details(url):
    """Extract the title, meta description and H1-H3 headings of a website.

    Args:
        url: Website address. Surrounding whitespace is removed and
            "https://" is prepended when the value does not start with "http".

    Returns:
        str: A string of the form
            "Title: ... | Description: ... | H1: ... | H2: ... | H3: ..."
            truncated to 1500 characters, or "k.A." when the URL is missing
            or any error occurs while fetching or parsing the page.
    """
    if not isinstance(url, str):
        return "k.A."

    # Normalise first so the stripped value is both validated and used;
    # otherwise leading whitespace would end up inside the request URL.
    url = url.strip()
    if not url or url.lower() == 'k.a.':
        return "k.A."

    if not url.lower().startswith("http"):
        url = "https://" + url

    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # NOTE(security): verify=False disables TLS certificate validation.
        # Kept as in the original, which notes it is often needed for many sites.
        response = requests.get(url, timeout=10, headers=headers, verify=False)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, Config.HTML_PARSER)

        # Title
        title_tag = soup.find("title")
        title = clean_text(title_tag.get_text()) if title_tag else "k.A."

        # Description (the meta tag's name attribute is matched case-insensitively)
        meta_tag = soup.find("meta", attrs={"name": lambda x: x and x.lower() == "description"})
        description = clean_text(meta_tag["content"]) if meta_tag and meta_tag.get("content") else "k.A."

        # Headings H1-H3
        headers_data = {}
        for tag in ["h1", "h2", "h3"]:
            header_texts = [clean_text(el.get_text()) for el in soup.find_all(tag)]
            # Drop "k.A." entries (presumably clean_text's empty marker;
            # confirm against clean_text) and very short fragments.
            header_texts = [h for h in header_texts if h != "k.A." and len(h) > 2]
            # At most five headings per level.
            headers_data[tag] = ", ".join(header_texts[:5]) if header_texts else "k.A."

        combined = (
            f"Title: {title} | Description: {description} | "
            f"H1: {headers_data['h1']} | H2: {headers_data['h2']} | H3: {headers_data['h3']}"
        )
        # Cap the length of the combined result string.
        return combined[:1500]

    except requests.exceptions.RequestException as e:
        debug_print(f"Netzwerk-/HTTP-Fehler beim Detail-Scraping von {url}: {e}")
        return "k.A."
    except Exception as e:
        # Deliberate best-effort: any parsing problem yields "k.A." instead of raising.
        debug_print(f"Allgemeiner Fehler beim Detail-Scraping von {url}: {e}")
        return "k.A."
|
|
||||||
|
|
||||||
|
|
||||||
# ==================== OPENAI / CHATGPT FUNCTIONS ====================
|
# ==================== OPENAI / CHATGPT FUNCTIONS ====================
|
||||||
|
|
||||||
@retry_on_failure
|
@retry_on_failure
|
||||||
@@ -2412,8 +2408,8 @@ def process_website_summarization_batch(sheet_handler, start_row_index_in_sheet,
|
|||||||
debug_print(f"Starte Website-Zusammenfassung (OpenAI Batch) für Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}...")
|
debug_print(f"Starte Website-Zusammenfassung (OpenAI Batch) für Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}...")
|
||||||
|
|
||||||
# --- Konfiguration ---
|
# --- Konfiguration ---
|
||||||
OPENAI_BATCH_SIZE_LIMIT = 8
|
openai_batch_size = Config.OPENAI_BATCH_SIZE_LIMIT # Holt Wert aus Config (jetzt z.B. 1)
|
||||||
update_batch_row_limit = 50
|
update_batch_row_limit = Config.UPDATE_BATCH_ROW_LIMIT # z.B. 50
|
||||||
|
|
||||||
# --- Lade Daten ---
|
# --- Lade Daten ---
|
||||||
if not sheet_handler.load_data(): return
|
if not sheet_handler.load_data(): return
|
||||||
@@ -2425,9 +2421,7 @@ def process_website_summarization_batch(sheet_handler, start_row_index_in_sheet,
|
|||||||
rohtext_col_idx = COLUMN_MAP.get("Website Rohtext")
|
rohtext_col_idx = COLUMN_MAP.get("Website Rohtext")
|
||||||
summary_col_idx = COLUMN_MAP.get("Website Zusammenfassung")
|
summary_col_idx = COLUMN_MAP.get("Website Zusammenfassung")
|
||||||
version_col_idx = COLUMN_MAP.get("Version")
|
version_col_idx = COLUMN_MAP.get("Version")
|
||||||
if None in [rohtext_col_idx, summary_col_idx, version_col_idx]:
|
if None in [rohtext_col_idx, summary_col_idx, version_col_idx]: return debug_print(f"FEHLER: Indizes fehlen.")
|
||||||
debug_print(f"FEHLER: Benötigte Indizes für process_website_summarization_batch fehlen.")
|
|
||||||
return
|
|
||||||
summary_col_letter = sheet_handler._get_col_letter(summary_col_idx + 1)
|
summary_col_letter = sheet_handler._get_col_letter(summary_col_idx + 1)
|
||||||
version_col_letter = sheet_handler._get_col_letter(version_col_idx + 1)
|
version_col_letter = sheet_handler._get_col_letter(version_col_idx + 1)
|
||||||
|
|
||||||
@@ -2446,56 +2440,48 @@ def process_website_summarization_batch(sheet_handler, start_row_index_in_sheet,
|
|||||||
|
|
||||||
# Prüfung 1: Ist Rohtext vorhanden und gültig?
|
# Prüfung 1: Ist Rohtext vorhanden und gültig?
|
||||||
raw_text = ""
|
raw_text = ""
|
||||||
if len(row) > rohtext_col_idx:
|
if len(row) > rohtext_col_idx: raw_text = str(row[rohtext_col_idx]).strip()
|
||||||
raw_text = str(row[rohtext_col_idx]).strip()
|
|
||||||
if not raw_text or raw_text == "k.A." or raw_text == "k.A. (Nur Cookie-Banner erkannt)" or raw_text == "k.A. (Fehler)":
|
if not raw_text or raw_text == "k.A." or raw_text == "k.A. (Nur Cookie-Banner erkannt)" or raw_text == "k.A. (Fehler)":
|
||||||
skipped_no_rohtext += 1
|
skipped_no_rohtext += 1; continue
|
||||||
continue
|
|
||||||
|
|
||||||
# Prüfung 2: Fehlt die Zusammenfassung (AS)?
|
# Prüfung 2: Fehlt die Zusammenfassung (AS)?
|
||||||
summary_exists = False
|
summary_exists = False
|
||||||
if len(row) > summary_col_idx:
|
if len(row) > summary_col_idx and str(row[summary_col_idx]).strip() and str(row[summary_col_idx]).strip() != "k.A.":
|
||||||
if str(row[summary_col_idx]).strip() and str(row[summary_col_idx]).strip() != "k.A.":
|
summary_exists = True
|
||||||
summary_exists = True
|
if summary_exists: skipped_summary_exists += 1; continue
|
||||||
|
|
||||||
if summary_exists:
|
# Task hinzufügen
|
||||||
skipped_summary_exists += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Wenn Rohtext da ist und Zusammenfassung fehlt -> Aufgabe hinzufügen
|
|
||||||
tasks_for_openai_batch.append({'row_num': i, 'raw_text': raw_text})
|
tasks_for_openai_batch.append({'row_num': i, 'raw_text': raw_text})
|
||||||
processed_count += 1 # Zähle Zeilen, die potenziell zusammengefasst werden
|
processed_count += 1
|
||||||
|
|
||||||
# --- OpenAI Batch verarbeiten, wenn voll oder letzte Zeile ---
|
# --- OpenAI Batch verarbeiten, wenn voll oder letzte Zeile ---
|
||||||
if len(tasks_for_openai_batch) >= OPENAI_BATCH_SIZE_LIMIT or (processed_count > 0 and i == end_row_index_in_sheet):
|
if tasks_for_openai_batch and \
|
||||||
if tasks_for_openai_batch:
|
(len(tasks_for_openai_batch) >= openai_batch_size or (processed_count > 0 and i == end_row_index_in_sheet)):
|
||||||
debug_print(f" Verarbeite OpenAI Batch für {len(tasks_for_openai_batch)} Aufgaben (Start: {tasks_for_openai_batch[0]['row_num']})...")
|
debug_print(f" Verarbeite OpenAI Batch für {len(tasks_for_openai_batch)} Aufgaben (Start: {tasks_for_openai_batch[0]['row_num']})...")
|
||||||
summaries_result = summarize_batch_openai(tasks_for_openai_batch)
|
summaries_result = summarize_batch_openai(tasks_for_openai_batch) # Ruft modifizierte Funktion auf
|
||||||
|
|
||||||
# Sheet Updates für diesen OpenAI Batch vorbereiten
|
# Sheet Updates für diesen OpenAI Batch vorbereiten
|
||||||
current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") # Nur für Version hier relevant?
|
current_version = Config.VERSION
|
||||||
current_version = Config.VERSION
|
for task in tasks_for_openai_batch: # Iteriere über die *gesendeten* Tasks
|
||||||
for task in tasks_for_openai_batch:
|
row_num = task['row_num']
|
||||||
row_num = task['row_num']
|
summary = summaries_result.get(row_num, "k.A. (Fehler Batch Zuordnung)")
|
||||||
summary = summaries_result.get(row_num, "k.A. (Fehler Batch Zuordnung)")
|
row_updates = [
|
||||||
row_updates = [
|
{'range': f'{summary_col_letter}{row_num}', 'values': [[summary]]},
|
||||||
{'range': f'{summary_col_letter}{row_num}', 'values': [[summary]]},
|
{'range': f'{version_col_letter}{row_num}', 'values': [[current_version]]}
|
||||||
{'range': f'{version_col_letter}{row_num}', 'values': [[current_version]]} # Setze Version hier
|
]
|
||||||
]
|
all_sheet_updates.extend(row_updates)
|
||||||
all_sheet_updates.extend(row_updates)
|
rows_in_current_update_batch += 1
|
||||||
rows_in_current_update_batch += 1 # Zähle Zeilen für Sheet Update Batch
|
|
||||||
|
|
||||||
tasks_for_openai_batch = [] # OpenAI Batch leeren
|
tasks_for_openai_batch = [] # OpenAI Batch leeren
|
||||||
|
|
||||||
# --- Gesammelte Sheet Updates senden, wenn Limit erreicht oder letzte Zeile ---
|
# --- Gesammelte Sheet Updates senden ---
|
||||||
if all_sheet_updates and \
|
if all_sheet_updates and \
|
||||||
(rows_in_current_update_batch >= update_batch_row_limit or (processed_count > 0 and i == end_row_index_in_sheet)):
|
(rows_in_current_update_batch >= update_batch_row_limit or (processed_count > 0 and i == end_row_index_in_sheet)):
|
||||||
debug_print(f" Sende Sheet-Update für {rows_in_current_update_batch} Zusammenfassungen...")
|
debug_print(f" Sende Sheet-Update für {rows_in_current_update_batch} Zusammenfassungen...")
|
||||||
success = sheet_handler.batch_update_cells(all_sheet_updates)
|
success = sheet_handler.batch_update_cells(all_sheet_updates)
|
||||||
if success: debug_print(f" Sheet-Update bis Zeile {i} erfolgreich.")
|
if success: debug_print(f" Sheet-Update bis Zeile {i} erfolgreich.")
|
||||||
else: debug_print(f" FEHLER beim Sheet-Update bis Zeile {i}.")
|
else: debug_print(f" FEHLER beim Sheet-Update bis Zeile {i}.")
|
||||||
all_sheet_updates = []
|
all_sheet_updates = []; rows_in_current_update_batch = 0
|
||||||
rows_in_current_update_batch = 0
|
|
||||||
|
|
||||||
# Letzten Sheet Update Batch senden
|
# Letzten Sheet Update Batch senden
|
||||||
if all_sheet_updates:
|
if all_sheet_updates:
|
||||||
|
|||||||
Reference in New Issue
Block a user