bugfix
This commit is contained in:
@@ -4283,62 +4283,187 @@ class DataProcessor:
|
||||
f"{skipped_url_count} Zeilen ohne URL übersprungen."
|
||||
)
|
||||
|
||||
# process_summarization_batch method
# Logic copied from the former global process_website_summarization_batch
# function and adapted to use self.
# It needs access to summarize_batch_openai (global or private helper).
def process_summarization_batch(self, limit=None):
    """Batch process ONLY for website summarization (column AS).

    Scans the sheet for the first row whose raw-text column (AR) is filled
    but whose summary column (AS) is still empty, then walks the rows from
    there, queues them in OpenAI-sized batches, and writes each generated
    summary back to the sheet.

    Args:
        limit: Maximum number of sheet rows to process starting at the
            detected start row. ``None`` means unbounded; ``0`` returns
            immediately.

    Returns:
        None. Results are written through
        ``self.sheet_handler.batch_update_cells``; progress and errors go
        to ``logging``.
    """
    logging.info(
        f"Starte Website-Zusammenfassung Batch. "
        f"Limit: {limit if limit is not None else 'Unbegrenzt'}"
    )
    if not self.sheet_handler.load_data():
        return logging.error("FEHLER beim Laden der Daten.")

    all_data = self.sheet_handler.get_all_data_with_headers()
    header_rows = 5  # sheet layout: the first 5 rows are headers
    if not all_data or len(all_data) <= header_rows:
        return logging.warning("Keine Daten gefunden.")

    # Resolve the required column indexes from the global column map.
    rohtext_col_idx = COLUMN_MAP.get("Website Rohtext")
    summary_col_idx = COLUMN_MAP.get("Website Zusammenfassung")
    version_col_idx = COLUMN_MAP.get("Version")
    if None in [rohtext_col_idx, summary_col_idx, version_col_idx]:
        return logging.critical("FEHLER: Benötigte Indizes fehlen.")

    summary_col_letter = self.sheet_handler._get_col_letter(summary_col_idx + 1)
    # Only used by the (commented-out) optional version stamp below.
    version_col_letter = self.sheet_handler._get_col_letter(version_col_idx + 1)

    # AR values that mean "no usable raw text" (defined once, used in both
    # the start-row search and the main loop).
    skip_markers = ("k.a.", "k.a. (nur cookie-banner erkannt)", "k.a. (fehler)")
    max_needed_idx = max(rohtext_col_idx, summary_col_idx)

    # Find the start row: first row with filled AR but empty AS.
    found_start_row = None
    for i in range(header_rows, len(all_data)):
        row = all_data[i]
        # Skip short rows that don't reach both columns.
        if len(row) <= max_needed_idx:
            continue
        ar_value = str(row[rohtext_col_idx]).strip()
        as_value = str(row[summary_col_idx]).strip()
        ar_filled = bool(ar_value) and ar_value.lower() not in skip_markers
        if ar_filled and not as_value:
            found_start_row = i + 1  # sheet rows are 1-based
            logging.info(f"Startzeile gefunden: {found_start_row}.")
            break

    if found_start_row is None:
        logging.info("Keine Zeilen gefunden, die Zusammenfassung benötigen.")
        return

    # Define the row range to process, honoring the optional limit.
    start_sheet_row = found_start_row
    total_sheet_rows = len(all_data)
    end_sheet_row = total_sheet_rows
    if limit is not None and limit >= 0:
        end_sheet_row = min(start_sheet_row + limit - 1, total_sheet_rows)
        if limit == 0:
            logging.info("Limit 0.")
            return
    if start_sheet_row > end_sheet_row:
        logging.warning("Start nach Ende (Limit).")
        return

    logging.info(
        f"Verarbeite Sheet-Zeilen {start_sheet_row} bis {end_sheet_row} "
        "für Website-Zusammenfassung (Batch)."
    )

    tasks_for_openai_batch = []
    all_sheet_updates = []
    processed_count = 0
    openai_batch_size = Config.OPENAI_BATCH_SIZE_LIMIT

    def _dispatch_pending():
        """Summarize all queued tasks via OpenAI and push results to the sheet."""
        nonlocal all_sheet_updates
        if not tasks_for_openai_batch:
            return
        logging.debug(
            f"Verarbeite OpenAI Batch mit "
            f"{len(tasks_for_openai_batch)} Aufgaben "
            f"(Startzeile: {tasks_for_openai_batch[0]['row_num']})..."
        )
        try:
            # Global helper with built-in retry; maps row_num -> summary.
            summaries_result = summarize_batch_openai(tasks_for_openai_batch)
            current_version = Config.VERSION  # for the optional version stamp

            # Turn results into sheet cell updates.
            for task in tasks_for_openai_batch:
                row_num = task['row_num']
                summary = summaries_result.get(
                    row_num,
                    "k.A. (Fehler Batch Zuordnung)"
                )
                all_sheet_updates.append({
                    'range': f'{summary_col_letter}{row_num}',
                    'values': [[summary]]
                })
                # Optionally also stamp the version column:
                # all_sheet_updates.append({
                #     'range': f'{version_col_letter}{row_num}',
                #     'values': [[current_version]]
                # })

            if all_sheet_updates:
                logging.info(
                    f"Sende Sheet-Update für "
                    f"{len(tasks_for_openai_batch)} Zusammenfassungen "
                    f"({len(all_sheet_updates)} Zellen)..."
                )
                if self.sheet_handler.batch_update_cells(all_sheet_updates):
                    logging.info("Sheet-Update erfolgreich.")
                else:
                    logging.error("FEHLER beim Sheet-Update.")
                all_sheet_updates = []
        except Exception as e_batch:
            # Keep going with the next batch; one failed batch should not
            # abort the whole run.
            logging.error(
                f"FEHLER bei OpenAI Batch "
                f"{tasks_for_openai_batch[0]['row_num']}–"
                f"{tasks_for_openai_batch[-1]['row_num']}: {e_batch}"
            )
        tasks_for_openai_batch.clear()
        time.sleep(Config.RETRY_DELAY)  # be gentle with the APIs

    # Walk the selected rows and queue summarization tasks.
    for sheet_row in range(start_sheet_row, end_sheet_row + 1):
        row = all_data[sheet_row - 1]
        if len(row) <= max_needed_idx:
            continue

        ar_value = str(row[rohtext_col_idx]).strip()
        as_value = str(row[summary_col_idx]).strip()
        ar_filled = bool(ar_value) and ar_value.lower() not in skip_markers
        if not (ar_filled and not as_value):
            logging.debug(
                f"Zeile {sheet_row}: Kriterium passt nicht mehr, übersprungen."
            )
            continue

        tasks_for_openai_batch.append({
            'row_num': sheet_row,
            'raw_text': ar_value
        })
        processed_count += 1

        if len(tasks_for_openai_batch) >= openai_batch_size:
            _dispatch_pending()

    # Bugfix: flush any remaining tasks unconditionally. The previous
    # `i == end_sheet_row` trigger dropped queued tasks whenever the last
    # rows of the range were skipped by `continue`.
    _dispatch_pending()

    # Final push in case updates are left over from a failed send.
    if all_sheet_updates:
        logging.info(
            f"Sende finalen Sheet-Update "
            f"({len(all_sheet_updates)} Zellen)..."
        )
        self.sheet_handler.batch_update_cells(all_sheet_updates)

    logging.info(
        f"Website-Zusammenfassung Batch abgeschlossen. "
        f"{processed_count} Tasks erstellt."
    )
|
||||
|
||||
# process_branch_batch method
# Logic copied from the former global process_branch_batch function and
# adapted to use self.
|
||||
Reference in New Issue
Block a user