From f0084493e2153a12fa3f8abb56e1e8b727d8d042 Mon Sep 17 00:00:00 2001 From: Floke Date: Thu, 24 Apr 2025 18:02:08 +0000 Subject: [PATCH] bugfix --- brancheneinstufung.py | 207 +++++++++++++++++++++++++++++++++--------- 1 file changed, 166 insertions(+), 41 deletions(-) diff --git a/brancheneinstufung.py b/brancheneinstufung.py index cf31767d..a70948ec 100644 --- a/brancheneinstufung.py +++ b/brancheneinstufung.py @@ -4283,62 +4283,187 @@ class DataProcessor: f"{skipped_url_count} Zeilen ohne URL übersprungen." ) - # process_summarization_batch Methode - # Kopieren Sie die Logik aus Ihrer globalen process_website_summarization_batch Funktion hierher und passen Sie sie an self an. - # Sie braucht Zugriff auf summarize_batch_openai (global oder private helper). def process_summarization_batch(self, limit=None): """ Batch-Prozess NUR für Website-Zusammenfassung (AS). Findet Startzeile ab erster Zelle mit leerem AS, wo AR gefüllt ist. """ - logging.info(f"Starte Website-Zusammenfassung Batch. Limit: {limit if limit is not None else 'Unbegrenzt'}") - if not self.sheet_handler.load_data(): return logging.error("FEHLER beim Laden der Daten.") - all_data = self.sheet_handler.get_all_data_with_headers(); header_rows = 5 - if not all_data or len(all_data) <= header_rows: return logging.warning("Keine Daten gefunden.") + logging.info( + f"Starte Website-Zusammenfassung Batch. " + f"Limit: {limit if limit is not None else 'Unbegrenzt'}" + ) + if not self.sheet_handler.load_data(): + return logging.error("FEHLER beim Laden der Daten.") - rohtext_col_idx = COLUMN_MAP.get("Website Rohtext"); summary_col_idx = COLUMN_MAP.get("Website Zusammenfassung"); version_col_idx = COLUMN_MAP.get("Version"); - if None in [rohtext_col_idx, summary_col_idx, version_col_idx]: return logging.critical(f"FEHLER: Benötigte Indizes fehlen."); - summary_col_letter = self.sheet_handler._get_col_letter(summary_col_idx + 1); version_col_letter = self.sheet_handler._get_col_letter(version_col_idx + 1); + all_data = self.sheet_handler.get_all_data_with_headers() + header_rows = 5 + if not all_data or len(all_data) <= header_rows: + return logging.warning("Keine Daten gefunden.") - start_sheet_row = header_rows + 1; logging.info(f"Suche Startzeile für Zusammenfassungs-Batch (leeres AS, gefülltes AR)..."); found_start_row = None + # Spalten-Indizes holen + rohtext_col_idx = COLUMN_MAP.get("Website Rohtext") + summary_col_idx = COLUMN_MAP.get("Website Zusammenfassung") + version_col_idx = COLUMN_MAP.get("Version") + if None in [rohtext_col_idx, summary_col_idx, version_col_idx]: + return logging.critical("FEHLER: Benötigte Indizes fehlen.") + + summary_col_letter = self.sheet_handler._get_col_letter(summary_col_idx + 1) + version_col_letter = self.sheet_handler._get_col_letter(version_col_idx + 1) + + # Startzeile suchen: erstes gefülltes AR, leeres AS + found_start_row = None for i in range(header_rows, len(all_data)): - row = all_data[i]; row_num_in_sheet = i + 1; - if len(row) <= max(rohtext_col_idx, summary_col_idx): continue; - ar_value = str(row[rohtext_col_idx]).strip(); as_value = str(row[summary_col_idx]).strip(); - ar_is_filled = bool(ar_value) and ar_value.lower() not in ["k.a.", "k.a. (nur cookie-banner erkannt)", "k.a. (fehler)"]; as_is_empty = not bool(as_value); - if ar_is_filled and as_is_empty: found_start_row = row_num_in_sheet; logging.info(f"Startzeile gefunden: {found_start_row}."); break; - if found_start_row is None: logging.info("Keine Zeilen gefunden, die Zusammenfassung benötigen."); return; + row = all_data[i] + sheet_row_num = i + 1 - start_sheet_row = found_start_row; total_sheet_rows = len(all_data); end_sheet_row = total_sheet_rows; - if limit is not None and limit >= 0: end_sheet_row = min(start_sheet_row + limit - 1, total_sheet_rows); if limit == 0: logging.info("Limit 0."); return; - if start_sheet_row > end_sheet_row: logging.warning("Start nach Ende (Limit)."); return; - logging.info(f"Verarbeite Sheet-Zeilen {start_sheet_row} bis {end_sheet_row} für Website Zusammenfassung (Batch).") + # Sicherstellen, dass beide Spalten in dieser Zeile existieren + if len(row) <= max(rohtext_col_idx, summary_col_idx): + continue - tasks_for_openai_batch = []; all_sheet_updates = []; processed_count = 0; openai_batch_size = Config.OPENAI_BATCH_SIZE_LIMIT; + ar_value = str(row[rohtext_col_idx]).strip() + as_value = str(row[summary_col_idx]).strip() + ar_filled = ( + bool(ar_value) + and ar_value.lower() + not in ["k.a.", "k.a. (nur cookie-banner erkannt)", "k.a. (fehler)"] + ) + as_empty = not bool(as_value) + + if ar_filled and as_empty: + found_start_row = sheet_row_num + logging.info(f"Startzeile gefunden: {found_start_row}.") + break + + if found_start_row is None: + logging.info("Keine Zeilen gefunden, die Zusammenfassung benötigen.") + return + + # Bereich definieren + start_sheet_row = found_start_row + total_sheet_rows = len(all_data) + end_sheet_row = total_sheet_rows + + # Limit auswerten + if limit is not None and limit >= 0: + end_sheet_row = min(start_sheet_row + limit - 1, total_sheet_rows) + if limit == 0: + logging.info("Limit 0.") + return + if start_sheet_row > end_sheet_row: + logging.warning("Start nach Ende (Limit).") + return + + logging.info( + f"Verarbeite Sheet-Zeilen {start_sheet_row} bis {end_sheet_row} " + "für Website-Zusammenfassung (Batch)." + ) + + tasks_for_openai_batch = [] + all_sheet_updates = [] + processed_count = 0 + openai_batch_size = Config.OPENAI_BATCH_SIZE_LIMIT + + # Durch die Zeilen iterieren for i in range(start_sheet_row, end_sheet_row + 1): - row_index_in_list = i - 1; row = all_data[row_index_in_list]; - if len(row) <= max(rohtext_col_idx, summary_col_idx): continue; - ar_value = str(row[rohtext_col_idx]).strip(); as_value = str(row[summary_col_idx]).strip(); - ar_is_filled = bool(ar_value) and ar_value.lower() not in ["k.a.", "k.a. (nur cookie-banner erkannt)", "k.a. (fehler)"]; as_is_empty = not bool(as_value); - if not (ar_is_filled and as_is_empty): logging.debug(f"Zeile {i}: Kriterium passt nicht mehr, übersprungen."); continue; + row_index = i - 1 + row = all_data[row_index] - tasks_for_openai_batch.append({'row_num': i, 'raw_text': ar_value}); processed_count += 1; + # Sicherstellen, dass die Spalten existieren + if len(row) <= max(rohtext_col_idx, summary_col_idx): + continue - if tasks_for_openai_batch and (len(tasks_for_openai_batch) >= openai_batch_size or i == end_sheet_row): - debug_print(f" Verarbeite OpenAI Batch für {len(tasks_for_openai_batch)} Aufgaben (Start: {tasks_for_openai_batch[0]['row_num']})...") - try: summaries_result = summarize_batch_openai(tasks_for_openai_batch); # Globale Funktion mit Retry - current_version = Config.VERSION; - for task in tasks_for_openai_batch: - row_num = task['row_num']; summary = summaries_result.get(row_num, "k.A. (Fehler Batch Zuordnung)"); - batch_sheet_updates = [ {'range': f'{summary_col_letter}{row_num}', 'values': [[summary]]}, # {'range': f'{version_col_letter}{row_num}', 'values': [[current_version]]} # AP setzen - ]; all_sheet_updates.extend(batch_sheet_updates); - if all_sheet_updates: logging.info(f" Sende Sheet-Update für {len(tasks_for_openai_batch)} Zusammenfassungen ({len(all_sheet_updates)} Zellen)..."); success = self.sheet_handler.batch_update_cells(all_sheet_updates); if success: logging.info(f" Sheet-Update erfolgreich."); else: logging.error(f" FEHLER beim Sheet-Update."); all_sheet_updates = []; - except Exception as e_batch: logging.error(f"FEHLER bei Verarbeitung von OpenAI Batch {tasks_for_openai_batch[0]['row_num']}-{tasks_for_openai_batch[-1]['row_num']}: {e_batch}"); pass; - tasks_for_openai_batch = []; time.sleep(Config.RETRY_DELAY); + ar_value = str(row[rohtext_col_idx]).strip() + as_value = str(row[summary_col_idx]).strip() - if all_sheet_updates: logging.info(f"Sende finalen Sheet-Update ({len(all_sheet_updates)} Zellen)..."); self.sheet_handler.batch_update_cells(all_sheet_updates); - logging.info(f"Website-Zusammenfassung Batch abgeschlossen. {processed_count} Tasks erstellt.") + ar_filled = ( + bool(ar_value) + and ar_value.lower() + not in ["k.a.", "k.a. (nur cookie-banner erkannt)", "k.a. (fehler)"] + ) + as_empty = not bool(as_value) + + if not (ar_filled and as_empty): + logging.debug(f"Zeile {i}: Kriterium passt nicht mehr, übersprungen.") + continue + + # Task für OpenAI-Batch anlegen + tasks_for_openai_batch.append({ + 'row_num': i, + 'raw_text': ar_value + }) + processed_count += 1 + + # Batch an OpenAI senden, wenn voll oder am Ende + if (len(tasks_for_openai_batch) >= openai_batch_size + or i == end_sheet_row): + + logging.debug( + f"Verarbeite OpenAI Batch mit " + f"{len(tasks_for_openai_batch)} Aufgaben " + f"(Startzeile: {tasks_for_openai_batch[0]['row_num']})..." + ) + try: + summaries_result = summarize_batch_openai(tasks_for_openai_batch) + current_version = Config.VERSION + + # Ergebnisse in Sheet-Updates umwandeln + for task in tasks_for_openai_batch: + row_num = task['row_num'] + summary = summaries_result.get( + row_num, + "k.A. (Fehler Batch Zuordnung)" + ) + batch_updates = [ + { + 'range': f'{summary_col_letter}{row_num}', + 'values': [[ summary ]] + }, + # Optional Version setzen: + # { + # 'range': f'{version_col_letter}{row_num}', + # 'values': [[ current_version ]] + # } + ] + all_sheet_updates.extend(batch_updates) + + # Updates senden + if all_sheet_updates: + logging.info( + f"Sende Sheet-Update für " + f"{len(tasks_for_openai_batch)} Zusammenfassungen " + f"({len(all_sheet_updates)} Zellen)..." + ) + success = self.sheet_handler.batch_update_cells(all_sheet_updates) + if success: + logging.info("Sheet-Update erfolgreich.") + else: + logging.error("FEHLER beim Sheet-Update.") + all_sheet_updates = [] + + except Exception as e_batch: + logging.error( + f"FEHLER bei OpenAI Batch " + f"{tasks_for_openai_batch[0]['row_num']}–" + f"{tasks_for_openai_batch[-1]['row_num']}: {e_batch}" + ) + + # Für den nächsten Batch zurücksetzen + tasks_for_openai_batch = [] + time.sleep(Config.RETRY_DELAY) + + # Abschließender Push, falls Reste da sind + if all_sheet_updates: + logging.info( + f"Sende finalen Sheet-Update " + f"({len(all_sheet_updates)} Zellen)..." + ) + self.sheet_handler.batch_update_cells(all_sheet_updates) + + logging.info( + f"Website-Zusammenfassung Batch abgeschlossen. " + f"{processed_count} Tasks erstellt." + ) # process_branch_batch Methode # Kopieren Sie die Logik aus Ihrer globalen process_branch_batch Funktion hierher und passen Sie sie an self an.