diff --git a/data_processor.py b/data_processor.py
index 04bdd03c..757c5cb1 100644
--- a/data_processor.py
+++ b/data_processor.py
@@ -2354,90 +2354,69 @@ class DataProcessor:
             # The pause only happens after the batch update (or at the end of the mode).
             # time.sleep(0.1)  # Optional short pause
 
-        # --- Process the last incomplete scraping batch after the loop ---
-        # Run the final batch if, after the main loop, there are still
-        # tasks in the batch list.
+        # --- Process the last incomplete scraping batch after the loop ---
         if tasks_for_processing_batch:
-            # Log the start of the final batch
             batch_start_row = tasks_for_processing_batch[0]['row_num']
             batch_end_row = tasks_for_processing_batch[-1]['row_num']
             self.logger.debug(
-                f"\n--- Starting FINAL website scraping batch ({len(tasks_for_processing_batch)} tasks, rows {batch_start_row}-{batch_end_row}) ---")  # <<< CHANGED
+                f"\n--- Starting FINAL website scraping batch ({len(tasks_for_processing_batch)} tasks, rows {batch_start_row}-{batch_end_row}) ---")
 
-            scraping_results = {}  # Dictionary for the results
-            batch_error_count = 0  # Error counter
+            scraping_results = {}
+            batch_error_count = 0
             self.logger.debug(
-                f"  Scraping {len(tasks_for_processing_batch)} websites in parallel (max {max_scraping_workers} workers)...")  # <<< CHANGED
+                f"  Scraping {len(tasks_for_processing_batch)} websites in parallel (max {max_scraping_workers} workers)...")
             with concurrent.futures.ThreadPoolExecutor(max_workers=max_scraping_workers) as executor:
-                # Map tasks to futures. Calls the INTERNAL worker function.
                 future_to_task = {
                     executor.submit(
                         self._scrape_raw_text_task,
                         task,
-                        get_website_raw): task for task in tasks_for_processing_batch}  # <<< Fixed: internal method
+                        get_website_raw): task for task in tasks_for_processing_batch}
 
-                # Process the results
                 for future in concurrent.futures.as_completed(future_to_task):
-                    # Get the original task data
                     task = future_to_task[future]
                     try:
-                        result = future.result()  # Get the result
-                        scraping_results[result['row_num']
-                                         ] = result['raw_text']
-                        # Check whether the worker reported an error
+                        result = future.result()
+                        # NOTE: we store the whole dict here, not just the text
+                        scraping_results[result['row_num']] = result
                         if result.get('error'):
                             batch_error_count += 1
                     except Exception as exc:
-                        # Catches unexpected errors while retrieving the result
                        row_num = task['row_num']
-                        # Log in shortened form
                         err_msg = f"Unexpected error retrieving result of scraping task, row {row_num} ({task['url'][:100]}): {type(exc).__name__} - {exc}"
-                        self.logger.error(err_msg)  # <<< CHANGED
-                        # Set a default error value
-                        scraping_results[row_num] = "k.A. (Unerwarteter Fehler Task)"
+                        self.logger.error(err_msg)
+                        scraping_results[row_num] = {"raw_text": "k.A. (Unerwarteter Fehler Task)", "meta_details": "k.A.", "error": True}
                         batch_error_count += 1
 
             self.logger.debug(
-                f"  FINAL scraping batch finished. {len(scraping_results)} results received ({batch_error_count} errors).")  # <<< CHANGED
+                f"  FINAL scraping batch finished. {len(scraping_results)} results received ({batch_error_count} errors).")
 
-            # Collect sheet updates (AR, AT, AP) for this final batch.
             if scraping_results:
                 current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                current_version = getattr(
-                    Config, 'VERSION', 'unknown')  # Block 1 Config attribute
-                batch_sheet_updates = []  # Updates for this specific batch
-                # Iterate over the row numbers in the batch for which
-                # results are available.
-                for row_num, result_dict in scraping_results.items():
-                    # Add updates for raw text, meta details, timestamp and version
-                    batch_sheet_updates.append({'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Rohtext"] + 1)}{row_num}', 'values': [[result_dict['raw_text']]]})
-                    batch_sheet_updates.append({'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Meta-Details"] + 1)}{row_num}', 'values': [[result_dict['meta_details']]]})
-                    batch_sheet_updates.append({'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Scrape Timestamp"] + 1)}{row_num}', 'values': [[current_timestamp]]})
-                    batch_sheet_updates.append({'range': f'{self.sheet_handler._get_col_letter(col_indices["Version"] + 1)}{row_num}', 'values': [[current_version]]})  # Block 1 column map
-                # Add these updates to the global list (which is then
-                # sent only once)
+                current_version = getattr(Config, 'VERSION', 'unknown')
+                batch_sheet_updates = []
+
+                # ADAPTED TO THE NEW LOGIC
+                for row_num, result_dict in scraping_results.items():
+                    batch_sheet_updates.extend([
+                        {'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Rohtext"] + 1)}{row_num}', 'values': [[result_dict.get('raw_text', 'k.A.')]]},
+                        {'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Meta-Details"] + 1)}{row_num}', 'values': [[result_dict.get('meta_details', 'k.A.')]]},
+                        {'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Scrape Timestamp"] + 1)}{row_num}', 'values': [[current_timestamp]]},
+                        {'range': f'{self.sheet_handler._get_col_letter(col_indices["Version"] + 1)}{row_num}', 'values': [[current_version]]}
+                    ])
                 all_sheet_updates.extend(batch_sheet_updates)
 
         # --- Send the final sheet updates ---
-        # Send all remaining collected updates in one last
-        # batch update.
         if all_sheet_updates:
-            rows_in_final_update_batch = len(
-                all_sheet_updates) // 3  # 3 updates per row
+            rows_in_final_update_batch = len(all_sheet_updates) // 4  # 4 updates per row
             self.logger.info(
-                f"Sending FINAL collected sheet updates ({rows_in_final_update_batch} rows, {len(all_sheet_updates)} cells)...")  # <<< CHANGED
-            # Uses the sheet handler's batch_update_cells method (Block
-            # 14) with retry.
+                f"Sending FINAL collected sheet updates ({rows_in_final_update_batch} rows, {len(all_sheet_updates)} cells)...")
             success = self.sheet_handler.batch_update_cells(all_sheet_updates)
             if success:
-                # <<< CHANGED
-                self.logger.info(f"FINAL sheet update successful.")
-                # The failure case is logged by batch_update_cells
+                self.logger.info("FINAL sheet update successful.")
 
-        # Log completion of this mode
         self.logger.info(
-            f"Website scraping (batch) finished. {processed_count} rows processed (added to batch), {skipped_count} rows skipped.")  # <<< CHANGED
+            f"Website scraping (batch) finished. {processed_count} rows processed, {skipped_count} rows skipped.")
 
     def process_summarization_batch(
             self,
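
Review note on the result-contract change: the old code stored only the string result['raw_text'] in scraping_results, yet the sheet-update loop then indexed result_dict['raw_text'] and result_dict['meta_details'] — on a plain string that raises a TypeError. Storing the whole worker dict (and reading it defensively with .get() plus a 'k.A.' default) fixes that mismatch, and the error path in as_completed now writes a dict with the same keys as the success path, so the update loop never has to special-case failures. For orientation, a minimal sketch of the result shape the new code assumes — the body is hypothetical; only the signature and the keys ('row_num', 'raw_text', 'meta_details', 'error') are taken from this diff:

    def _scrape_raw_text_task(self, task, get_website_raw):
        """Scrape one URL; always return a self-describing result dict."""
        result = {
            'row_num': task['row_num'],   # used as the scraping_results key
            'raw_text': 'k.A.',           # page text; 'k.A.' sentinel on failure
            'meta_details': 'k.A.',       # meta tags; same sentinel
            'error': False,               # lets the caller count failures
        }
        try:
            # Assumption: get_website_raw returns (raw_text, meta_details)
            raw_text, meta_details = get_website_raw(task['url'])
            result['raw_text'] = raw_text or 'k.A.'
            result['meta_details'] = meta_details or 'k.A.'
        except Exception as exc:
            result['error'] = True
            result['raw_text'] = f"k.A. (Fehler: {type(exc).__name__})"
        return result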
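
Review note on the divisor fix: the loop has always emitted four cell updates per row (raw text, meta details, timestamp, version), so len(all_sheet_updates) // 3 inflated the logged row count by a third — 100 rows yield 400 updates, and 400 // 3 = 133 "rows", while 400 // 4 gives the correct 100. If the log should stay correct even when the per-row update count changes again, a hedged alternative (hypothetical helper, not part of this patch) is to count distinct rows from the A1 ranges themselves:

    import re

    def rows_in_update_batch(sheet_updates):
        """Count distinct sheet rows touched by a list of A1-style updates."""
        # Each 'range' looks like 'AR123'; stripping the leading column
        # letters leaves the row number.
        return len({re.sub(r'^[A-Z]+', '', u['range']) for u in sheet_updates})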
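
Finally, on why updates are collected and flushed once: the payload shape built here ({'range': 'A1-notation', 'values': [[cell]]}) matches what gspread's Worksheet.batch_update accepts, which suggests batch_update_cells is a retry wrapper around a single values batchUpdate call — one API request instead of four writes per row, which matters for Sheets quota. A hypothetical sketch under that assumption; the real implementation lives in the sheet handler (Block 14):

    import time
    from gspread.exceptions import APIError

    def batch_update_cells(worksheet, updates, retries=3, backoff=2.0):
        """Send all collected cell updates in one call, with simple retry."""
        for attempt in range(1, retries + 1):
            try:
                worksheet.batch_update(updates)  # one request for every cell
                return True
            except APIError:
                if attempt == retries:
                    return False               # caller logs the failure case
                time.sleep(backoff * attempt)  # linear backoff between tries
        return False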