data_processor.py aktualisiert
This commit is contained in:
@@ -2354,90 +2354,69 @@ class DataProcessor:
|
||||
# Die Pause kommt erst nach dem Batch-Update (oder am Ende des Modus).
|
||||
# time.sleep(0.1) # Optionale kurze Pause
|
||||
|
||||
# --- Verarbeitung des letzten unvollstaendigen Scraping-Batches nach der Schleife ---
|
||||
# Fuehre den letzten Batch aus, wenn nach der Hauptschleife noch Tasks
|
||||
# in der Batch-Liste sind.
|
||||
# --- Verarbeitung des letzten unvollstaendigen Scraping-Batches nach der Schleife ---
|
||||
if tasks_for_processing_batch:
|
||||
# Logge den Start des finalen Batches
|
||||
batch_start_row = tasks_for_processing_batch[0]['row_num']
|
||||
batch_end_row = tasks_for_processing_batch[-1]['row_num']
|
||||
self.logger.debug(
|
||||
f"\n--- Starte FINALEN Website-Scraping Batch ({len(tasks_for_processing_batch)} Tasks, Zeilen {batch_start_row}-{batch_end_row}) ---") # <<< GEÄNDERT
|
||||
f"\n--- Starte FINALEN Website-Scraping Batch ({len(tasks_for_processing_batch)} Tasks, Zeilen {batch_start_row}-{batch_end_row}) ---")
|
||||
|
||||
scraping_results = {} # Dictionary fuer die Ergebnisse
|
||||
batch_error_count = 0 # Fehlerzaehler
|
||||
scraping_results = {}
|
||||
batch_error_count = 0
|
||||
|
||||
self.logger.debug(
|
||||
f" Scrape {len(tasks_for_processing_batch)} Websites parallel (max {max_scraping_workers} worker)...") # <<< GEÄNDERT
|
||||
f" Scrape {len(tasks_for_processing_batch)} Websites parallel (max {max_scraping_workers} worker)...")
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=max_scraping_workers) as executor:
|
||||
# Map tasks to futures. Ruft die INTERNE Worker-Funktion auf.
|
||||
future_to_task = {
|
||||
executor.submit(
|
||||
self._scrape_raw_text_task,
|
||||
task,
|
||||
get_website_raw): task for task in tasks_for_processing_batch} # <<< Korrigiert: interne Methode
|
||||
get_website_raw): task for task in tasks_for_processing_batch}
|
||||
|
||||
# Verarbeite die Ergebnisse
|
||||
for future in concurrent.futures.as_completed(future_to_task):
|
||||
# Holen Sie die urspruenglichen Task-Daten
|
||||
task = future_to_task[future]
|
||||
try:
|
||||
result = future.result() # Holen Sie das Ergebnis
|
||||
scraping_results[result['row_num']
|
||||
] = result['raw_text']
|
||||
# Pruefe, ob der Worker einen Fehler gemeldet hat
|
||||
result = future.result()
|
||||
# HINWEIS: Hier speichern wir das ganze dict, nicht nur den Text
|
||||
scraping_results[result['row_num']] = result
|
||||
if result.get('error'):
|
||||
batch_error_count += 1
|
||||
except Exception as exc:
|
||||
# Faengt unerwartete Fehler bei der Ergebnisabfrage ab
|
||||
row_num = task['row_num']
|
||||
# Gekuerzt loggen
|
||||
err_msg = f"Unerwarteter Fehler bei Ergebnisabfrage Scraping Task Zeile {row_num} ({task['url'][:100]}): {type(exc).__name__} - {exc}"
|
||||
self.logger.error(err_msg) # <<< GEÄNDERT
|
||||
# Setze einen Standard-Fehlerwert
|
||||
scraping_results[row_num] = "k.A. (Unerwarteter Fehler Task)"
|
||||
self.logger.error(err_msg)
|
||||
scraping_results[row_num] = {"raw_text": "k.A. (Unerwarteter Fehler Task)", "meta_details": "k.A.", "error": True}
|
||||
batch_error_count += 1
|
||||
|
||||
self.logger.debug(
|
||||
f" FINALER Scraping Batch beendet. {len(scraping_results)} Ergebnisse erhalten ({batch_error_count} Fehler).") # <<< GEÄNDERT
|
||||
f" FINALER Scraping Batch beendet. {len(scraping_results)} Ergebnisse erhalten ({batch_error_count} Fehler).")
|
||||
|
||||
# Sammle Sheet Updates (AR, AT, AP) fuer diesen finalen Batch.
|
||||
if scraping_results:
|
||||
current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
current_version = getattr(
|
||||
Config, 'VERSION', 'unknown') # Block 1 Config Attribut
|
||||
batch_sheet_updates = [] # Updates fuer diesen spezifischen Batch
|
||||
# Iteriere ueber die Zeilennummern im Batch, fuer die
|
||||
# Ergebnisse vorliegen.
|
||||
for row_num, result_dict in scraping_results.items():
|
||||
# Füge Updates für Rohtext, Meta-Details, Timestamp und Version hinzu
|
||||
batch_sheet_updates.append({'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Rohtext"] + 1)}{row_num}', 'values': [[result_dict['raw_text']]]})
|
||||
batch_sheet_updates.append({'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Meta-Details"] + 1)}{row_num}', 'values': [[result_dict['meta_details']]]})
|
||||
batch_sheet_updates.append({'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Scrape Timestamp"] + 1)}{row_num}', 'values': [[current_timestamp]]})
|
||||
batch_sheet_updates.append({'range': f'{self.sheet_handler._get_col_letter(col_indices["Version"] + 1)}{row_num}', 'values': [[current_version]]}) # Block 1 Column Map
|
||||
# Fuege diese Updates zur globalen Liste hinzu (wird dann nur
|
||||
# noch einmal gesendet)
|
||||
current_version = getattr(Config, 'VERSION', 'unknown')
|
||||
batch_sheet_updates = []
|
||||
|
||||
# ANPASSUNG AN NEUE LOGIK
|
||||
for row_num, result_dict in scraping_results.items():
|
||||
batch_sheet_updates.extend([
|
||||
{'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Rohtext"] + 1)}{row_num}', 'values': [[result_dict.get('raw_text', 'k.A.')]]},
|
||||
{'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Meta-Details"] + 1)}{row_num}', 'values': [[result_dict.get('meta_details', 'k.A.')]]},
|
||||
{'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Scrape Timestamp"] + 1)}{row_num}', 'values': [[current_timestamp]]},
|
||||
{'range': f'{self.sheet_handler._get_col_letter(col_indices["Version"] + 1)}{row_num}', 'values': [[current_version]]}
|
||||
])
|
||||
all_sheet_updates.extend(batch_sheet_updates)
|
||||
|
||||
# --- Finale Sheet Updates senden ---
|
||||
# Sende alle verbleibenden gesammelten Updates in einem letzten
|
||||
# Batch-Update.
|
||||
if all_sheet_updates:
|
||||
rows_in_final_update_batch = len(
|
||||
all_sheet_updates) // 3 # Updates pro Zeile ist 3
|
||||
rows_in_final_update_batch = len(all_sheet_updates) // 4 # 4 Updates pro Zeile
|
||||
self.logger.info(
|
||||
f"Sende FINALE gesammelte Sheet-Updates ({rows_in_final_update_batch} Zeilen, {len(all_sheet_updates)} Zellen)...") # <<< GEÄNDERT
|
||||
# Nutzt die batch_update_cells Methode des Sheet Handlers (Block
|
||||
# 14) mit Retry.
|
||||
f"Sende FINALE gesammelte Sheet-Updates ({rows_in_final_update_batch} Zeilen, {len(all_sheet_updates)} Zellen)...")
|
||||
success = self.sheet_handler.batch_update_cells(all_sheet_updates)
|
||||
if success:
|
||||
# <<< GEÄNDERT
|
||||
self.logger.info(f"FINALES Sheet-Update erfolgreich.")
|
||||
# Der Fehlerfall wird von batch_update_cells geloggt
|
||||
self.logger.info("FINALES Sheet-Update erfolgreich.")
|
||||
|
||||
# Logge den Abschluss des Modus
|
||||
self.logger.info(
|
||||
f"Website-Scraping (Batch) abgeschlossen. {processed_count} Zeilen verarbeitet (in Batch aufgenommen), {skipped_count} Zeilen uebersprungen.") # <<< GEÄNDERT
|
||||
f"Website-Scraping (Batch) abgeschlossen. {processed_count} Zeilen verarbeitet, {skipped_count} Zeilen uebersprungen.")
|
||||
|
||||
def process_summarization_batch(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user