data_processor.py updated

2025-07-18 13:50:31 +00:00
parent f79b2c0490
commit 09bfa1ae81


@@ -2354,90 +2354,69 @@ class DataProcessor:
        # The pause only comes after the batch update (or at the end of the mode).
        # time.sleep(0.1)  # Optional short pause
        # --- Process the last incomplete scraping batch after the main loop ---
        # Run the final batch if there are still tasks queued in the batch list
        # after the main loop.
        if tasks_for_processing_batch:
            # Log the start of the final batch
            batch_start_row = tasks_for_processing_batch[0]['row_num']
            batch_end_row = tasks_for_processing_batch[-1]['row_num']
            self.logger.debug(
                f"\n--- Starte FINALEN Website-Scraping Batch ({len(tasks_for_processing_batch)} Tasks, Zeilen {batch_start_row}-{batch_end_row}) ---")
            scraping_results = {}
            batch_error_count = 0
            self.logger.debug(
                f" Scrape {len(tasks_for_processing_batch)} Websites parallel (max {max_scraping_workers} worker)...")
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_scraping_workers) as executor:
                # Map tasks to futures; this calls the INTERNAL worker method.
                future_to_task = {
                    executor.submit(
                        self._scrape_raw_text_task,
                        task,
                        get_website_raw): task for task in tasks_for_processing_batch}
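                # as_completed yields futures in completion order, not submission
                # order, so results are keyed by row_num instead of list position.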
                # Process the results
                for future in concurrent.futures.as_completed(future_to_task):
                    # Look up the original task data for this future
                    task = future_to_task[future]
                    try:
                        result = future.result()
                        # NOTE: store the whole result dict here, not just the text
                        scraping_results[result['row_num']] = result
                        # Check whether the worker reported an error
                        if result.get('error'):
                            batch_error_count += 1
                    except Exception as exc:
                        # Catches unexpected errors while retrieving the result
                        row_num = task['row_num']
                        # Log in shortened form
                        err_msg = f"Unerwarteter Fehler bei Ergebnisabfrage Scraping Task Zeile {row_num} ({task['url'][:100]}): {type(exc).__name__} - {exc}"
                        self.logger.error(err_msg)
                        # Fall back to an error dict shaped like a worker result
                        scraping_results[row_num] = {"raw_text": "k.A. (Unerwarteter Fehler Task)", "meta_details": "k.A.", "error": True}
                        batch_error_count += 1
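            # batch_error_count now covers both worker-reported errors and
            # unexpected exceptions raised while fetching a result.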
            self.logger.debug(
                f" FINALER Scraping Batch beendet. {len(scraping_results)} Ergebnisse erhalten ({batch_error_count} Fehler).")
            # Collect the sheet updates (raw text, meta details, timestamp,
            # version) for this final batch.
            if scraping_results:
                current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                current_version = getattr(Config, 'VERSION', 'unknown')
                batch_sheet_updates = []
                # Adapted to the new logic: each worker result is a dict, so read
                # the fields with .get() and fall back to "k.A." if one is missing.
                # Iterate over the row numbers for which results exist and add
                # updates for raw text, meta details, timestamp, and version
                # (four cells per row).
                for row_num, result_dict in scraping_results.items():
                    batch_sheet_updates.extend([
                        {'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Rohtext"] + 1)}{row_num}', 'values': [[result_dict.get('raw_text', 'k.A.')]]},
                        {'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Meta-Details"] + 1)}{row_num}', 'values': [[result_dict.get('meta_details', 'k.A.')]]},
                        {'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Scrape Timestamp"] + 1)}{row_num}', 'values': [[current_timestamp]]},
                        {'range': f'{self.sheet_handler._get_col_letter(col_indices["Version"] + 1)}{row_num}', 'values': [[current_version]]}
                    ])
                # Append to the global list; everything is sent in a single
                # final batch update.
                all_sheet_updates.extend(batch_sheet_updates)
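                # Example of one collected update entry (illustrative values only;
                # the real column letter comes from col_indices at runtime):
                #     {'range': 'AR2361', 'values': [['<scraped raw text>']]}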
        # --- Send the final sheet updates ---
        # Send all remaining collected updates in one last batch update.
        if all_sheet_updates:
            rows_in_final_update_batch = len(all_sheet_updates) // 4  # 4 updates per row
            self.logger.info(
                f"Sende FINALE gesammelte Sheet-Updates ({rows_in_final_update_batch} Zeilen, {len(all_sheet_updates)} Zellen)...")
            # Uses the sheet handler's batch_update_cells method (Block 14) with retry.
            success = self.sheet_handler.batch_update_cells(all_sheet_updates)
            if success:
                self.logger.info("FINALES Sheet-Update erfolgreich.")
            # The failure case is logged by batch_update_cells itself.
        # Log the completion of this mode
        self.logger.info(
            f"Website-Scraping (Batch) abgeschlossen. {processed_count} Zeilen verarbeitet, {skipped_count} Zeilen uebersprungen.")
    def process_summarization_batch(
            self,