This commit is contained in:
2025-05-06 12:32:51 +00:00
parent 5ca839608f
commit 589b4757b9

View File

@@ -4261,22 +4261,22 @@ class DataProcessor:
# --- Worker-Funktion für Scraping ---
# Diese Funktion läuft in einem separaten Thread
# def scrape_raw_text_task(task_info):
# row_num = task_info['row_num']
# url = task_info['url']
# raw_text = "k.A."
# error = None
# try:
# # Nutzt die globale Funktion get_website_raw mit Retry Decorator
# raw_text = get_website_raw(url) # Annahme: get_website_raw in utils.py
# except Exception as e:
# # Fängt Fehler beim Scraping, damit der Thread nicht abstürzt
# error = f"Scraping Fehler Zeile {row_num} ({url}): {e}"
# self.logger.error(error)
# raw_text = "k.A. (Fehler)" # Setze einen Fehlerwert in den Rohtext
def scrape_raw_text_task(task_info):
    """Fetch the raw website text for a single scraping task.

    Worker function meant to be submitted to a thread pool. Any exception
    raised while fetching is caught and logged so that the worker does
    not crash; the failure is reported through the returned dict instead.

    Args:
        task_info: Mapping providing the keys 'row_num' and 'url'.

    Returns:
        A dict with the keys "row_num", "raw_text" and "error". On
        success "raw_text" holds whatever get_website_raw returned and
        "error" is None; on failure "raw_text" is the placeholder
        "k.A. (Fehler)" and "error" holds the logged message.
    """
    row_num = task_info['row_num']
    url = task_info['url']

    try:
        # get_website_raw is expected to live in utils.py and to carry
        # its own retry decorator -- assumption taken from the original
        # comment, confirm against utils.py.
        fetched_text = get_website_raw(url)
    except Exception as exc:
        # Deliberately broad: a failed fetch must not take the thread down.
        # `self` is not a parameter; it is taken from the enclosing scope.
        message = f"Scraping Fehler Zeile {row_num} ({url}): {exc}"
        self.logger.error(message)
        return {"row_num": row_num, "raw_text": "k.A. (Fehler)", "error": message}

    return {"row_num": row_num, "raw_text": fetched_text, "error": None}
# --- Hauptlogik: Iteriere und sammle Batches ---
@@ -4354,7 +4354,8 @@ class DataProcessor:
# Nutzt concurrent.futures für paralleles Scraping
with concurrent.futures.ThreadPoolExecutor(max_workers=max_scraping_workers) as executor:
# Map tasks to futures
future_to_task = {executor.submit(_scrape_raw_text_task_global, task): task for task in tasks_for_processing_batch} # Auf globalen Namen geändert
future_to_task = {executor.submit(scrape_raw_text_task, task): task for task in tasks_for_processing_batch}
#future_to_task = {executor.submit(_scrape_raw_text_task_global, task): task for task in tasks_for_processing_batch} # Auf globalen Namen geändert
# Process results as they complete
for future in concurrent.futures.as_completed(future_to_task):