bugfix
This commit is contained in:
@@ -4261,22 +4261,22 @@ class DataProcessor:
|
||||
|
||||
# --- Worker function for scraping ---
|
||||
# This function runs in a separate thread
|
||||
# def scrape_raw_text_task(task_info):
|
||||
# row_num = task_info['row_num']
|
||||
# url = task_info['url']
|
||||
# raw_text = "k.A."
|
||||
# error = None
|
||||
# try:
|
||||
# # Nutzt die globale Funktion get_website_raw mit Retry Decorator
|
||||
# raw_text = get_website_raw(url) # Annahme: get_website_raw in utils.py
|
||||
# except Exception as e:
|
||||
# # Fängt Fehler beim Scraping, damit der Thread nicht abstürzt
|
||||
# error = f"Scraping Fehler Zeile {row_num} ({url}): {e}"
|
||||
# self.logger.error(error)
|
||||
# raw_text = "k.A. (Fehler)" # Setze einen Fehlerwert in den Rohtext
|
||||
def scrape_raw_text_task(task_info):
    """Fetch the raw website text for one scraping task.

    The caller submits this function to a thread pool, so any scraping
    failure is captured and reported in the result instead of being raised.

    Args:
        task_info: Mapping with the keys 'row_num' and 'url'.

    Returns:
        A dict with the keys "row_num", "raw_text" and "error". On success
        "raw_text" holds whatever get_website_raw returned and "error" is
        None; on failure "raw_text" is the placeholder "k.A. (Fehler)" and
        "error" holds the logged message.
    """
    line_no, target = task_info['row_num'], task_info['url']

    try:
        # get_website_raw is defined outside this block (the original comment
        # assumes it lives in utils.py and carries a retry decorator).
        text = get_website_raw(target)
    except Exception as exc:
        # Swallow the failure on purpose so the worker thread does not crash;
        # the caller sees it through the "error" field.
        failure = f"Scraping Fehler Zeile {line_no} ({target}): {exc}"
        self.logger.error(failure)
        return {"row_num": line_no, "raw_text": "k.A. (Fehler)", "error": failure}

    return {"row_num": line_no, "raw_text": text, "error": None}
|
||||
|
||||
|
||||
# --- Main logic: iterate and collect batches ---
|
||||
@@ -4354,7 +4354,8 @@ class DataProcessor:
|
||||
# Nutzt concurrent.futures für paralleles Scraping
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=max_scraping_workers) as executor:
|
||||
# Map tasks to futures
|
||||
future_to_task = {executor.submit(_scrape_raw_text_task_global, task): task for task in tasks_for_processing_batch} # Auf globalen Namen geändert
|
||||
future_to_task = {executor.submit(scrape_raw_text_task, task): task for task in tasks_for_processing_batch}
|
||||
#future_to_task = {executor.submit(_scrape_raw_text_task_global, task): task for task in tasks_for_processing_batch} # Auf globalen Namen geändert
|
||||
|
||||
# Process results as they complete
|
||||
for future in concurrent.futures.as_completed(future_to_task):
|
||||
|
||||
Reference in New Issue
Block a user