bugfix
This commit is contained in:
@@ -4261,22 +4261,22 @@ class DataProcessor:
|
|||||||
|
|
||||||
# --- Worker-Funktion für Scraping ---
|
# --- Worker-Funktion für Scraping ---
|
||||||
# Diese Funktion läuft in einem separaten Thread
|
# Diese Funktion läuft in einem separaten Thread
|
||||||
# --- Worker function for scraping ---
# This function runs in a separate thread.
def scrape_raw_text_task(task_info):
    """Scrape the raw text for one URL; designed to run in a worker thread.

    Parameters
    ----------
    task_info : dict
        Must contain 'row_num' (int, spreadsheet row) and 'url' (str).

    Returns
    -------
    dict
        {"row_num": ..., "raw_text": ..., "error": ...} where ``error`` is
        None on success, or a formatted message on failure. On failure
        ``raw_text`` is set to the sentinel "k.A. (Fehler)".
    """
    import logging  # local import: keeps this worker self-contained

    row_num = task_info['row_num']
    url = task_info['url']
    raw_text = "k.A."
    error = None
    try:
        # Uses the global function get_website_raw with its retry decorator
        raw_text = get_website_raw(url)  # assumption: get_website_raw in utils.py — TODO confirm
    except Exception as e:
        # Catch scraping errors here so the worker thread never crashes.
        error = f"Scraping Fehler Zeile {row_num} ({url}): {e}"
        # BUGFIX: this is a module-level function, so `self` does not exist;
        # `self.logger.error(error)` raised NameError inside the handler and
        # killed the worker. Use the module logger instead.
        logging.getLogger(__name__).error(error)
        raw_text = "k.A. (Fehler)"  # put an error sentinel into the raw text

    # logging.getLogger(__name__).debug(...) intentionally omitted — too noisy
    return {"row_num": row_num, "raw_text": raw_text, "error": error}
|
||||||
|
|
||||||
|
|
||||||
# --- Hauptlogik: Iteriere und sammle Batches ---
|
# --- Hauptlogik: Iteriere und sammle Batches ---
|
||||||
@@ -4354,7 +4354,8 @@ class DataProcessor:
|
|||||||
# Nutzt concurrent.futures für paralleles Scraping
|
# Nutzt concurrent.futures für paralleles Scraping
|
||||||
with concurrent.futures.ThreadPoolExecutor(max_workers=max_scraping_workers) as executor:
|
with concurrent.futures.ThreadPoolExecutor(max_workers=max_scraping_workers) as executor:
|
||||||
# Map tasks to futures
|
# Map tasks to futures
|
||||||
future_to_task = {executor.submit(_scrape_raw_text_task_global, task): task for task in tasks_for_processing_batch} # Auf globalen Namen geändert
|
future_to_task = {executor.submit(scrape_raw_text_task, task): task for task in tasks_for_processing_batch}
|
||||||
|
#future_to_task = {executor.submit(_scrape_raw_text_task_global, task): task for task in tasks_for_processing_batch} # Auf globalen Namen geändert
|
||||||
|
|
||||||
# Process results as they complete
|
# Process results as they complete
|
||||||
for future in concurrent.futures.as_completed(future_to_task):
|
for future in concurrent.futures.as_completed(future_to_task):
|
||||||
|
|||||||
Reference in New Issue
Block a user