From 589b4757b94901a0df45cb4d57c4db9cba1af779 Mon Sep 17 00:00:00 2001 From: Floke Date: Tue, 6 May 2025 12:32:51 +0000 Subject: [PATCH] bugfix --- brancheneinstufung.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/brancheneinstufung.py b/brancheneinstufung.py index 8f5af975..8f94bc79 100644 --- a/brancheneinstufung.py +++ b/brancheneinstufung.py @@ -4261,22 +4261,22 @@ class DataProcessor: # --- Worker-Funktion für Scraping --- # Diese Funktion läuft in einem separaten Thread - # def scrape_raw_text_task(task_info): - # row_num = task_info['row_num'] - # url = task_info['url'] - # raw_text = "k.A." - # error = None - # try: - # # Nutzt die globale Funktion get_website_raw mit Retry Decorator - # raw_text = get_website_raw(url) # Annahme: get_website_raw in utils.py - # except Exception as e: - # # Fängt Fehler beim Scraping, damit der Thread nicht abstürzt - # error = f"Scraping Fehler Zeile {row_num} ({url}): {e}" - # self.logger.error(error) - # raw_text = "k.A. (Fehler)" # Setze einen Fehlerwert in den Rohtext + def scrape_raw_text_task(task_info): + row_num = task_info['row_num'] + url = task_info['url'] + raw_text = "k.A." + error = None + try: + # Nutzt die globale Funktion get_website_raw mit Retry Decorator + raw_text = get_website_raw(url) # Annahme: get_website_raw in utils.py + except Exception as e: + # Fängt Fehler beim Scraping, damit der Thread nicht abstürzt + error = f"Scraping Fehler Zeile {row_num} ({url}): {e}" + self.logger.error(error) + raw_text = "k.A. (Fehler)" # Setze einen Fehlerwert in den Rohtext #logger.debug(f"Scraping Task Zeile {row_num} abgeschlossen. Textlänge: {len(str(raw_text))}.") # Zu viel Lärm - # return {"row_num": row_num, "raw_text": raw_text, "error": error} + return {"row_num": row_num, "raw_text": raw_text, "error": error} # --- Hauptlogik: Iteriere und sammle Batches --- @@ -4354,7 +4354,8 @@ class DataProcessor: # Nutzt concurrent.futures für paralleles Scraping with concurrent.futures.ThreadPoolExecutor(max_workers=max_scraping_workers) as executor: # Map tasks to futures - future_to_task = {executor.submit(_scrape_raw_text_task_global, task): task for task in tasks_for_processing_batch} # Auf globalen Namen geändert + future_to_task = {executor.submit(scrape_raw_text_task, task): task for task in tasks_for_processing_batch} + #future_to_task = {executor.submit(_scrape_raw_text_task_global, task): task for task in tasks_for_processing_batch} # Auf globalen Namen geändert # Process results as they complete for future in concurrent.futures.as_completed(future_to_task):