From 7fad32b2011e15ee2c93d8683717d7d78c7f4ec5 Mon Sep 17 00:00:00 2001
From: Floke
Date: Fri, 18 Jul 2025 15:45:05 +0000
Subject: [PATCH] data_processor.py updated

---
 data_processor.py | 58 +++++++++++++++++++++++-------------------------
 1 file changed, 28 insertions(+), 30 deletions(-)

diff --git a/data_processor.py b/data_processor.py
index a4feeca8..860aa649 100644
--- a/data_processor.py
+++ b/data_processor.py
@@ -2096,10 +2096,9 @@ class DataProcessor:
         # Make sure that all required columns are present in COLUMN_MAP
         # (block 1)
         required_keys = [
-            # AR, D, AP, AT, B
-            "Website Rohtext", "CRM Website", "Version", "Website Scrape Timestamp", "CRM Name"
+            "Website Rohtext", "CRM Website", "Version", "Website Scrape Timestamp", "CRM Name", "Website Meta-Details"
         ]
-        # Build a dictionary of keys and indices
+        # Build a dictionary of keys and the correct indices
         col_indices = {key: COLUMN_MAP.get(key, {}).get('index') for key in required_keys}
 
         # Check whether all required keys were found in COLUMN_MAP
@@ -2317,12 +2316,13 @@ class DataProcessor:
                 batch_sheet_updates = []
                 # Iterate over the results of the final batch
                 for row_num, result_dict in scraping_results.items():
-                    # Add updates for raw text, meta details, timestamp and version
-                    # We reuse the col_indices determined earlier
-                    batch_sheet_updates.append({'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Rohtext"] + 1)}{row_num}', 'values': [[result_dict.get('raw_text', 'k.A.')]]})
-                    batch_sheet_updates.append({'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Meta-Details"] + 1)}{row_num}', 'values': [[result_dict.get('meta_details', 'k.A.')]]})
-                    batch_sheet_updates.append({'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Scrape Timestamp"] + 1)}{row_num}', 'values': [[current_timestamp]]})
-                    batch_sheet_updates.append({'range': f'{self.sheet_handler._get_col_letter(col_indices["Version"] + 1)}{row_num}', 'values': [[current_version]]})
+                    # result_dict is now guaranteed to be a dictionary
+                    batch_sheet_updates.extend([
+                        {'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Rohtext"] + 1)}{row_num}', 'values': [[result_dict.get('raw_text', 'k.A.')]]},
+                        {'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Meta-Details"] + 1)}{row_num}', 'values': [[result_dict.get('meta_details', 'k.A.')]]},
+                        {'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Scrape Timestamp"] + 1)}{row_num}', 'values': [[current_timestamp]]},
+                        {'range': f'{self.sheet_handler._get_col_letter(col_indices["Version"] + 1)}{row_num}', 'values': [[current_version]]}
+                    ])
 
                 all_sheet_updates.extend(batch_sheet_updates)
 
@@ -2401,12 +2401,13 @@ class DataProcessor:
 
             # ADAPTED TO THE NEW LOGIC
             for row_num, result_dict in scraping_results.items():
-                batch_sheet_updates.extend([
-                    {'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Rohtext"] + 1)}{row_num}', 'values': [[result_dict.get('raw_text', 'k.A.')]]},
-                    {'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Meta-Details"] + 1)}{row_num}', 'values': [[result_dict.get('meta_details', 'k.A.')]]},
-                    {'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Scrape Timestamp"] + 1)}{row_num}', 'values': [[current_timestamp]]},
-                    {'range': f'{self.sheet_handler._get_col_letter(col_indices["Version"] + 1)}{row_num}', 'values': [[current_version]]}
-                ])
+                # result_dict is now guaranteed to be a dictionary
+                batch_sheet_updates.extend([
+                    {'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Rohtext"] + 1)}{row_num}', 'values': [[result_dict.get('raw_text', 'k.A.')]]},
+                    {'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Meta-Details"] + 1)}{row_num}', 'values': [[result_dict.get('meta_details', 'k.A.')]]},
+                    {'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Scrape Timestamp"] + 1)}{row_num}', 'values': [[current_timestamp]]},
+                    {'range': f'{self.sheet_handler._get_col_letter(col_indices["Version"] + 1)}{row_num}', 'values': [[current_version]]}
+                ])
             all_sheet_updates.extend(batch_sheet_updates)
 
     # --- Send the final sheet updates ---
@@ -2423,26 +2424,23 @@ class DataProcessor:
 
     def _scrape_raw_text_task(self, task_info, scrape_function):
         """
-        Worker function, executed by every thread in the ThreadPoolExecutor.
-        Calls the scraping function that was passed in (get_website_raw).
-        ALWAYS returns a dictionary to avoid crashes.
+        Worker function. Calls the hardened helpers and ALWAYS returns a dictionary.
         """
         url = task_info.get('url')
         row_num = task_info.get('row_num')
         self.logger.debug(f" -> Scrape task started for row {row_num}: {url}")
-        try:
-            # Here we call the hardened helper functions
-            raw_text_result = get_website_raw(url)
-            meta_details_result = scrape_website_details(url)
-
-            # Always return a dictionary with the results
-            return {
-                'row_num': row_num,
-                'raw_text': raw_text_result,
-                'meta_details': meta_details_result,
-                'error': None
-            }
+        try:
+            raw_text_result = get_website_raw(url)
+            # We also have to scrape the meta details, since they are now expected
+            meta_details_result = scrape_website_details(url)
+
+            return {
+                'row_num': row_num,
+                'raw_text': raw_text_result,
+                'meta_details': meta_details_result,
+                'error': None  # errors are now reported inside the returned text
+            }
         except Exception as e:
             # This block is an extra safety net in case the helpers do crash after all
            self.logger.error(f"FATAL ERROR in scraping worker for row {row_num}: {e}", exc_info=True)
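---

Reviewer note: the patch hinges on the worker always returning a dictionary of
one fixed shape, so the batch-update loops can index result_dict without any
type checks. Below is a minimal, self-contained sketch of that contract under
stated assumptions; scrape_task, run_batch, and the max_workers default are
illustrative names, not part of data_processor.py:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def scrape_task(task_info, scrape_raw, scrape_details):
        """Worker: always returns a dict of the same shape, even on failure."""
        url, row_num = task_info.get('url'), task_info.get('row_num')
        try:
            return {
                'row_num': row_num,
                'raw_text': scrape_raw(url),
                'meta_details': scrape_details(url),
                'error': None,
            }
        except Exception as exc:
            # Safety net: report the failure inside the same dict shape
            # instead of letting the exception escape the thread.
            return {'row_num': row_num, 'raw_text': 'k.A.',
                    'meta_details': 'k.A.', 'error': str(exc)}

    def run_batch(tasks, scrape_raw, scrape_details, max_workers=8):
        """Collect results keyed by row number, like scraping_results."""
        results = {}
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = [pool.submit(scrape_task, t, scrape_raw, scrape_details)
                       for t in tasks]
            for fut in as_completed(futures):
                res = fut.result()  # scrape errors never propagate: scrape_task catches them
                results[res['row_num']] = res
        return results

Because every result is a dict, consumers can rely on result_dict.get('raw_text')
and friends; a failed scrape shows up as 'k.A.' plus a non-empty 'error' field
rather than crashing the whole batch.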
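Second note: each update range is built by converting a 0-based column index
from COLUMN_MAP into a 1-based column letter for A1 notation. Here is a sketch
of the conversion that self.sheet_handler._get_col_letter presumably performs;
the index value 43 for "Website Rohtext" is hypothetical, chosen only to match
the removed "# AR, D, AP, AT, B" comment:

    def col_letter(col_num: int) -> str:
        """1-based column number -> sheet letter: 1 -> 'A', 27 -> 'AA', 44 -> 'AR'."""
        letters = ''
        while col_num > 0:
            col_num, rem = divmod(col_num - 1, 26)
            letters = chr(ord('A') + rem) + letters
        return letters

    col_indices = {'Website Rohtext': 43}  # hypothetical 0-based index (column AR)
    row_num = 2500
    update = {
        'range': f"{col_letter(col_indices['Website Rohtext'] + 1)}{row_num}",
        'values': [['k.A.']],
    }
    assert update['range'] == 'AR2500'

The "+ 1" in the patch is exactly this 0-based-to-1-based shift; dropping it
would write every value one column to the left.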