From 7818de3cb801e2dbbdd2ca82d59b12d984f50732 Mon Sep 17 00:00:00 2001
From: Floke
Date: Sun, 20 Jul 2025 07:56:31 +0000
Subject: [PATCH] data_processor.py updated

---
 data_processor.py | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/data_processor.py b/data_processor.py
index cd7d5d20..04be32f9 100644
--- a/data_processor.py
+++ b/data_processor.py
@@ -2077,9 +2077,9 @@ class DataProcessor:
         """
         self.logger.info(f"Starting website scraping & summarizing (batch). Limit: {limit or 'unlimited'}")
 
-        # --- 1. Collect tasks ---
         if start_sheet_row is None:
             start_data_idx = self.sheet_handler.get_start_row_index("Website Scrape Timestamp")
+            if start_data_idx == -1: return  # error was already logged
             start_sheet_row = start_data_idx + self.sheet_handler._header_rows + 1
 
         if not self.sheet_handler.load_data(): return
@@ -2091,7 +2091,7 @@ class DataProcessor:
             row_data = all_data[i]
             if self._needs_website_processing(row_data, force_reeval=False):
                 tasks.append({
-                    'row_num': i + 1, 
+                    'row_num': i + 1,
                     'company_name': self._get_cell_value_safe(row_data, "CRM Name"),
                     'url': self._get_cell_value_safe(row_data, "CRM Website")
                 })
@@ -2100,12 +2100,12 @@ class DataProcessor:
             self.logger.info("No rows found that require website processing.")
             return
 
 
-        # --- 2. Define the worker function ---
         def _scrape_worker(task):
             """Internal worker function. ALWAYS returns a dictionary."""
             company_name = task['company_name']
             website_url = task['url']
+            # Initialize the result dictionary with default values
             result = {'raw_text': 'k.A.', 'meta_text': 'k.A.', 'summary': 'k.A.',
                       'url_pruefstatus': 'URL_UNPROCESSED', 'final_url': website_url}
             try:
@@ -2141,10 +2141,10 @@ class DataProcessor:
                 result['url_pruefstatus'] = "URL_SCRAPE_ERROR"
             return result
 
 
-        # --- 3. Run in parallel & collect results ---
         all_updates = []
         now_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 
+        # Make sure this import lives at the top of the file
         from concurrent.futures import ThreadPoolExecutor, as_completed
         with ThreadPoolExecutor(max_workers=getattr(Config, 'MAX_SCRAPING_WORKERS', 5)) as executor:
@@ -2156,23 +2156,20 @@ class DataProcessor:
                 try:
                     result_dict = future.result()
 
-                    # Make sure result_dict is a dictionary
                     if not isinstance(result_dict, dict):
-                        self.logger.error(f"Invalid result for row {row_num}: the worker did not return a dictionary. Got {type(result_dict)}.")
+                        self.logger.error(f"Invalid result for row {row_num}: the worker did not return a dictionary. Got {type(result_dict)}. Skipping update.")
                         continue
 
-                    # Prepare the updates
-                    all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("CRM Website") + 1)}{row_num}', 'values': [[result_dict.get('final_url', '')]]})
-                    all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Rohtext") + 1)}{row_num}', 'values': [[result_dict.get('raw_text', '')]]})
-                    all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Meta-Details") + 1)}{row_num}', 'values': [[result_dict.get('meta_text', '')]]})
-                    all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Zusammenfassung") + 1)}{row_num}', 'values': [[result_dict.get('summary', '')]]})
-                    all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("URL Prüfstatus") + 1)}{row_num}', 'values': [[result_dict.get('url_pruefstatus', '')]]})
+                    all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("CRM Website") + 1)}{row_num}', 'values': [[result_dict.get('final_url')]]})
+                    all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Rohtext") + 1)}{row_num}', 'values': [[result_dict.get('raw_text')]]})
+                    all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Meta-Details") + 1)}{row_num}', 'values': [[result_dict.get('meta_text')]]})
+                    all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Zusammenfassung") + 1)}{row_num}', 'values': [[result_dict.get('summary')]]})
+                    all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("URL Prüfstatus") + 1)}{row_num}', 'values': [[result_dict.get('url_pruefstatus')]]})
                     all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Scrape Timestamp") + 1)}{row_num}', 'values': [[now_timestamp]]})
                 except Exception as e_future:
                     self.logger.error(f"Error retrieving the result for row {row_num}: {e_future}", exc_info=True)
 
 
-        # --- 4. Final write ---
         if all_updates:
             self.logger.info(f"Sending batch update for {len(tasks)} processed websites...")
             self.sheet_handler.batch_update_cells(all_updates)
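
Note on the `.get(key, '')` -> `.get(key)` change in the last hunk: `_scrape_worker` initializes every key of `result` up front, so dropping the default never actually yields None here. Should that invariant ever break, though, None and '' are not interchangeable in the payload: assuming `batch_update_cells` forwards to the Google Sheets values API, a null value skips the cell while an empty string clears it. A minimal guard, as a hedged sketch (the helper name and its placement are hypothetical, not part of this patch):

    def _cell_value(result_dict, key):
        # Coerce a missing or None worker result to '' so the batch update
        # always overwrites the target cell instead of silently skipping it.
        value = result_dict.get(key)
        return [[value if value is not None else '']]

    # Possible usage inside the result loop:
    # all_updates.append({'range': ..., 'values': _cell_value(result_dict, 'final_url')})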