From 0b07cb1e50d113901df34bd50a9a5ab0dfcfca46 Mon Sep 17 00:00:00 2001
From: Floke
Date: Sun, 20 Jul 2025 07:21:01 +0000
Subject: [PATCH] data_processor.py updated

---
 data_processor.py | 111 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)

diff --git a/data_processor.py b/data_processor.py
index 5105242c..846e3e67 100644
--- a/data_processor.py
+++ b/data_processor.py
@@ -1021,6 +1021,8 @@ class DataProcessor:
 
         self.logger.info(f"Re-Evaluierung abgeschlossen. {processed_count_actual} Zeilen verarbeitet.")
 
+
+
     def process_wiki_verify(self, limit=None, start_sheet_row=None, end_sheet_row=None):
         """
         Iteriert durch die Zeilen und führt eine ChatGPT-basierte Verifizierung des
@@ -2013,6 +2015,115 @@ class DataProcessor:
             'meta_details': meta_details_result
         }
 
+    def _scrape_raw_text_task(self, task_info, scraper_function):
+        """
+        Worker function for threading. ALWAYS returns a dictionary.
+        Based on the logic from v1.7.9.
+        """
+        row_num = task_info['row_num']
+        url = task_info['url']
+        self.logger.debug(f" -> Scrape Task gestartet für Zeile {row_num + 1}: {url}")
+        try:
+            raw_text = scraper_function(url)
+            return {'row_num': row_num, 'raw_text': raw_text, 'error': "k.A." in raw_text}
+        except Exception as e:
+            self.logger.error(f" -> Kritischer Fehler im Scrape-Task für Zeile {row_num + 1}: {e}")
+            return {'row_num': row_num, 'raw_text': f"FEHLER: {e}", 'error': True}
+
+    def process_website_scraping_batch(self, start_sheet_row=None, end_sheet_row=None, limit=None):
+        """
+        Batch process for website scraping ONLY (raw text AR).
+        Based on the logic from v1.7.9, adapted to the new modular structure and bug-fixed.
+        """
+        self.logger.info(f"Starte Website-Scraping (Batch). Bereich: {start_sheet_row or 'Start'}-{end_sheet_row or 'Ende'}, Limit: {limit or 'Unbegrenzt'}")
+
+        # --- Load data and determine the start row ---
+        if start_sheet_row is None:
+            start_data_idx = self.sheet_handler.get_start_row_index(check_column_key="Website Scrape Timestamp")
+            if start_data_idx == -1:
+                self.logger.error("FEHLER bei automatischer Ermittlung der Startzeile. Breche Batch ab.")
+                return
+            start_sheet_row = start_data_idx + self.sheet_handler._header_rows + 1
+
+        if not self.sheet_handler.load_data(): return
+        all_data = self.sheet_handler.get_all_data_with_headers()
+        header_rows = self.sheet_handler._header_rows
+        total_sheet_rows = len(all_data)
+        effective_end_row = end_sheet_row if end_sheet_row is not None else total_sheet_rows
+
+        self.logger.info(f"Verarbeitungsbereich: Sheet-Zeilen {start_sheet_row} bis {effective_end_row}.")
+        if start_sheet_row > effective_end_row:
+            self.logger.info("Start liegt nach dem Ende. Keine Zeilen zu verarbeiten.")
+            return
+
+        # --- Main logic: iterate over the rows and collect batches ---
+        processing_batch_size = getattr(Config, 'PROCESSING_BATCH_SIZE', 20)
+        max_scraping_workers = getattr(Config, 'MAX_SCRAPING_WORKERS', 10)
+        update_batch_row_limit = getattr(Config, 'UPDATE_BATCH_ROW_LIMIT', 50)
+
+        tasks_for_processing_batch = []
+        all_sheet_updates = []
+        processed_count = 0
+
+        for i in range(start_sheet_row, effective_end_row + 1):
+            row_index_in_list = i - 1
+            if row_index_in_list >= total_sheet_rows: break
+
+            row = all_data[row_index_in_list]
+            if not any(cell and str(cell).strip() for cell in row): continue
+
+            # --- Check whether this row needs processing ---
+            if self._needs_website_processing(row, force_reeval=False):
+                website_url = self._get_cell_value_safe(row, "CRM Website").strip()
+                if website_url and website_url.lower() not in ["k.a.", "http:"]:
+                    if limit is not None and processed_count >= limit:
+                        self.logger.info(f"Verarbeitungslimit ({limit}) erreicht.")
+                        break
+
+                    tasks_for_processing_batch.append({"row_num": i, "url": website_url})
+                    processed_count += 1
+
+            # --- Process the batch once it is full (or at the last row) ---
+            if len(tasks_for_processing_batch) >= processing_batch_size or (i == effective_end_row and tasks_for_processing_batch):
+                self.logger.debug(f"--- Starte Website-Scraping Batch ({len(tasks_for_processing_batch)} Tasks) ---")
+                scraping_results = {}
+
+                with ThreadPoolExecutor(max_workers=max_scraping_workers) as executor:
+                    future_to_task = {executor.submit(self._scrape_raw_text_task, task, get_website_raw): task for task in tasks_for_processing_batch}
+                    for future in as_completed(future_to_task):
+                        try:
+                            result = future.result()
+                            scraping_results[result['row_num']] = result['raw_text']
+                        except Exception as exc:
+                            task = future_to_task[future]
+                            self.logger.error(f"Unerwarteter Fehler bei Ergebnisabfrage für Zeile {task['row_num']}: {exc}")
+                            scraping_results[task['row_num']] = "k.A. (Unerwarteter Fehler Task)"
+
+                if scraping_results:
+                    current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                    current_version = getattr(Config, 'VERSION', 'unknown')
+                    batch_sheet_updates = []
+                    for row_num, raw_text_res in scraping_results.items():
+                        # CORRECTED: use the safe `get_col_idx` function
+                        batch_sheet_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Rohtext") + 1)}{row_num}', 'values': [[raw_text_res]]})
+                        batch_sheet_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Scrape Timestamp") + 1)}{row_num}', 'values': [[current_timestamp]]})
+                        batch_sheet_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Version") + 1)}{row_num}', 'values': [[current_version]]})
+                    all_sheet_updates.extend(batch_sheet_updates)
+
+                tasks_for_processing_batch = []
+
+                if len(all_sheet_updates) >= (update_batch_row_limit * 3): # 3 updates per row
+                    self.logger.info(f"Sende gesammelte Sheet-Updates ({len(all_sheet_updates) // 3} Zeilen)...")
+                    self.sheet_handler.batch_update_cells(all_sheet_updates)
+                    all_sheet_updates = []
+
+        # --- Send any remaining sheet updates ---
+        if all_sheet_updates:
+            self.logger.info(f"Sende FINALE gesammelte Sheet-Updates ({len(all_sheet_updates) // 3} Zeilen)...")
+            self.sheet_handler.batch_update_cells(all_sheet_updates)
+
+        self.logger.info(f"Website-Scraping (Batch) abgeschlossen. {processed_count} Zeilen zur Verarbeitung ausgewählt.")
+
     def process_website_scraping(
         self,
         start_sheet_row=None,