diff --git a/brancheneinstufung.py b/brancheneinstufung.py index 6af4c3e9..ed9728f6 100644 --- a/brancheneinstufung.py +++ b/brancheneinstufung.py @@ -6304,7 +6304,7 @@ class DataProcessor: def process_branch_batch(self, start_sheet_row=None, end_sheet_row=None, limit=None): self.logger.info(f"Starte Brancheneinschaetzung (Parallel Batch). Bereich: {start_sheet_row if start_sheet_row is not None else 'Start'}-{end_sheet_row if end_sheet_row is not None else 'Ende'}, Limit: {limit if limit is not None else 'Unbegrenzt'}...") - # --- Daten laden und Startzeile ermitteln --- (wie gehabt) + # --- Daten laden und Startzeile ermitteln --- if start_sheet_row is None: self.logger.info("Automatische Ermittlung der Startzeile basierend auf leeren Timestamp letzte Pruefung (BC)...") start_data_index_no_header = self.sheet_handler.get_start_row_index(check_column_key="Timestamp letzte Pruefung", min_sheet_row=7) @@ -6329,31 +6329,25 @@ class DataProcessor: self.logger.info("Berechneter Start liegt nach dem Ende des Bereichs oder Sheets. Keine Zeilen zu verarbeiten.") return - # --- Indizes und Buchstaben --- (wie gehabt) + # --- Indizes und Buchstaben --- required_keys = [ "Timestamp letzte Pruefung", "CRM Branche", "CRM Beschreibung", "Wiki Branche", "Wiki Kategorien", "Website Zusammenfassung", "Version", "Chat Vorschlag Branche", "Chat Branche Konfidenz", "Chat Konsistenz Branche", "Chat Begruendung Abweichung Branche", - "CRM Name", "Finaler Umsatz (Wiki>CRM)", "Finaler Mitarbeiter (Wiki>CRM)" # Für Konsolidierung in _process_single_row (obwohl hier nicht direkt genutzt, aber für Vollständigkeit) + "CRM Name" ] - # ... (col_indices und Fehlerprüfung bleiben) col_indices = {key: COLUMN_MAP.get(key) for key in required_keys} if None in col_indices.values(): missing = [k for k, v in col_indices.items() if v is None] self.logger.critical(f"FEHLER: Benoetigte Spaltenschluessel fehlen in COLUMN_MAP fuer process_branch_batch: {missing}. Breche ab.") return - - # --- Konfiguration fuer Parallelisierung --- (wie gehabt) MAX_BRANCH_WORKERS = getattr(Config, 'MAX_BRANCH_WORKERS', 10) OPENAI_CONCURRENCY_LIMIT = getattr(Config, 'OPENAI_CONCURRENCY_LIMIT', 3) processing_batch_size = getattr(Config, 'PROCESSING_BRANCH_BATCH_SIZE', 20) - # --- Verarbeitungsvariablen --- - tasks_to_submit_to_executor = [] # Sammelt Tasks, bis Limit oder Batch-Größe erreicht - - total_tasks_collected_for_processing = 0 # Zählt alle Tasks, die die Kriterien erfüllen - processed_tasks_in_run = 0 # Zählt Tasks, die tatsächlich an den Executor übergeben wurden + tasks_for_current_batch = [] + processed_tasks_count = 0 # Zählt Tasks, die tatsächlich verarbeitet wurden skipped_count = 0 global ALLOWED_TARGET_BRANCHES @@ -6363,10 +6357,66 @@ class DataProcessor: self.logger.critical("FEHLER: Ziel-Branchenschema konnte nicht geladen werden. 
Breche Batch ab.") return + # Funktion zum Verarbeiten eines einzelnen Batches (um Code-Duplikation zu reduzieren) + def _execute_and_write_batch(batch_tasks_to_run): + nonlocal processed_tasks_count # Zugriff auf die äußere Variable + if not batch_tasks_to_run: + return + + batch_start_log = batch_tasks_to_run[0]['row_num'] + batch_end_log = batch_tasks_to_run[-1]['row_num'] + self.logger.debug(f"\n--- Verarbeite Branch-Evaluation Batch ({len(batch_tasks_to_run)} Tasks, Zeilen {batch_start_log}-{batch_end_log}) ---") + + current_batch_results = [] + current_batch_errors = 0 + openai_sem = threading.Semaphore(OPENAI_CONCURRENCY_LIMIT) + + with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_BRANCH_WORKERS) as executor: + future_map = {executor.submit(self.evaluate_branch_task, task, openai_sem): task for task in batch_tasks_to_run} + for future in concurrent.futures.as_completed(future_map): + task_info = future_map[future] + try: + res_data = future.result() + current_batch_results.append(res_data) + if res_data.get('error'): current_batch_errors +=1 + except Exception as exc_future: + self.logger.error(f"Exception im Future für Zeile {task_info['row_num']}: {exc_future}") + current_batch_results.append({"row_num": task_info['row_num'], "result": {"branch": "FEHLER FUTURE", "consistency": "error_task", "justification": str(exc_future)[:100]}, "error": str(exc_future)}) + current_batch_errors += 1 + + self.logger.debug(f" Batch ({batch_start_log}-{batch_end_log}) beendet. {len(current_batch_results)} Ergebnisse, {current_batch_errors} Fehler.") + + if current_batch_results: + ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ver = getattr(Config, 'VERSION', 'unknown') + updates_this_batch = [] + current_batch_results.sort(key=lambda x: x['row_num']) + for item in current_batch_results: + rn, res = item['row_num'], item['result'] + self.logger.debug(f" Zeile {rn} (Ergebnis): {res}") + updates_this_batch.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Vorschlag Branche"] + 1)}{rn}', 'values': [[res.get("branch", "ERR BR")]]}) + updates_this_batch.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Branche Konfidenz"] + 1)}{rn}', 'values': [[res.get("confidence", "N/A CO")]]}) + updates_this_batch.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Konsistenz Branche"] + 1)}{rn}', 'values': [[res.get("consistency", "err CO")]]}) + updates_this_batch.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Begruendung Abweichung Branche"] + 1)}{rn}', 'values': [[res.get("justification", "No JU")]]}) + updates_this_batch.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Timestamp letzte Pruefung"] + 1)}{rn}', 'values': [[ts]]}) + updates_this_batch.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Version"] + 1)}{rn}', 'values': [[ver]]}) + + if updates_this_batch: + self.logger.debug(f" Sende Sheet-Update für {len(current_batch_results)} Zeilen dieses Batches...") + s_upd = self.sheet_handler.batch_update_cells(updates_this_batch) + if s_upd: self.logger.info(f" Sheet-Update für Batch Zeilen {batch_start_log}-{batch_end_log} erfolgreich.") + + processed_tasks_count += len(batch_tasks_to_run) # Zähle verarbeitete Tasks + + pause_dur = getattr(Config, 'RETRY_DELAY', 5) * 0.8 + self.logger.debug(f"--- Batch ({batch_start_log}-{batch_end_log}) abgeschlossen. 
Warte {pause_dur:.2f}s ---") + time.sleep(pause_dur) + # Ende der Hilfsfunktion _execute_and_write_batch + # Hauptschleife über die Zeilen for i in range(start_sheet_row, end_sheet_row + 1): - if limit is not None and total_tasks_collected_for_processing >= limit: - self.logger.info(f"Sammel-Limit ({limit}) für Branch-Tasks erreicht. Stoppe weitere Zeilenprüfung.") + if limit is not None and processed_tasks_count >= limit: # Prüfe Limit für *tatsächlich verarbeitete* Tasks + self.logger.info(f"Verarbeitungslimit ({limit}) erreicht. Stoppe weitere Zeilenprüfung.") break row_index_in_list = i - 1 @@ -6378,29 +6428,18 @@ class DataProcessor: continue company_name_log = self._get_cell_value_safe(row, "CRM Name").strip() - ao_value = self._get_cell_value_safe(row, "Timestamp letzte Pruefung").strip() - processing_needed_based_on_status = not ao_value - - if not processing_needed_based_on_status: + if ao_value: # Wenn Timestamp gesetzt ist, überspringen skipped_count += 1 continue - # --- DEBUG BLOCK für info_sources_count (wie im vorherigen Vorschlag) --- - crm_branche_val = self._get_cell_value_safe(row, "CRM Branche").strip() - # ... (Rest der _val Variablen und der detaillierte Debug-Block für info_sources_count) - crm_beschreibung_val = self._get_cell_value_safe(row, "CRM Beschreibung").strip() - wiki_branche_val = self._get_cell_value_safe(row, "Wiki Branche").strip() - wiki_kategorien_val = self._get_cell_value_safe(row, "Wiki Kategorien").strip() + # --- DEBUG BLOCK für info_sources_count (wie gehabt) --- + crm_branche_val = self._get_cell_value_safe(row, "CRM Branche").strip(); crm_beschreibung_val = self._get_cell_value_safe(row, "CRM Beschreibung").strip() + wiki_branche_val = self._get_cell_value_safe(row, "Wiki Branche").strip(); wiki_kategorien_val = self._get_cell_value_safe(row, "Wiki Kategorien").strip() website_summary_val = self._get_cell_value_safe(row, "Website Zusammenfassung").strip() - - self.logger.debug(f"Zeile {i} ({company_name_log[:30]}...) - Rohwerte für Info-Quellen:") # ... (kompletter detaillierter Debug-Block für info_sources_count hier einfügen) ... - sources_to_check = { - "CRM Branche": crm_branche_val, "CRM Beschreibung": crm_beschreibung_val, - "Wiki Branche": wiki_branche_val, "Wiki Kategorien": wiki_kategorien_val, - "Website Zusammenfassung": website_summary_val - } + self.logger.debug(f"Zeile {i} ({company_name_log[:30]}...) - Rohwerte für Info-Quellen:") + sources_to_check = {"CRM Branche": crm_branche_val, "CRM Beschreibung": crm_beschreibung_val, "Wiki Branche": wiki_branche_val, "Wiki Kategorien": wiki_kategorien_val, "Website Zusammenfassung": website_summary_val} info_sources_count = 0; counted_sources = [] for source_name, val in sources_to_check.items(): cond1 = bool(val); cond2 = isinstance(val, str); cond3 = False; cond4 = False; cond5 = False @@ -6411,96 +6450,43 @@ class DataProcessor: if is_valid_source: info_sources_count += 1; counted_sources.append(source_name) self.logger.debug(f" Prüfe Quelle '{source_name}': Wert='{str(val)[:30]}...', c1?{cond1}, c2?{cond2}, c3?{cond3}, c4?{cond4}, c5?{cond5} -> Gültig? {is_valid_source}") self.logger.debug(f"Zeile {i} ({company_name_log[:30]}...) - Gezählte valide Quellen: {info_sources_count} - {counted_sources}") - + if info_sources_count < 2: self.logger.info(f"Zeile {i} ({company_name_log[:30]}...) (Branch Check): Uebersprungen (Timestamp BC leer, aber nur {info_sources_count} Informationsquellen verfuegbar: {counted_sources}). 
Mindestens 2 benoetigt.")
                 skipped_count += 1
                 continue
-            # --- ENDE DEBUG BLOCK ---
 
-            # Task zur Liste hinzufügen, wenn alle Kriterien erfüllt sind
-            tasks_to_submit_to_executor.append({
-                "row_num": i, "crm_branche": crm_branche_val, "beschreibung": crm_beschreibung_val,
-                "wiki_branche": wiki_branche_val, "wiki_kategorien": wiki_kategorien_val,
-                "website_summary": website_summary_val
-            })
-            total_tasks_collected_for_processing += 1
-        # Ende der Hauptschleife for i in range...
+            # Task zur Liste hinzufügen, wenn alle Kriterien erfüllt sind UND Limit noch nicht erreicht
+            if limit is None or (processed_tasks_count + len(tasks_for_current_batch)) < limit:
+                tasks_for_current_batch.append({
+                    "row_num": i, "crm_branche": crm_branche_val, "beschreibung": crm_beschreibung_val,
+                    "wiki_branche": wiki_branche_val, "wiki_kategorien": wiki_kategorien_val,
+                    "website_summary": website_summary_val
+                })
+            elif limit is not None and (processed_tasks_count + len(tasks_for_current_batch)) >= limit:
+                # Limit ist durch die bereits gesammelten Tasks abgedeckt: keinen weiteren Task aufnehmen
+                # und die Schleife beenden; der offene Rest-Batch wird nach der Schleife verarbeitet.
+                self.logger.info(f"Sammel-Limit ({limit}) erreicht. Stoppe das Sammeln weiterer Tasks vor Zeile {i}.")
+                break
 
-        # --- Jetzt die gesammelten Tasks in Batches verarbeiten ---
-        num_tasks_to_process = len(tasks_to_submit_to_executor)
-        self.logger.info(f"Nach Prüfung aller Zeilen im Bereich: {num_tasks_to_process} Tasks für Branch-Evaluation gesammelt.")
-
-        for batch_start_index in range(0, num_tasks_to_process, processing_batch_size):
-            if limit is not None and processed_tasks_in_run >= limit:
-                self.logger.info(f"Verarbeitungslimit ({limit}) vor Start des nächsten Batches erreicht.")
-                break # Verlasse die Batch-Verarbeitungsschleife
-
-            # Aktuellen Batch extrahieren
-            current_batch_tasks = tasks_to_submit_to_executor[batch_start_index : batch_start_index + processing_batch_size]
-
-            # Überprüfen, ob der aktuelle Batch das Limit überschreiten würde
-            if limit is not None and (processed_tasks_in_run + len(current_batch_tasks)) > limit:
-                can_take_in_batch = limit - processed_tasks_in_run
-                if can_take_in_batch <= 0: # Sollte nicht passieren, wenn Limit-Check oben greift
-                    break
-                current_batch_tasks = current_batch_tasks[:can_take_in_batch]
-
-            if not current_batch_tasks: # Sollte nicht passieren, wenn Limit-Check oben greift
-                continue
-
-            batch_start_row_log = current_batch_tasks[0]['row_num']
-            batch_end_row_log = current_batch_tasks[-1]['row_num']
-            self.logger.debug(f"\n--- Starte Branch-Evaluation Batch ({len(current_batch_tasks)} Tasks, Zeilen {batch_start_row_log}-{batch_end_row_log}) ---")
-
-            processed_tasks_in_run += len(current_batch_tasks)
-
-            batch_results_data = []
-            batch_error_count_this_batch = 0
-            openai_semaphore_branch = threading.Semaphore(OPENAI_CONCURRENCY_LIMIT)
-            with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_BRANCH_WORKERS) as executor:
-                future_to_task = {executor.submit(self.evaluate_branch_task, task, openai_semaphore_branch): task for task in current_batch_tasks}
-                for future in concurrent.futures.as_completed(future_to_task):
-                    task = future_to_task[future]
-                    try: 
- result_data = future.result() - batch_results_data.append(result_data) - if result_data.get('error'): batch_error_count_this_batch += 1 - except Exception as exc: - row_num_exc = task['row_num'] - err_msg_exc = f"Unerwarteter Fehler bei Ergebnisabfrage Branch Task Zeile {row_num_exc}: {type(exc).__name__} - {exc}" - self.logger.error(err_msg_exc) - batch_results_data.append({"row_num": row_num_exc, "result": {"branch": "FEHLER", "consistency": "error_task", "justification": err_msg_exc[:500]}, "error": err_msg_exc}) - batch_error_count_this_batch += 1 - - self.logger.debug(f" Branch-Evaluation fuer Batch beendet. {len(batch_results_data)} Ergebnisse erhalten ({batch_error_count_this_batch} Fehler in diesem Batch).") - - if batch_results_data: - current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - current_version = getattr(Config, 'VERSION', 'unknown') - sheet_updates_for_this_batch = [] - batch_results_data.sort(key=lambda x: x['row_num']) - for res_data_item in batch_results_data: - row_num_item = res_data_item['row_num']; result_item = res_data_item['result'] - self.logger.debug(f" Zeile {row_num_item} (aus Batch): Ergebnis aus evaluate_branch_task: {result_item}") - # ... (alle appends für sheet_updates_for_this_batch wie im vorherigen korrekten Vorschlag) - sheet_updates_for_this_batch.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Vorschlag Branche"] + 1)}{row_num_item}', 'values': [[result_item.get("branch", "FEHLER BRANCH")]]}) - sheet_updates_for_this_batch.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Branche Konfidenz"] + 1)}{row_num_item}', 'values': [[result_item.get("confidence", "N/A CONF")]]}) - sheet_updates_for_this_batch.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Konsistenz Branche"] + 1)}{row_num_item}', 'values': [[result_item.get("consistency", "error CONS")]]}) - sheet_updates_for_this_batch.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Chat Begruendung Abweichung Branche"] + 1)}{row_num_item}', 'values': [[result_item.get("justification", "Keine Begr. JUST")]]}) - sheet_updates_for_this_batch.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Timestamp letzte Pruefung"] + 1)}{row_num_item}', 'values': [[current_timestamp]]}) - sheet_updates_for_this_batch.append({'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Version"] + 1)}{row_num_item}', 'values': [[current_version]]}) - - if sheet_updates_for_this_batch: - self.logger.debug(f" Sende Sheet-Update fuer {len(batch_results_data)} Zeilen ({len(sheet_updates_for_this_batch)} Zellen) dieses Batches...") - success = self.sheet_handler.batch_update_cells(sheet_updates_for_this_batch) - if success: self.logger.info(f" Sheet-Update fuer Batch Zeilen {batch_start_row_log}-{batch_end_row_log} erfolgreich.") - - pause_duration = getattr(Config, 'RETRY_DELAY', 5) * 0.8 - self.logger.debug(f"--- Batch Zeilen {batch_start_row_log}-{batch_end_row_log} abgeschlossen. Warte {pause_duration:.2f}s vor naechstem Batch ---") - time.sleep(pause_duration) + # Batch ausführen, wenn voll ODER es die letzte Zeile ist + if len(tasks_for_current_batch) >= processing_batch_size or i == end_sheet_row: + _execute_and_write_batch(tasks_for_current_batch) + tasks_for_current_batch = [] # Batch-Liste für den nächsten Durchlauf leeren - self.logger.info(f"Brancheneinschaetzung (Parallel Batch) abgeschlossen. 
{processed_tasks_in_run} Zeilen verarbeitet, {skipped_count} Zeilen uebersprungen.") + # Sicherstellen, dass ein eventuell nicht voller letzter Batch auch noch verarbeitet wird, + # falls die Schleife durch das Limit beendet wurde und noch Tasks übrig sind. + if tasks_for_current_batch: + self.logger.debug(f"Verarbeite verbleibenden Rest-Batch von {len(tasks_for_current_batch)} Tasks...") + _execute_and_write_batch(tasks_for_current_batch) + + self.logger.info(f"Brancheneinschaetzung (Parallel Batch) abgeschlossen. {processed_tasks_count} Zeilen verarbeitet, {skipped_count} Zeilen uebersprungen.") # ==============================================================================
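Illustration for review (not part of the diff): a minimal standalone sketch of the collect-and-flush pattern the reworked method now follows — collect tasks, flush a full batch through a thread pool whose calls are gated by a semaphore, stop collecting once an optional limit is covered, and flush the remainder after the loop. evaluate_stub, run_batch, process_rows and the constants below are hypothetical stand-ins for evaluate_branch_task, _execute_and_write_batch, process_branch_batch and the Config values; no Google-Sheets or OpenAI I/O happens here, and the end-of-range flush is simplified into the final remainder flush.

# Minimal standalone sketch of the collect-and-flush batching pattern above.
# evaluate_stub(), run_batch() and process_rows() are hypothetical stand-ins;
# no Google-Sheets I/O and no real API call happens here.
import concurrent.futures
import threading
import time

MAX_WORKERS = 10        # cf. Config.MAX_BRANCH_WORKERS
CONCURRENCY_LIMIT = 3   # cf. Config.OPENAI_CONCURRENCY_LIMIT
BATCH_SIZE = 20         # cf. Config.PROCESSING_BRANCH_BATCH_SIZE


def evaluate_stub(task, sem):
    # Stand-in for evaluate_branch_task: the semaphore caps concurrent API calls
    # even though the thread pool may hold more workers.
    with sem:
        time.sleep(0.01)
        return {"row_num": task["row_num"], "result": {"branch": "DUMMY"}, "error": None}


def run_batch(batch):
    # Stand-in for _execute_and_write_batch: one thread pool per batch,
    # collect results as they complete, then sort by row for the sheet write.
    sem = threading.Semaphore(CONCURRENCY_LIMIT)
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {executor.submit(evaluate_stub, t, sem): t for t in batch}
        for future in concurrent.futures.as_completed(futures):
            results.append(future.result())
    results.sort(key=lambda r: r["row_num"])
    return results


def process_rows(rows, limit=None):
    # Stand-in for the main loop: collect tasks, flush when the batch is full,
    # stop collecting once the limit is covered, flush the remainder afterwards.
    pending, processed = [], 0
    for row_num, row in enumerate(rows, start=1):
        if limit is not None and processed + len(pending) >= limit:
            break  # limit already covered by flushed + pending tasks
        pending.append({"row_num": row_num, "data": row})
        if len(pending) >= BATCH_SIZE:
            processed += len(run_batch(pending))
            pending = []
    if pending:  # remainder batch (end of rows or limit hit)
        processed += len(run_batch(pending))
    return processed


if __name__ == "__main__":
    print(process_rows(["row"] * 55, limit=30))  # expected: 30

Running the sketch with 55 rows and limit=30 reports 30 processed tasks: tasks are collected only up to the limit and the pending remainder is flushed exactly once, which is the limit behaviour the reworked loop aims for.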