From e26067d8158d63bd0e2f892ff2daf9bad13d5c14 Mon Sep 17 00:00:00 2001 From: Floke Date: Thu, 24 Apr 2025 17:58:25 +0000 Subject: [PATCH] bugfix --- brancheneinstufung.py | 135 +++++++++++++++++++++++++++++++++--------- 1 file changed, 107 insertions(+), 28 deletions(-) diff --git a/brancheneinstufung.py b/brancheneinstufung.py index a5572d8d..867fd153 100644 --- a/brancheneinstufung.py +++ b/brancheneinstufung.py @@ -3954,45 +3954,124 @@ class DataProcessor: Batch-Prozess NUR für Wikipedia-Verifizierung (Spalten S-U, AX). Findet Startzeile ab erster Zelle mit leerem AX. """ - logging.info(f"Starte Wikipedia-Verifizierungs-Batch. Limit: {limit if limit is not None else 'Unbegrenzt'}") - if not self.sheet_handler.load_data(): return logging.error("FEHLER beim Laden der Daten.") - all_data = self.sheet_handler.get_all_data_with_headers(); header_rows = 5 - if not all_data or len(all_data) <= header_rows: return logging.warning("Keine Daten gefunden.") + logging.info( + f"Starte Wikipedia-Verifizierungs-Batch. Limit: {limit if limit is not None else 'Unbegrenzt'}" + ) + if not self.sheet_handler.load_data(): + return logging.error("FEHLER beim Laden der Daten.") + + all_data = self.sheet_handler.get_all_data_with_headers() + header_rows = 5 + if not all_data or len(all_data) <= header_rows: + return logging.warning("Keine Daten gefunden.") + + # Schlüssel holen und prüfen + timestamp_col_key = "Wiki Verif. Timestamp" + timestamp_col_index = COLUMN_MAP.get(timestamp_col_key) + if timestamp_col_index is None: + return logging.critical(f"FEHLER: Schlüssel '{timestamp_col_key}' fehlt.") - timestamp_col_key = "Wiki Verif. Timestamp"; timestamp_col_index = COLUMN_MAP.get(timestamp_col_key); if timestamp_col_index is None: return logging.critical(f"FEHLER: Schlüssel '{timestamp_col_key}' fehlt.") ts_col_letter = self.sheet_handler._get_col_letter(timestamp_col_index + 1) - start_data_index = self.sheet_handler.get_start_row_index(check_column_key=timestamp_col_key, min_sheet_row=header_rows + 1); if start_data_index == -1: return logging.error(f"FEHLER bei Startzeilensuche auf Spalte '{timestamp_col_key}'."); if start_data_index >= len(self.sheet_handler.get_data()): logging.info("Alle Zeilen mit Timestamp gefüllt. Nichts zu tun."); return + # Erste Zeile finden, in der AX leer ist + start_data_index = self.sheet_handler.get_start_row_index( + check_column_key=timestamp_col_key, + min_sheet_row=header_rows + 1 + ) + if start_data_index == -1: + return logging.error(f"FEHLER bei Startzeilensuche auf Spalte '{timestamp_col_key}'.") + if start_data_index >= len(self.sheet_handler.get_data()): + logging.info("Alle Zeilen mit Timestamp gefüllt. Nichts zu tun.") + return - start_sheet_row = start_data_index + header_rows + 1; total_sheet_rows = len(all_data); end_sheet_row = total_sheet_rows - if limit is not None and limit >= 0: end_sheet_row = min(start_sheet_row + limit - 1, total_sheet_rows); if limit == 0: logging.info("Limit 0."); return - if start_sheet_row > end_sheet_row: logging.warning("Start nach Ende (Limit)."); return + # Bereich festlegen + start_sheet_row = start_data_index + header_rows + 1 + total_sheet_rows = len(all_data) + end_sheet_row = total_sheet_rows - logging.info(f"Verarbeite Sheet-Zeilen {start_sheet_row} bis {end_sheet_row} für Wiki Verifizierung (Batch).") + if limit is not None and limit >= 0: + end_sheet_row = min(start_sheet_row + limit - 1, total_sheet_rows) + if limit == 0: + logging.info("Limit 0.") + return + if start_sheet_row > end_sheet_row: + logging.warning("Start nach Ende (Limit).") + return - batch_size = Config.BATCH_SIZE; current_batch = []; current_row_numbers = []; processed_count = 0 + logging.info( + f"Verarbeite Sheet-Zeilen {start_sheet_row} bis {end_sheet_row} " + "für Wiki-Verifizierung (Batch)." + ) + + batch_size = Config.BATCH_SIZE + current_batch = [] + current_row_numbers = [] + processed_count = 0 for i in range(start_sheet_row, end_sheet_row + 1): - row_index_in_list = i - 1; row = all_data[row_index_in_list] + row_index_in_list = i - 1 + row = all_data[row_index_in_list] - company_name = self._get_cell_value(row, "CRM Name"); crm_desc = self._get_cell_value(row, "CRM Beschreibung") - wiki_url = self._get_cell_value(row, "Wiki URL"); wiki_paragraph = self._get_cell_value(row, "Wiki Absatz") - wiki_categories = self._get_cell_value(row, "Wiki Kategorien") + company_name = self._get_cell_value(row, "CRM Name") + crm_desc = self._get_cell_value(row, "CRM Beschreibung") + wiki_url = self._get_cell_value(row, "Wiki URL") + wiki_paragraph = self._get_cell_value(row, "Wiki Absatz") + wiki_categories = self._get_cell_value(row, "Wiki Kategorien") - if wiki_url != 'k.A.' or wiki_paragraph != 'k.A.' or wiki_categories != 'k.A.': - entry_text = ( f"Eintrag {i}:\n" f" Firmenname: {company_name}\n" f" CRM-Beschreibung: {crm_desc[:200]}...\n" f" Wikipedia-URL: {wiki_url}\n" f" Wiki-Absatz: {wiki_paragraph[:200]}...\n" f" Wiki-Kategorien: {wiki_categories[:200]}...\n" f"----\n" ) - current_batch.append(entry_text); current_row_numbers.append(i); processed_count += 1 + if wiki_url != 'k.A.' or wiki_paragraph != 'k.A.' or wiki_categories != 'k.A.': + entry_text = ( + f"Eintrag {i}:\n" + f" Firmenname: {company_name}\n" + f" CRM-Beschreibung: {crm_desc[:200]}...\n" + f" Wikipedia-URL: {wiki_url}\n" + f" Wiki-Absatz: {wiki_paragraph[:200]}...\n" + f" Wiki-Kategorien: {wiki_categories[:200]}...\n" + "----\n" + ) + current_batch.append(entry_text) + current_row_numbers.append(i) + processed_count += 1 - if len(current_batch) >= batch_size or i == end_sheet_row: - if current_batch: - try: _process_batch(self.sheet_handler.sheet, current_batch, current_row_numbers); # Globale Helferfunktion - wiki_ts_updates = []; current_wiki_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S"); - for row_num in current_row_numbers: wiki_ts_updates.append({'range': f'{ts_col_letter}{row_num}', 'values': [[current_wiki_timestamp]]}) - if wiki_ts_updates: success_ts = self.sheet_handler.batch_update_cells(wiki_ts_updates); if success_ts: logging.debug(f"Wiki Verif. Timestamp {ts_col_letter} für Batch {current_row_numbers[0]}-{current_row_numbers[-1]} gesetzt."); else: logging.error(f"FEHLER beim Setzen des Wiki Verif. Timestamps {ts_col_letter} für Batch."); - except Exception as e_batch: logging.error(f"FEHLER bei Verarbeitung von Batch {current_row_numbers[0]}-{current_row_numbers[-1]} in _process_batch: {e_batch}"); pass - time.sleep(Config.RETRY_DELAY) - current_batch = []; current_row_numbers = [] - logging.info(f"Wikipedia-Verifizierungs-Batch abgeschlossen. {processed_count} Zeilen in Batches verarbeitet.") + if len(current_batch) >= batch_size or i == end_sheet_row: + if current_batch: + try: + _process_batch(self.sheet_handler.sheet, + current_batch, + current_row_numbers) + wiki_ts_updates = [] + current_wiki_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + for row_num in current_row_numbers: + wiki_ts_updates.append({ + 'range': f'{ts_col_letter}{row_num}', + 'values': [[current_wiki_ts]] + }) + if wiki_ts_updates: + success_ts = self.sheet_handler.batch_update_cells(wiki_ts_updates) + if success_ts: + logging.debug( + f"Wiki Verif. Timestamp {ts_col_letter} " + f"für Batch {current_row_numbers[0]}–" + f"{current_row_numbers[-1]} gesetzt." + ) + else: + logging.error( + "FEHLER beim Setzen des Wiki Verif. Timestamps." + ) + except Exception as e_batch: + logging.error( + f"FEHLER bei Verarbeitung von Batch " + f"{current_row_numbers[0]}–" + f"{current_row_numbers[-1]} in _process_batch: {e_batch}" + ) + finally: + time.sleep(Config.RETRY_DELAY) + current_batch = [] + current_row_numbers = [] + + logging.info( + f"Wikipedia-Verifizierungs-Batch abgeschlossen. " + f"{processed_count} Zeilen in Batches verarbeitet." + ) # process_website_batch Methode def process_website_batch(self, limit=None):