This commit is contained in:
2025-04-24 17:58:25 +00:00
parent fa00feb30e
commit 7928c2dcd5

View File

@@ -3954,45 +3954,124 @@ class DataProcessor:
Batch-Prozess NUR für Wikipedia-Verifizierung (Spalten S-U, AX).
Findet Startzeile ab erster Zelle mit leerem AX.
"""
logging.info(f"Starte Wikipedia-Verifizierungs-Batch. Limit: {limit if limit is not None else 'Unbegrenzt'}")
if not self.sheet_handler.load_data(): return logging.error("FEHLER beim Laden der Daten.")
all_data = self.sheet_handler.get_all_data_with_headers(); header_rows = 5
if not all_data or len(all_data) <= header_rows: return logging.warning("Keine Daten gefunden.")
logging.info(
f"Starte Wikipedia-Verifizierungs-Batch. Limit: {limit if limit is not None else 'Unbegrenzt'}"
)
if not self.sheet_handler.load_data():
return logging.error("FEHLER beim Laden der Daten.")
all_data = self.sheet_handler.get_all_data_with_headers()
header_rows = 5
if not all_data or len(all_data) <= header_rows:
return logging.warning("Keine Daten gefunden.")
# Schlüssel holen und prüfen
timestamp_col_key = "Wiki Verif. Timestamp"
timestamp_col_index = COLUMN_MAP.get(timestamp_col_key)
if timestamp_col_index is None:
return logging.critical(f"FEHLER: Schlüssel '{timestamp_col_key}' fehlt.")
timestamp_col_key = "Wiki Verif. Timestamp"; timestamp_col_index = COLUMN_MAP.get(timestamp_col_key); if timestamp_col_index is None: return logging.critical(f"FEHLER: Schlüssel '{timestamp_col_key}' fehlt.")
ts_col_letter = self.sheet_handler._get_col_letter(timestamp_col_index + 1)
start_data_index = self.sheet_handler.get_start_row_index(check_column_key=timestamp_col_key, min_sheet_row=header_rows + 1); if start_data_index == -1: return logging.error(f"FEHLER bei Startzeilensuche auf Spalte '{timestamp_col_key}'."); if start_data_index >= len(self.sheet_handler.get_data()): logging.info("Alle Zeilen mit Timestamp gefüllt. Nichts zu tun."); return
# Erste Zeile finden, in der AX leer ist
start_data_index = self.sheet_handler.get_start_row_index(
check_column_key=timestamp_col_key,
min_sheet_row=header_rows + 1
)
if start_data_index == -1:
return logging.error(f"FEHLER bei Startzeilensuche auf Spalte '{timestamp_col_key}'.")
if start_data_index >= len(self.sheet_handler.get_data()):
logging.info("Alle Zeilen mit Timestamp gefüllt. Nichts zu tun.")
return
start_sheet_row = start_data_index + header_rows + 1; total_sheet_rows = len(all_data); end_sheet_row = total_sheet_rows
if limit is not None and limit >= 0: end_sheet_row = min(start_sheet_row + limit - 1, total_sheet_rows); if limit == 0: logging.info("Limit 0."); return
if start_sheet_row > end_sheet_row: logging.warning("Start nach Ende (Limit)."); return
# Bereich festlegen
start_sheet_row = start_data_index + header_rows + 1
total_sheet_rows = len(all_data)
end_sheet_row = total_sheet_rows
logging.info(f"Verarbeite Sheet-Zeilen {start_sheet_row} bis {end_sheet_row} für Wiki Verifizierung (Batch).")
if limit is not None and limit >= 0:
end_sheet_row = min(start_sheet_row + limit - 1, total_sheet_rows)
if limit == 0:
logging.info("Limit 0.")
return
if start_sheet_row > end_sheet_row:
logging.warning("Start nach Ende (Limit).")
return
batch_size = Config.BATCH_SIZE; current_batch = []; current_row_numbers = []; processed_count = 0
logging.info(
f"Verarbeite Sheet-Zeilen {start_sheet_row} bis {end_sheet_row} "
"für Wiki-Verifizierung (Batch)."
)
batch_size = Config.BATCH_SIZE
current_batch = []
current_row_numbers = []
processed_count = 0
for i in range(start_sheet_row, end_sheet_row + 1):
row_index_in_list = i - 1; row = all_data[row_index_in_list]
row_index_in_list = i - 1
row = all_data[row_index_in_list]
company_name = self._get_cell_value(row, "CRM Name"); crm_desc = self._get_cell_value(row, "CRM Beschreibung")
wiki_url = self._get_cell_value(row, "Wiki URL"); wiki_paragraph = self._get_cell_value(row, "Wiki Absatz")
wiki_categories = self._get_cell_value(row, "Wiki Kategorien")
company_name = self._get_cell_value(row, "CRM Name")
crm_desc = self._get_cell_value(row, "CRM Beschreibung")
wiki_url = self._get_cell_value(row, "Wiki URL")
wiki_paragraph = self._get_cell_value(row, "Wiki Absatz")
wiki_categories = self._get_cell_value(row, "Wiki Kategorien")
if wiki_url != 'k.A.' or wiki_paragraph != 'k.A.' or wiki_categories != 'k.A.':
entry_text = ( f"Eintrag {i}:\n" f" Firmenname: {company_name}\n" f" CRM-Beschreibung: {crm_desc[:200]}...\n" f" Wikipedia-URL: {wiki_url}\n" f" Wiki-Absatz: {wiki_paragraph[:200]}...\n" f" Wiki-Kategorien: {wiki_categories[:200]}...\n" f"----\n" )
current_batch.append(entry_text); current_row_numbers.append(i); processed_count += 1
if wiki_url != 'k.A.' or wiki_paragraph != 'k.A.' or wiki_categories != 'k.A.':
entry_text = (
f"Eintrag {i}:\n"
f" Firmenname: {company_name}\n"
f" CRM-Beschreibung: {crm_desc[:200]}...\n"
f" Wikipedia-URL: {wiki_url}\n"
f" Wiki-Absatz: {wiki_paragraph[:200]}...\n"
f" Wiki-Kategorien: {wiki_categories[:200]}...\n"
"----\n"
)
current_batch.append(entry_text)
current_row_numbers.append(i)
processed_count += 1
if len(current_batch) >= batch_size or i == end_sheet_row:
if current_batch:
try: _process_batch(self.sheet_handler.sheet, current_batch, current_row_numbers); # Globale Helferfunktion
wiki_ts_updates = []; current_wiki_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S");
for row_num in current_row_numbers: wiki_ts_updates.append({'range': f'{ts_col_letter}{row_num}', 'values': [[current_wiki_timestamp]]})
if wiki_ts_updates: success_ts = self.sheet_handler.batch_update_cells(wiki_ts_updates); if success_ts: logging.debug(f"Wiki Verif. Timestamp {ts_col_letter} für Batch {current_row_numbers[0]}-{current_row_numbers[-1]} gesetzt."); else: logging.error(f"FEHLER beim Setzen des Wiki Verif. Timestamps {ts_col_letter} für Batch.");
except Exception as e_batch: logging.error(f"FEHLER bei Verarbeitung von Batch {current_row_numbers[0]}-{current_row_numbers[-1]} in _process_batch: {e_batch}"); pass
time.sleep(Config.RETRY_DELAY)
current_batch = []; current_row_numbers = []
logging.info(f"Wikipedia-Verifizierungs-Batch abgeschlossen. {processed_count} Zeilen in Batches verarbeitet.")
if len(current_batch) >= batch_size or i == end_sheet_row:
if current_batch:
try:
_process_batch(self.sheet_handler.sheet,
current_batch,
current_row_numbers)
wiki_ts_updates = []
current_wiki_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
for row_num in current_row_numbers:
wiki_ts_updates.append({
'range': f'{ts_col_letter}{row_num}',
'values': [[current_wiki_ts]]
})
if wiki_ts_updates:
success_ts = self.sheet_handler.batch_update_cells(wiki_ts_updates)
if success_ts:
logging.debug(
f"Wiki Verif. Timestamp {ts_col_letter} "
f"für Batch {current_row_numbers[0]}"
f"{current_row_numbers[-1]} gesetzt."
)
else:
logging.error(
"FEHLER beim Setzen des Wiki Verif. Timestamps."
)
except Exception as e_batch:
logging.error(
f"FEHLER bei Verarbeitung von Batch "
f"{current_row_numbers[0]}"
f"{current_row_numbers[-1]} in _process_batch: {e_batch}"
)
finally:
time.sleep(Config.RETRY_DELAY)
current_batch = []
current_row_numbers = []
logging.info(
f"Wikipedia-Verifizierungs-Batch abgeschlossen. "
f"{processed_count} Zeilen in Batches verarbeitet."
)
# process_website_batch Methode
def process_website_batch(self, limit=None):