From bcdc045877bc80ed5740ea4efd8d526039435f1a Mon Sep 17 00:00:00 2001
From: Floke
Date: Thu, 17 Apr 2025 09:48:32 +0000
Subject: [PATCH] bugfix: correct Config references in process_website_batch

---
 brancheneinstufung.py | 38 ++++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/brancheneinstufung.py b/brancheneinstufung.py
index 77d1dc5e..22e17be8 100644
--- a/brancheneinstufung.py
+++ b/brancheneinstufung.py
@@ -2208,6 +2208,7 @@ def _process_batch(sheet, batches, row_numbers):
 # Complete function process_website_batch (WITH batched Google Sheet updates)
 # Complete function process_website_batch (NEW STRUCTURE - REAL BATCH WORKFLOW)
 # Complete function process_website_batch (SCRAPING ONLY)
+# Complete function process_website_batch (corrected Config references)
 def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
     """
     Batch process ONLY for website scraping (raw text AR).
@@ -2216,10 +2217,6 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
     """
     debug_print(f"Starting website scraping RAW DATA ONLY (batch) for rows {start_row_index_in_sheet} to {end_row_index_in_sheet}...")
 
-    # --- Configuration ---
-    MAX_SCRAPING_WORKERS = Config.MAX_SCRAPING_WORKERS # taken from Config
-    update_batch_row_limit = Config.UPDATE_BATCH_ROW_LIMIT # taken from Config
-
     # --- Load data ---
     if not sheet_handler.load_data(): return
     all_data = sheet_handler.get_all_data_with_headers()
@@ -2227,7 +2224,7 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
     header_rows = 5
 
     # --- Indices and column letters ---
-    rohtext_col_key = "Website Rohtext" # column AR
+    rohtext_col_key = "Website Rohtext"
     rohtext_col_index = COLUMN_MAP.get(rohtext_col_key)
     website_col_idx = COLUMN_MAP.get("CRM Website")
     version_col_idx = COLUMN_MAP.get("Version")
@@ -2255,6 +2252,11 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
     total_skipped_url_count = 0
     total_error_count = 0
 
+    # Use values from Config
+    processing_batch_size = Config.PROCESSING_BATCH_SIZE
+    max_scraping_workers = Config.MAX_SCRAPING_WORKERS
+    update_batch_row_limit = Config.UPDATE_BATCH_ROW_LIMIT # Assumption: UPDATE_BATCH_ROW_LIMIT is also defined in Config
+
     for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1):
         row_index_in_list = i - 1
         if row_index_in_list >= len(all_data): continue
@@ -2265,10 +2267,8 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
 
         cell_value_ar = None
         if len(row) > rohtext_col_index:
             cell_value_ar = str(row[rohtext_col_index]).strip()
-            # Skip if NOT empty or k.A.
             if cell_value_ar and cell_value_ar.lower() not in ["", "k.a.", "k.a. (nur cookie-banner erkannt)", "k.a. (fehler)"]: should_skip = True
-        # else: column not present -> do not skip
 
         # Debug log
         log_debug = (i < start_row_index_in_sheet + 5 or i > end_row_index_in_sheet - 5 or i % 500 == 0)
@@ -2289,16 +2289,18 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
             tasks_for_processing_batch.append({"row_num": i, "url": website_url})
 
         # --- Run the processing batch ---
-        if len(tasks_for_processing_batch) >= PROCESSING_BATCH_SIZE or i == end_row_index_in_sheet:
+        # FIXED HERE: use processing_batch_size
+        if len(tasks_for_processing_batch) >= processing_batch_size or i == end_row_index_in_sheet:
             if tasks_for_processing_batch:
                 batch_start_row = tasks_for_processing_batch[0]['row_num']
                 batch_end_row = tasks_for_processing_batch[-1]['row_num']
-                batch_task_count = len(tasks_for_current_processing_batch) # corrected
+                batch_task_count = len(tasks_for_processing_batch)
                 debug_print(f"\n--- Starting scraping batch ({batch_task_count} tasks, rows {batch_start_row}-{batch_end_row}) ---")
 
                 scraping_results = {}
-                debug_print(f"  Scraping {batch_task_count} websites in parallel (max {MAX_SCRAPING_WORKERS} workers)...") # corrected
-                with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_SCRAPING_WORKERS) as executor:
+                # FIXED HERE: use max_scraping_workers
+                debug_print(f"  Scraping {batch_task_count} websites in parallel (max {max_scraping_workers} workers)...")
+                with concurrent.futures.ThreadPoolExecutor(max_workers=max_scraping_workers) as executor:
                     future_to_task = {executor.submit(scrape_raw_text_task, task): task for task in tasks_for_processing_batch}
                     for future in concurrent.futures.as_completed(future_to_task):
                         task = future_to_task[future]
@@ -2306,12 +2308,15 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
                             result = future.result()
                             scraping_results[result['row_num']] = result['raw_text']
                             if result['error']: total_error_count += 1
-                            total_processed_count += 1 # count every attempt here
+                            # Only count once the result is in
                         except Exception as exc:
                             row_num = task['row_num']; err_msg = f"Generic error in scraping task, row {row_num}: {exc}"
-                            debug_print(err_msg); scraping_results[row_num] = "k.A. (Fehler)"; total_error_count +=1; total_processed_count += 1
+                            debug_print(err_msg); scraping_results[row_num] = "k.A. (Fehler)"; total_error_count +=1
 
-                debug_print(f"  Scraping for batch finished.")
+                # Count the number of actually processed results here
+                current_batch_processed_count = len(scraping_results)
+                total_processed_count += current_batch_processed_count
+                debug_print(f"  Scraping for batch finished. {current_batch_processed_count} results received ({total_error_count} errors in this batch).")
 
                 # --- Prepare sheet updates (AR and AP ONLY) ---
                 if scraping_results:
@@ -2320,7 +2325,7 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
                     for row_num, raw_text_res in scraping_results.items():
                         row_updates = [
                             {'range': f'{rohtext_col_letter}{row_num}', 'values': [[raw_text_res]]},
-                            # {'range': f'{ts_col_letter}{row_num}', 'values': [[current_timestamp]]}, # AT is NOT set anymore
+                            # NO AT timestamp anymore
                             {'range': f'{version_col_letter}{row_num}', 'values': [[current_version]]}
                         ]
                         batch_sheet_updates.extend(row_updates)
@@ -2330,7 +2335,8 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
                 tasks_for_processing_batch = []
 
         # --- Send sheet updates (once update_batch_row_limit is reached) ---
-        # Note: this logic now sends less often, only once enough updates have been collected
+        # FIXED HERE: use update_batch_row_limit
+        # Check the number of *cells* in all_sheet_updates
         if len(all_sheet_updates) >= update_batch_row_limit * 2: # *2 because 2 updates per row
             debug_print(f"  Sending collected sheet updates ({len(all_sheet_updates)} cells)...")
             success = sheet_handler.batch_update_cells(all_sheet_updates)
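
Note on the Config block: the patch reads all three limits from Config into locals at the top of the function, and its own comment marks UPDATE_BATCH_ROW_LIMIT as an assumption rather than a verified attribute. A minimal sketch of a Config class that would satisfy these reads; the attribute names come from the patch, the values are invented for illustration:

    class Config:
        # Hypothetical values; only the attribute names are taken from the patch.
        PROCESSING_BATCH_SIZE = 50     # rows collected before a scraping batch runs
        MAX_SCRAPING_WORKERS = 10      # worker cap for the ThreadPoolExecutor
        UPDATE_BATCH_ROW_LIMIT = 200   # buffered rows before a sheet flush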
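
Note on the counting fix: moving total_processed_count out of the per-future try/except works because every submitted task leaves exactly one entry in scraping_results (the try branch stores the task's result, the generic except branch stores a "k.A. (Fehler)" placeholder), so len(scraping_results) equals the number of attempts in the batch. A self-contained sketch of that pattern; scrape_raw_text_task's return shape ('row_num', 'raw_text', 'error') is inferred from the patch, and the stand-in body below is invented:

    import concurrent.futures

    def scrape_raw_text_task(task):
        # Stand-in scraper; only the dict shape mirrors the patch.
        return {"row_num": task["row_num"], "raw_text": f"text for {task['url']}", "error": None}

    tasks = [{"row_num": 6, "url": "https://example.com"}, {"row_num": 7, "url": "https://example.org"}]
    scraping_results, error_count = {}, 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_task = {executor.submit(scrape_raw_text_task, t): t for t in tasks}
        for future in concurrent.futures.as_completed(future_to_task):
            task = future_to_task[future]
            try:
                result = future.result()
                scraping_results[result["row_num"]] = result["raw_text"]
                if result["error"]: error_count += 1
            except Exception:
                # A crashed task still yields an entry, so the count stays honest.
                scraping_results[task["row_num"]] = "k.A. (Fehler)"
                error_count += 1
    processed = len(scraping_results)  # one entry per attempted row, as in the patch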
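
Note on the flush threshold: the condition counts buffered cell updates, not rows: each scraped row contributes two entries (the AR raw text and the version cell), hence update_batch_row_limit * 2. Assuming sheet_handler.batch_update_cells wraps gspread (not confirmed by the patch), the buffered list already has the shape gspread's Worksheet.batch_update accepts; a hedged sketch, with the helper name invented:

    def maybe_flush(worksheet, all_sheet_updates, update_batch_row_limit):
        # all_sheet_updates holds dicts like {'range': 'AR123', 'values': [['...']]},
        # two per row, so the cell threshold is the row limit times two.
        if len(all_sheet_updates) >= update_batch_row_limit * 2:
            worksheet.batch_update(all_sheet_updates)  # one API call for all buffered cells
            all_sheet_updates.clear()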