diff --git a/brancheneinstufung.py b/brancheneinstufung.py index c2cf7e0d..7070be75 100644 --- a/brancheneinstufung.py +++ b/brancheneinstufung.py @@ -2260,17 +2260,18 @@ def _process_batch(sheet, batches, row_numbers): def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet): """ Batch-Prozess NUR für Website-Scraping (Rohtext AR). - Lädt Daten neu, prüft Spalte AR auf leere/k.A.-Werte und überspringt Zeilen mit Inhalt. + Lädt Daten neu, prüft Spalte AR auf Inhalt ('', 'k.A.', etc.) und überspringt Zeilen mit Inhalt. Setzt AR + AP für bearbeitete Zeilen. Sendet Updates gebündelt. """ debug_print(f"Starte Website-Scraping NUR ROHDATEN (Batch) für Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}...") + # --- Lade Daten --- if not sheet_handler.load_data(): return all_data = sheet_handler.get_all_data_with_headers() if not all_data or len(all_data) <= 5: return header_rows = 5 - # Indizes und Buchstaben + # --- Indizes und Buchstaben --- rohtext_col_key = "Website Rohtext" rohtext_col_index = COLUMN_MAP.get(rohtext_col_key) website_col_idx = COLUMN_MAP.get("CRM Website") @@ -2281,14 +2282,14 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index rohtext_col_letter = sheet_handler._get_col_letter(rohtext_col_index + 1) version_col_letter = sheet_handler._get_col_letter(version_col_idx + 1) - # Worker-Funktion (unverändert) + # --- Worker-Funktion für Scraping (unverändert) --- def scrape_raw_text_task(task_info): row_num = task_info['row_num']; url = task_info['url']; raw_text = "k.A."; error = None - try: raw_text = get_website_raw(url) + try: raw_text = get_website_raw(url) # Annahme: get_website_raw ist definiert except Exception as e: error = f"Scraping Fehler Zeile {row_num}: {e}"; debug_print(error) return {"row_num": row_num, "raw_text": raw_text, "error": error} - # Hauptlogik + # --- Hauptlogik: Iteriere und sammle Batches --- tasks_for_processing_batch = [] all_sheet_updates = [] total_processed_count = 0 @@ -2296,10 +2297,10 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index total_skipped_url_count = 0 total_error_count = 0 + # Werte aus Config holen processing_batch_size = Config.PROCESSING_BATCH_SIZE max_scraping_workers = Config.MAX_SCRAPING_WORKERS update_batch_row_limit = Config.UPDATE_BATCH_ROW_LIMIT - # Diese Liste wird weiterhin für die Skip-Logik *innerhalb* der Funktion verwendet empty_values_for_skip = ["", "k.a.", "k.a. (nur cookie-banner erkannt)", "k.a. (fehler)"] for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1): @@ -2307,15 +2308,13 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index if row_index_in_list >= len(all_data): continue row = all_data[row_index_in_list] - # --- Prüfung, ob AR schon Inhalt hat (bleibt gleich!) --- + # --- Prüfung, ob AR schon Inhalt hat --- should_skip = False - cell_value_ar_str_lower = "FEHLER_INDEX" + cell_value_ar_str_lower = "INDEX_FEHLER" if len(row) > rohtext_col_index: cell_value_ar_str_lower = str(row[rohtext_col_index]).strip().lower() - # Überspringen, wenn der Wert NICHT in der Liste der leeren Werte ist if cell_value_ar_str_lower not in empty_values_for_skip: should_skip = True - # else: Spalte existiert nicht -> nicht überspringen log_debug = (i < start_row_index_in_sheet + 5 or i > end_row_index_in_sheet - 5 or i % 500 == 0) if log_debug: @@ -2332,39 +2331,67 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index total_skipped_url_count += 1 continue - # Task hinzufügen, wenn AR leer/k.A. war UND URL vorhanden ist tasks_for_processing_batch.append({"row_num": i, "url": website_url}) - # --- Verarbeitungs-Batch ausführen (Logik unverändert) --- + # --- Verarbeitungs-Batch ausführen --- if len(tasks_for_processing_batch) >= processing_batch_size or i == end_row_index_in_sheet: if tasks_for_processing_batch: - # ... (Paralleles Scraping wie zuvor) ... - # ... (Sheet Updates vorbereiten wie zuvor - nur AR und AP) ... - # ... (Updates sammeln und senden wie zuvor) ... - batch_start_row = tasks_for_processing_batch[0]['row_num']; batch_end_row = tasks_for_processing_batch[-1]['row_num'] - batch_task_count = len(tasks_for_processing_batch); debug_print(f"\n--- Starte Scraping-Batch ({batch_task_count} Tasks, Zeilen {batch_start_row}-{batch_end_row}) ---") - scraping_results = {}; debug_print(f" Scrape {batch_task_count} Websites parallel (max {max_scraping_workers} worker)...") + batch_start_row = tasks_for_processing_batch[0]['row_num'] + batch_end_row = tasks_for_processing_batch[-1]['row_num'] + batch_task_count = len(tasks_for_processing_batch) + debug_print(f"\n--- Starte Scraping-Batch ({batch_task_count} Tasks, Zeilen {batch_start_row}-{batch_end_row}) ---") + + scraping_results = {} + batch_error_count = 0 # Fehlerzähler für diesen spezifischen Batch + debug_print(f" Scrape {batch_task_count} Websites parallel (max {max_scraping_workers} worker)...") with concurrent.futures.ThreadPoolExecutor(max_workers=max_scraping_workers) as executor: - # ... (executor Logik) ... future_to_task = {executor.submit(scrape_raw_text_task, task): task for task in tasks_for_processing_batch} for future in concurrent.futures.as_completed(future_to_task): - # ... (Ergebnisse sammeln) ... - task = future_to_task[future]; try: result = future.result(); scraping_results[result['row_num']] = result['raw_text']; - except Exception as exc: row_num = task['row_num']; err_msg = f"Generischer Fehler Scraping Task Zeile {row_num}: {exc}"; debug_print(err_msg); scraping_results[row_num] = "k.A. (Fehler)"; total_error_count +=1; - current_batch_processed_count = len(scraping_results); total_processed_count += current_batch_processed_count; debug_print(f" Scraping für Batch beendet. {current_batch_processed_count} Ergebnisse erhalten.") + task = future_to_task[future] + # --- KORRIGIERTER TRY-EXCEPT Block --- + try: + result = future.result() + scraping_results[result['row_num']] = result['raw_text'] + if result['error']: + batch_error_count += 1 + total_error_count += 1 + except Exception as exc: + row_num = task['row_num'] + err_msg = f"Generischer Fehler Scraping Task Zeile {row_num}: {exc}" + debug_print(err_msg) + scraping_results[row_num] = "k.A. (Fehler)" + batch_error_count += 1 + total_error_count +=1 + # --- Ende Korrektur --- + + current_batch_processed_count = len(scraping_results) # Anzahl Ergebnisse (inkl. Fehler) + total_processed_count += current_batch_processed_count + debug_print(f" Scraping für Batch beendet. {current_batch_processed_count} Ergebnisse erhalten ({batch_error_count} Fehler in diesem Batch).") + + # Sheet Updates vorbereiten (AR und AP) if scraping_results: - # ... (Sheet Updates vorbereiten - AR/AP) ... - current_version = Config.VERSION; batch_sheet_updates = [] + current_version = Config.VERSION + batch_sheet_updates = [] for row_num, raw_text_res in scraping_results.items(): - row_updates = [{'range': f'{rohtext_col_letter}{row_num}', 'values': [[raw_text_res]]}, {'range': f'{version_col_letter}{row_num}', 'values': [[current_version]]}]; batch_sheet_updates.extend(row_updates) - all_sheet_updates.extend(batch_sheet_updates) + # Updates für AR und AP + row_updates = [ + {'range': f'{rohtext_col_letter}{row_num}', 'values': [[raw_text_res]]}, + {'range': f'{version_col_letter}{row_num}', 'values': [[current_version]]} + ] + batch_sheet_updates.extend(row_updates) + all_sheet_updates.extend(batch_sheet_updates) # Sammle für größeren Batch-Update + tasks_for_processing_batch = [] # Batch leeren - if len(all_sheet_updates) >= update_batch_row_limit * 2: - # ... (Sheet Updates senden) ... - debug_print(f" Sende gesammelte Sheet-Updates ({len(all_sheet_updates)} Zellen)..."); success = sheet_handler.batch_update_cells(all_sheet_updates) - if success: debug_print(f" Sheet-Update bis Zeile {i} erfolgreich.") - else: debug_print(f" FEHLER beim Sheet-Update bis Zeile {i}.") - all_sheet_updates = [] + + # Sheet Updates senden (wenn update_batch_row_limit erreicht) + # Prüfe die Anzahl der *Zeilen*, für die Updates gesammelt wurden + # Da wir jetzt Updates für alle Ergebnisse sammeln, prüfen wir direkt die Länge von all_sheet_updates + if len(all_sheet_updates) >= update_batch_row_limit * 2: # *2 weil 2 Updates pro Zeile + debug_print(f" Sende gesammelte Sheet-Updates ({len(all_sheet_updates)} Zellen)...") + success = sheet_handler.batch_update_cells(all_sheet_updates) + if success: debug_print(f" Sheet-Update bis Zeile {batch_end_row} erfolgreich.") # Logge Endzeile des Batches + else: debug_print(f" FEHLER beim Sheet-Update bis Zeile {batch_end_row}.") + all_sheet_updates = [] # Zurücksetzen nach Senden # Finale Sheet Updates senden if all_sheet_updates: