From d9d33db2eb2e4da588cdc44a973a896fff625a01 Mon Sep 17 00:00:00 2001
From: Floke
Date: Thu, 17 Apr 2025 10:05:07 +0000
Subject: [PATCH] bugfix: read batch limits from Config, fix AR content skip check

---
 brancheneinstufung.py | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/brancheneinstufung.py b/brancheneinstufung.py
index abde347a..40ab1fa3 100644
--- a/brancheneinstufung.py
+++ b/brancheneinstufung.py
@@ -2177,12 +2177,13 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
     """
     debug_print(f"Starting website scraping RAW DATA ONLY (batch) for rows {start_row_index_in_sheet} to {end_row_index_in_sheet}...")

+    # --- Load data ---
     if not sheet_handler.load_data(): return
     all_data = sheet_handler.get_all_data_with_headers()
     if not all_data or len(all_data) <= 5: return
     header_rows = 5

-    # Indices and column letters
+    # --- Indices and column letters ---
     rohtext_col_key = "Website Rohtext"
     rohtext_col_index = COLUMN_MAP.get(rohtext_col_key)
     website_col_idx = COLUMN_MAP.get("CRM Website")
@@ -2193,14 +2194,14 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
     rohtext_col_letter = sheet_handler._get_col_letter(rohtext_col_index + 1)
     version_col_letter = sheet_handler._get_col_letter(version_col_idx + 1)

-    # Worker function for scraping
+    # --- Worker function (unchanged) ---
     def scrape_raw_text_task(task_info):
         row_num = task_info['row_num']; url = task_info['url']; raw_text = "k.A."; error = None
         try: raw_text = get_website_raw(url)
         except Exception as e: error = f"Scraping error in row {row_num}: {e}"; debug_print(error)
         return {"row_num": row_num, "raw_text": raw_text, "error": error}

-    # Main logic
+    # --- Main logic: iterate and collect batches ---
     tasks_for_processing_batch = []
     all_sheet_updates = []
     total_processed_count = 0
@@ -2208,9 +2209,11 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
     total_skipped_url_count = 0
     total_error_count = 0

+    # --- FIXED: read configuration values from Config ---
     processing_batch_size = Config.PROCESSING_BATCH_SIZE
     max_scraping_workers = Config.MAX_SCRAPING_WORKERS
     update_batch_row_limit = Config.UPDATE_BATCH_ROW_LIMIT
+    # --- End of fix ---

     empty_values_for_skip = ["", "k.a.", "k.a. (nur cookie-banner erkannt)", "k.a. (fehler)"]
     for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1):
@@ -2218,38 +2221,31 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
         if row_index_in_list >= len(all_data): continue
         row = all_data[row_index_in_list]

-        # --- FIXED check whether AR already has content ---
+        # Check whether AR already has content
         should_skip = False
-        cell_value_ar_str_lower = "FEHLER_INDEX"  # overwritten if the index exists
+        cell_value_ar_str_lower = "FEHLER_INDEX"
         if len(row) > rohtext_col_index:
             cell_value_ar_str_lower = str(row[rohtext_col_index]).strip().lower()
-            # Skip if the value is NOT in the list of empty values
             if cell_value_ar_str_lower not in empty_values_for_skip:
                 should_skip = True
-        # else: column does not exist -> do not skip (counts as empty)

-        log_debug = (i < start_row_index_in_sheet + 5 or i > end_row_index_in_sheet - 5 or i % 500 == 0 or i in range(10, 15))  # adjust debug rows
+        log_debug = (i < start_row_index_in_sheet + 5 or i > end_row_index_in_sheet - 5 or i % 500 == 0 or i in [10, 13])
         if log_debug: debug_print(f"Row {i} (website AR check): checking content of column {rohtext_col_letter}. Value='{cell_value_ar_str_lower}'. Skip (already has content)? -> {should_skip}")

         if should_skip:
             total_skipped_count += 1
             continue
-        # --- End of AR check ---

-        # --- Additional check: URL present? ---
+        # URL check
         website_url = row[website_col_idx] if len(row) > website_col_idx else ""
         if not website_url or website_url.strip().lower() == "k.a.":
-            # Log only if AR was actually empty (i.e. the row was not skipped)
-            # debug_print(f"Row {i}: AR is empty/k.A., but no valid URL present. Skipping.")
             total_skipped_url_count += 1
             continue
-        # --- End of URL check ---

-        # Add a task only if AR was empty/k.A. AND a URL exists
         tasks_for_processing_batch.append({"row_num": i, "url": website_url})

-        # --- Execute processing batch ---
+        # Execute processing batch
         if len(tasks_for_processing_batch) >= processing_batch_size or i == end_row_index_in_sheet:
             if tasks_for_processing_batch:
                 batch_start_row = tasks_for_processing_batch[0]['row_num']
@@ -2290,7 +2286,7 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
                 tasks_for_processing_batch = []  # clear the batch

             # Send sheet updates (once update_batch_row_limit is reached)
-            if len(all_sheet_updates) >= update_batch_row_limit * 2:
+            if len(all_sheet_updates) >= update_batch_row_limit * 2:  # 2 updates per row
                 debug_print(f" Sending collected sheet updates ({len(all_sheet_updates)} cells)...")
                 success = sheet_handler.batch_update_cells(all_sheet_updates)
                 if success: debug_print(f" Sheet update up to row {i} successful.")
@@ -2305,6 +2301,7 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
     debug_print(f"Website scraping RAW DATA ONLY finished. {total_processed_count} websites processed (incl. errors), {total_error_count} errors, {total_skipped_count} rows skipped (content present), {total_skipped_url_count} rows skipped (no URL).")

+# NEW function: process_website_summarization_batch
 def process_website_summarization_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
     """
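Reviewer note: the hunks above elide the unchanged executor code between collecting tasks_for_processing_batch and flushing all_sheet_updates. Below is a minimal, self-contained sketch of that collect/scrape/flush pattern, assuming a ThreadPoolExecutor drives max_scraping_workers parallel scrapers (the patch itself only shows the Config values being read). scrape_task, flush_updates, process_rows, and the constant values are illustrative stand-ins, not code from brancheneinstufung.py.

from concurrent.futures import ThreadPoolExecutor, as_completed

# Assumed stand-ins for the Config values read in the patch; real values differ.
PROCESSING_BATCH_SIZE = 50    # tasks collected before a scraping batch runs
MAX_SCRAPING_WORKERS = 8      # parallel scraper threads per batch
UPDATE_BATCH_ROW_LIMIT = 100  # buffered rows before a sheet flush

def scrape_task(task):
    # Stand-in for scrape_raw_text_task: never raises, reports errors inline.
    try:
        raw_text = f"<raw text of {task['url']}>"  # placeholder for get_website_raw(url)
        return {"row_num": task["row_num"], "raw_text": raw_text, "error": None}
    except Exception as e:
        return {"row_num": task["row_num"], "raw_text": "k.A.", "error": str(e)}

def flush_updates(updates):
    # Stand-in for sheet_handler.batch_update_cells: one API call per flush.
    print(f"flushing {len(updates)} cell updates")

def process_rows(rows):
    tasks, pending_updates = [], []
    last_row = rows[-1][0] if rows else -1
    for row_num, url in rows:
        tasks.append({"row_num": row_num, "url": url})
        # Run a scraping batch once enough tasks have piled up, or at the last row.
        if len(tasks) >= PROCESSING_BATCH_SIZE or row_num == last_row:
            with ThreadPoolExecutor(max_workers=MAX_SCRAPING_WORKERS) as pool:
                futures = [pool.submit(scrape_task, t) for t in tasks]
                for fut in as_completed(futures):
                    res = fut.result()
                    # Two buffered cells per processed row (raw text + version
                    # marker), hence the "limit * 2" flush threshold below.
                    pending_updates.append((res["row_num"], "rohtext", res["raw_text"]))
                    pending_updates.append((res["row_num"], "version", "v1"))
            tasks = []
        # Flush buffered cell updates to the sheet in one batched call.
        if len(pending_updates) >= UPDATE_BATCH_ROW_LIMIT * 2:
            flush_updates(pending_updates)
            pending_updates = []
    if pending_updates:  # final flush for the tail of the range
        flush_updates(pending_updates)

process_rows([(i, f"https://example.com/{i}") for i in range(6, 260)])

The "limit * 2" flush threshold mirrors the two cells buffered per processed row (raw text plus version marker), matching the "2 updates per row" comment added in the patch.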