bugfix
This commit is contained in:
@@ -2177,12 +2177,13 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
|
|||||||
"""
|
"""
|
||||||
debug_print(f"Starte Website-Scraping NUR ROHDATEN (Batch) für Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}...")
|
debug_print(f"Starte Website-Scraping NUR ROHDATEN (Batch) für Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}...")
|
||||||
|
|
||||||
|
# --- Lade Daten ---
|
||||||
if not sheet_handler.load_data(): return
|
if not sheet_handler.load_data(): return
|
||||||
all_data = sheet_handler.get_all_data_with_headers()
|
all_data = sheet_handler.get_all_data_with_headers()
|
||||||
if not all_data or len(all_data) <= 5: return
|
if not all_data or len(all_data) <= 5: return
|
||||||
header_rows = 5
|
header_rows = 5
|
||||||
|
|
||||||
# Indizes und Buchstaben
|
# --- Indizes und Buchstaben ---
|
||||||
rohtext_col_key = "Website Rohtext"
|
rohtext_col_key = "Website Rohtext"
|
||||||
rohtext_col_index = COLUMN_MAP.get(rohtext_col_key)
|
rohtext_col_index = COLUMN_MAP.get(rohtext_col_key)
|
||||||
website_col_idx = COLUMN_MAP.get("CRM Website")
|
website_col_idx = COLUMN_MAP.get("CRM Website")
|
||||||
@@ -2193,14 +2194,14 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
|
|||||||
rohtext_col_letter = sheet_handler._get_col_letter(rohtext_col_index + 1)
|
rohtext_col_letter = sheet_handler._get_col_letter(rohtext_col_index + 1)
|
||||||
version_col_letter = sheet_handler._get_col_letter(version_col_idx + 1)
|
version_col_letter = sheet_handler._get_col_letter(version_col_idx + 1)
|
||||||
|
|
||||||
# Worker-Funktion für Scraping
|
# --- Worker-Funktion (unverändert) ---
|
||||||
def scrape_raw_text_task(task_info):
|
def scrape_raw_text_task(task_info):
|
||||||
row_num = task_info['row_num']; url = task_info['url']; raw_text = "k.A."; error = None
|
row_num = task_info['row_num']; url = task_info['url']; raw_text = "k.A."; error = None
|
||||||
try: raw_text = get_website_raw(url)
|
try: raw_text = get_website_raw(url)
|
||||||
except Exception as e: error = f"Scraping Fehler Zeile {row_num}: {e}"; debug_print(error)
|
except Exception as e: error = f"Scraping Fehler Zeile {row_num}: {e}"; debug_print(error)
|
||||||
return {"row_num": row_num, "raw_text": raw_text, "error": error}
|
return {"row_num": row_num, "raw_text": raw_text, "error": error}
|
||||||
|
|
||||||
# Hauptlogik
|
# --- Hauptlogik: Iteriere und sammle Batches ---
|
||||||
tasks_for_processing_batch = []
|
tasks_for_processing_batch = []
|
||||||
all_sheet_updates = []
|
all_sheet_updates = []
|
||||||
total_processed_count = 0
|
total_processed_count = 0
|
||||||
@@ -2208,9 +2209,11 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
|
|||||||
total_skipped_url_count = 0
|
total_skipped_url_count = 0
|
||||||
total_error_count = 0
|
total_error_count = 0
|
||||||
|
|
||||||
|
# --- KORRIGIERT: Hole Konfigurationswerte aus Config ---
|
||||||
processing_batch_size = Config.PROCESSING_BATCH_SIZE
|
processing_batch_size = Config.PROCESSING_BATCH_SIZE
|
||||||
max_scraping_workers = Config.MAX_SCRAPING_WORKERS
|
max_scraping_workers = Config.MAX_SCRAPING_WORKERS
|
||||||
update_batch_row_limit = Config.UPDATE_BATCH_ROW_LIMIT
|
update_batch_row_limit = Config.UPDATE_BATCH_ROW_LIMIT
|
||||||
|
# --- Ende Korrektur ---
|
||||||
empty_values_for_skip = ["", "k.a.", "k.a. (nur cookie-banner erkannt)", "k.a. (fehler)"]
|
empty_values_for_skip = ["", "k.a.", "k.a. (nur cookie-banner erkannt)", "k.a. (fehler)"]
|
||||||
|
|
||||||
for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1):
|
for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1):
|
||||||
@@ -2218,38 +2221,31 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
|
|||||||
if row_index_in_list >= len(all_data): continue
|
if row_index_in_list >= len(all_data): continue
|
||||||
row = all_data[row_index_in_list]
|
row = all_data[row_index_in_list]
|
||||||
|
|
||||||
# --- KORRIGIERTE Prüfung, ob AR schon Inhalt hat ---
|
# Prüfung, ob AR schon Inhalt hat
|
||||||
should_skip = False
|
should_skip = False
|
||||||
cell_value_ar_str_lower = "FEHLER_INDEX" # Wird überschrieben, wenn Index existiert
|
cell_value_ar_str_lower = "FEHLER_INDEX"
|
||||||
if len(row) > rohtext_col_index:
|
if len(row) > rohtext_col_index:
|
||||||
cell_value_ar_str_lower = str(row[rohtext_col_index]).strip().lower()
|
cell_value_ar_str_lower = str(row[rohtext_col_index]).strip().lower()
|
||||||
# Überspringen, wenn der Wert NICHT in der Liste der leeren Werte ist
|
|
||||||
if cell_value_ar_str_lower not in empty_values_for_skip:
|
if cell_value_ar_str_lower not in empty_values_for_skip:
|
||||||
should_skip = True
|
should_skip = True
|
||||||
# else: Spalte existiert nicht -> nicht überspringen (gilt als leer)
|
|
||||||
|
|
||||||
log_debug = (i < start_row_index_in_sheet + 5 or i > end_row_index_in_sheet - 5 or i % 500 == 0 or i in range(10,15)) # Debug-Zeilen anpassen
|
log_debug = (i < start_row_index_in_sheet + 5 or i > end_row_index_in_sheet - 5 or i % 500 == 0 or i in [10, 13])
|
||||||
if log_debug:
|
if log_debug:
|
||||||
debug_print(f"Zeile {i} (Website AR Check): Prüfe Inhalt Spalte {rohtext_col_letter}. Wert='{cell_value_ar_str_lower}'. Überspringen (da schon Inhalt)? -> {should_skip}")
|
debug_print(f"Zeile {i} (Website AR Check): Prüfe Inhalt Spalte {rohtext_col_letter}. Wert='{cell_value_ar_str_lower}'. Überspringen (da schon Inhalt)? -> {should_skip}")
|
||||||
|
|
||||||
if should_skip:
|
if should_skip:
|
||||||
total_skipped_count += 1
|
total_skipped_count += 1
|
||||||
continue
|
continue
|
||||||
# --- Ende AR Prüfung ---
|
|
||||||
|
|
||||||
# --- Zusätzliche Prüfung: URL vorhanden? ---
|
# URL Prüfung
|
||||||
website_url = row[website_col_idx] if len(row) > website_col_idx else ""
|
website_url = row[website_col_idx] if len(row) > website_col_idx else ""
|
||||||
if not website_url or website_url.strip().lower() == "k.a.":
|
if not website_url or website_url.strip().lower() == "k.a.":
|
||||||
# Logge nur, wenn AR tatsächlich leer war (also nicht übersprungen wurde)
|
|
||||||
# debug_print(f"Zeile {i}: AR ist leer/k.A., aber keine gültige URL vorhanden. Überspringe.")
|
|
||||||
total_skipped_url_count += 1
|
total_skipped_url_count += 1
|
||||||
continue
|
continue
|
||||||
# --- Ende URL Prüfung ---
|
|
||||||
|
|
||||||
# Nur wenn AR leer/k.A. war UND eine URL existiert, Task hinzufügen
|
|
||||||
tasks_for_processing_batch.append({"row_num": i, "url": website_url})
|
tasks_for_processing_batch.append({"row_num": i, "url": website_url})
|
||||||
|
|
||||||
# --- Verarbeitungs-Batch ausführen ---
|
# Verarbeitungs-Batch ausführen
|
||||||
if len(tasks_for_processing_batch) >= processing_batch_size or i == end_row_index_in_sheet:
|
if len(tasks_for_processing_batch) >= processing_batch_size or i == end_row_index_in_sheet:
|
||||||
if tasks_for_processing_batch:
|
if tasks_for_processing_batch:
|
||||||
batch_start_row = tasks_for_processing_batch[0]['row_num']
|
batch_start_row = tasks_for_processing_batch[0]['row_num']
|
||||||
@@ -2290,7 +2286,7 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
|
|||||||
tasks_for_processing_batch = [] # Batch leeren
|
tasks_for_processing_batch = [] # Batch leeren
|
||||||
|
|
||||||
# Sheet Updates senden (wenn update_batch_row_limit erreicht)
|
# Sheet Updates senden (wenn update_batch_row_limit erreicht)
|
||||||
if len(all_sheet_updates) >= update_batch_row_limit * 2:
|
if len(all_sheet_updates) >= update_batch_row_limit * 2: # *2 Updates pro Zeile
|
||||||
debug_print(f" Sende gesammelte Sheet-Updates ({len(all_sheet_updates)} Zellen)...")
|
debug_print(f" Sende gesammelte Sheet-Updates ({len(all_sheet_updates)} Zellen)...")
|
||||||
success = sheet_handler.batch_update_cells(all_sheet_updates)
|
success = sheet_handler.batch_update_cells(all_sheet_updates)
|
||||||
if success: debug_print(f" Sheet-Update bis Zeile {i} erfolgreich.")
|
if success: debug_print(f" Sheet-Update bis Zeile {i} erfolgreich.")
|
||||||
@@ -2305,6 +2301,7 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
|
|||||||
debug_print(f"Website-Scraping NUR ROHDATEN abgeschlossen. {total_processed_count} Websites verarbeitet (inkl. Fehler), {total_error_count} Fehler, {total_skipped_count} Zeilen wg. Inhalt übersprungen, {total_skipped_url_count} Zeilen ohne URL übersprungen.")
|
debug_print(f"Website-Scraping NUR ROHDATEN abgeschlossen. {total_processed_count} Websites verarbeitet (inkl. Fehler), {total_error_count} Fehler, {total_skipped_count} Zeilen wg. Inhalt übersprungen, {total_skipped_url_count} Zeilen ohne URL übersprungen.")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# NEUE Funktion process_website_summarization_batch
|
# NEUE Funktion process_website_summarization_batch
|
||||||
def process_website_summarization_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
|
def process_website_summarization_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
|
||||||
"""
|
"""
|
||||||
|
|||||||
Reference in New Issue
Block a user