bugfix
This commit is contained in:
@@ -2208,6 +2208,7 @@ def _process_batch(sheet, batches, row_numbers):
|
|||||||
# Komplette Funktion process_website_batch (MIT Batched Google Sheet Updates)
|
# Komplette Funktion process_website_batch (MIT Batched Google Sheet Updates)
|
||||||
# Komplette Funktion process_website_batch (NEUE STRUKTUR - ECHTER BATCH WORKFLOW)
|
# Komplette Funktion process_website_batch (NEUE STRUKTUR - ECHTER BATCH WORKFLOW)
|
||||||
# Komplette Funktion process_website_batch (NUR SCRAPING)
|
# Komplette Funktion process_website_batch (NUR SCRAPING)
|
||||||
|
# Komplette Funktion process_website_batch (Korrigierte Config-Referenzen)
|
||||||
def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
|
def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
|
||||||
"""
|
"""
|
||||||
Batch-Prozess NUR für Website-Scraping (Rohtext AR).
|
Batch-Prozess NUR für Website-Scraping (Rohtext AR).
|
||||||
@@ -2216,10 +2217,6 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
|
|||||||
"""
|
"""
|
||||||
debug_print(f"Starte Website-Scraping NUR ROHDATEN (Batch) für Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}...")
|
debug_print(f"Starte Website-Scraping NUR ROHDATEN (Batch) für Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}...")
|
||||||
|
|
||||||
# --- Konfiguration ---
|
|
||||||
MAX_SCRAPING_WORKERS = Config.MAX_SCRAPING_WORKERS # Aus Config holen
|
|
||||||
update_batch_row_limit = Config.UPDATE_BATCH_ROW_LIMIT # Aus Config holen
|
|
||||||
|
|
||||||
# --- Lade Daten ---
|
# --- Lade Daten ---
|
||||||
if not sheet_handler.load_data(): return
|
if not sheet_handler.load_data(): return
|
||||||
all_data = sheet_handler.get_all_data_with_headers()
|
all_data = sheet_handler.get_all_data_with_headers()
|
||||||
@@ -2227,7 +2224,7 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
|
|||||||
header_rows = 5
|
header_rows = 5
|
||||||
|
|
||||||
# --- Indizes und Buchstaben ---
|
# --- Indizes und Buchstaben ---
|
||||||
rohtext_col_key = "Website Rohtext" # Spalte AR
|
rohtext_col_key = "Website Rohtext"
|
||||||
rohtext_col_index = COLUMN_MAP.get(rohtext_col_key)
|
rohtext_col_index = COLUMN_MAP.get(rohtext_col_key)
|
||||||
website_col_idx = COLUMN_MAP.get("CRM Website")
|
website_col_idx = COLUMN_MAP.get("CRM Website")
|
||||||
version_col_idx = COLUMN_MAP.get("Version")
|
version_col_idx = COLUMN_MAP.get("Version")
|
||||||
@@ -2255,6 +2252,11 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
|
|||||||
total_skipped_url_count = 0
|
total_skipped_url_count = 0
|
||||||
total_error_count = 0
|
total_error_count = 0
|
||||||
|
|
||||||
|
# Verwende Werte aus Config
|
||||||
|
processing_batch_size = Config.PROCESSING_BATCH_SIZE
|
||||||
|
max_scraping_workers = Config.MAX_SCRAPING_WORKERS
|
||||||
|
update_batch_row_limit = Config.UPDATE_BATCH_ROW_LIMIT # Annahme: UPDATE_BATCH_ROW_LIMIT ist auch in Config definiert
|
||||||
|
|
||||||
for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1):
|
for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1):
|
||||||
row_index_in_list = i - 1
|
row_index_in_list = i - 1
|
||||||
if row_index_in_list >= len(all_data): continue
|
if row_index_in_list >= len(all_data): continue
|
||||||
@@ -2265,10 +2267,8 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
|
|||||||
cell_value_ar = None
|
cell_value_ar = None
|
||||||
if len(row) > rohtext_col_index:
|
if len(row) > rohtext_col_index:
|
||||||
cell_value_ar = str(row[rohtext_col_index]).strip()
|
cell_value_ar = str(row[rohtext_col_index]).strip()
|
||||||
# Überspringen, wenn NICHT leer oder k.A.
|
|
||||||
if cell_value_ar and cell_value_ar.lower() not in ["", "k.a.", "k.a. (nur cookie-banner erkannt)", "k.a. (fehler)"]:
|
if cell_value_ar and cell_value_ar.lower() not in ["", "k.a.", "k.a. (nur cookie-banner erkannt)", "k.a. (fehler)"]:
|
||||||
should_skip = True
|
should_skip = True
|
||||||
# else: Spalte nicht vorhanden -> nicht überspringen
|
|
||||||
|
|
||||||
# Debug Log
|
# Debug Log
|
||||||
log_debug = (i < start_row_index_in_sheet + 5 or i > end_row_index_in_sheet - 5 or i % 500 == 0)
|
log_debug = (i < start_row_index_in_sheet + 5 or i > end_row_index_in_sheet - 5 or i % 500 == 0)
|
||||||
@@ -2289,16 +2289,18 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
|
|||||||
tasks_for_processing_batch.append({"row_num": i, "url": website_url})
|
tasks_for_processing_batch.append({"row_num": i, "url": website_url})
|
||||||
|
|
||||||
# --- Verarbeitungs-Batch ausführen ---
|
# --- Verarbeitungs-Batch ausführen ---
|
||||||
if len(tasks_for_processing_batch) >= PROCESSING_BATCH_SIZE or i == end_row_index_in_sheet:
|
# HIER KORRIGIERT: Verwende processing_batch_size
|
||||||
|
if len(tasks_for_processing_batch) >= processing_batch_size or i == end_row_index_in_sheet:
|
||||||
if tasks_for_processing_batch:
|
if tasks_for_processing_batch:
|
||||||
batch_start_row = tasks_for_processing_batch[0]['row_num']
|
batch_start_row = tasks_for_processing_batch[0]['row_num']
|
||||||
batch_end_row = tasks_for_processing_batch[-1]['row_num']
|
batch_end_row = tasks_for_processing_batch[-1]['row_num']
|
||||||
batch_task_count = len(tasks_for_current_processing_batch) # Korrigiert
|
batch_task_count = len(tasks_for_processing_batch)
|
||||||
debug_print(f"\n--- Starte Scraping-Batch ({batch_task_count} Tasks, Zeilen {batch_start_row}-{batch_end_row}) ---")
|
debug_print(f"\n--- Starte Scraping-Batch ({batch_task_count} Tasks, Zeilen {batch_start_row}-{batch_end_row}) ---")
|
||||||
|
|
||||||
scraping_results = {}
|
scraping_results = {}
|
||||||
debug_print(f" Scrape {batch_task_count} Websites parallel (max {MAX_SCRAPING_WORKERS} worker)...") # Korrigiert
|
# HIER KORRIGIERT: Verwende max_scraping_workers
|
||||||
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_SCRAPING_WORKERS) as executor:
|
debug_print(f" Scrape {batch_task_count} Websites parallel (max {max_scraping_workers} worker)...")
|
||||||
|
with concurrent.futures.ThreadPoolExecutor(max_workers=max_scraping_workers) as executor:
|
||||||
future_to_task = {executor.submit(scrape_raw_text_task, task): task for task in tasks_for_processing_batch}
|
future_to_task = {executor.submit(scrape_raw_text_task, task): task for task in tasks_for_processing_batch}
|
||||||
for future in concurrent.futures.as_completed(future_to_task):
|
for future in concurrent.futures.as_completed(future_to_task):
|
||||||
task = future_to_task[future]
|
task = future_to_task[future]
|
||||||
@@ -2306,12 +2308,15 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
|
|||||||
result = future.result()
|
result = future.result()
|
||||||
scraping_results[result['row_num']] = result['raw_text']
|
scraping_results[result['row_num']] = result['raw_text']
|
||||||
if result['error']: total_error_count += 1
|
if result['error']: total_error_count += 1
|
||||||
total_processed_count += 1 # Zähle hier jeden Versuch
|
# Zähle erst hier, wenn Ergebnis da ist
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
row_num = task['row_num']; err_msg = f"Generischer Fehler Scraping Task Zeile {row_num}: {exc}"
|
row_num = task['row_num']; err_msg = f"Generischer Fehler Scraping Task Zeile {row_num}: {exc}"
|
||||||
debug_print(err_msg); scraping_results[row_num] = "k.A. (Fehler)"; total_error_count +=1; total_processed_count += 1
|
debug_print(err_msg); scraping_results[row_num] = "k.A. (Fehler)"; total_error_count +=1
|
||||||
|
|
||||||
debug_print(f" Scraping für Batch beendet.")
|
# Zähle hier die Anzahl der tatsächlich bearbeiteten Ergebnisse
|
||||||
|
current_batch_processed_count = len(scraping_results)
|
||||||
|
total_processed_count += current_batch_processed_count
|
||||||
|
debug_print(f" Scraping für Batch beendet. {current_batch_processed_count} Ergebnisse erhalten ({total_error_count} Fehler in diesem Batch).")
|
||||||
|
|
||||||
# --- Sheet Updates vorbereiten (NUR AR und AP) ---
|
# --- Sheet Updates vorbereiten (NUR AR und AP) ---
|
||||||
if scraping_results:
|
if scraping_results:
|
||||||
@@ -2320,7 +2325,7 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
|
|||||||
for row_num, raw_text_res in scraping_results.items():
|
for row_num, raw_text_res in scraping_results.items():
|
||||||
row_updates = [
|
row_updates = [
|
||||||
{'range': f'{rohtext_col_letter}{row_num}', 'values': [[raw_text_res]]},
|
{'range': f'{rohtext_col_letter}{row_num}', 'values': [[raw_text_res]]},
|
||||||
# {'range': f'{ts_col_letter}{row_num}', 'values': [[current_timestamp]]}, # AT wird NICHT mehr gesetzt
|
# KEIN AT Timestamp mehr
|
||||||
{'range': f'{version_col_letter}{row_num}', 'values': [[current_version]]}
|
{'range': f'{version_col_letter}{row_num}', 'values': [[current_version]]}
|
||||||
]
|
]
|
||||||
batch_sheet_updates.extend(row_updates)
|
batch_sheet_updates.extend(row_updates)
|
||||||
@@ -2330,7 +2335,8 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
|
|||||||
tasks_for_processing_batch = []
|
tasks_for_processing_batch = []
|
||||||
|
|
||||||
# --- Sheet Updates senden (wenn update_batch_row_limit erreicht) ---
|
# --- Sheet Updates senden (wenn update_batch_row_limit erreicht) ---
|
||||||
# Hinweis: Diese Logik sendet jetzt seltener, erst wenn genug Updates gesammelt wurden
|
# HIER KORRIGIERT: Verwende update_batch_row_limit
|
||||||
|
# Prüfe die Anzahl der *Zellen* in all_sheet_updates
|
||||||
if len(all_sheet_updates) >= update_batch_row_limit * 2: # *2 weil 2 Updates pro Zeile
|
if len(all_sheet_updates) >= update_batch_row_limit * 2: # *2 weil 2 Updates pro Zeile
|
||||||
debug_print(f" Sende gesammelte Sheet-Updates ({len(all_sheet_updates)} Zellen)...")
|
debug_print(f" Sende gesammelte Sheet-Updates ({len(all_sheet_updates)} Zellen)...")
|
||||||
success = sheet_handler.batch_update_cells(all_sheet_updates)
|
success = sheet_handler.batch_update_cells(all_sheet_updates)
|
||||||
|
|||||||
Reference in New Issue
Block a user