This commit is contained in:
2025-04-17 09:57:44 +00:00
parent 35c8ea21cb
commit a2d5ffcd66

View File

@@ -2172,18 +2172,17 @@ def _process_batch(sheet, batches, row_numbers):
def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet): def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index_in_sheet):
""" """
Batch-Prozess NUR für Website-Scraping (Rohtext AR). Batch-Prozess NUR für Website-Scraping (Rohtext AR).
Lädt Daten neu, prüft Spalte AR auf Inhalt ('', 'k.A.', etc.) und überspringt ggf. Lädt Daten neu, prüft Spalte AR auf leere/k.A.-Werte und überspringt Zeilen mit Inhalt.
Setzt AR + AP für bearbeitete Zeilen. Sendet Updates gebündelt. Setzt AR + AP für bearbeitete Zeilen. Sendet Updates gebündelt.
""" """
debug_print(f"Starte Website-Scraping NUR ROHDATEN (Batch) für Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}...") debug_print(f"Starte Website-Scraping NUR ROHDATEN (Batch) für Zeilen {start_row_index_in_sheet} bis {end_row_index_in_sheet}...")
# --- Lade Daten ---
if not sheet_handler.load_data(): return if not sheet_handler.load_data(): return
all_data = sheet_handler.get_all_data_with_headers() all_data = sheet_handler.get_all_data_with_headers()
if not all_data or len(all_data) <= 5: return if not all_data or len(all_data) <= 5: return
header_rows = 5 header_rows = 5
# --- Indizes und Buchstaben --- # Indizes und Buchstaben
rohtext_col_key = "Website Rohtext" rohtext_col_key = "Website Rohtext"
rohtext_col_index = COLUMN_MAP.get(rohtext_col_key) rohtext_col_index = COLUMN_MAP.get(rohtext_col_key)
website_col_idx = COLUMN_MAP.get("CRM Website") website_col_idx = COLUMN_MAP.get("CRM Website")
@@ -2194,14 +2193,14 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
rohtext_col_letter = sheet_handler._get_col_letter(rohtext_col_index + 1) rohtext_col_letter = sheet_handler._get_col_letter(rohtext_col_index + 1)
version_col_letter = sheet_handler._get_col_letter(version_col_idx + 1) version_col_letter = sheet_handler._get_col_letter(version_col_idx + 1)
# --- Worker-Funktion (unverändert) --- # Worker-Funktion für Scraping
def scrape_raw_text_task(task_info):
    """Fetch the raw website text for a single sheet row.

    ``task_info`` is a dict providing ``'row_num'`` and ``'url'``. The URL is
    passed to ``get_website_raw``; if that call raises, the failure message
    is logged through ``debug_print`` and the placeholder ``"k.A."`` is kept
    as the text.

    Returns a dict with the keys ``row_num``, ``raw_text`` and ``error``
    (``error`` is ``None`` when no exception occurred).
    """
    sheet_row = task_info['row_num']
    target_url = task_info['url']
    # Start from the "nothing scraped" outcome and overwrite it on success,
    # so a failed scrape still yields a well-formed result.
    outcome = {"row_num": sheet_row, "raw_text": "k.A.", "error": None}
    try:
        outcome["raw_text"] = get_website_raw(target_url)
    except Exception as exc:
        # Best-effort worker: record and log the failure instead of letting
        # it propagate to the caller.
        outcome["error"] = f"Scraping Fehler Zeile {sheet_row}: {exc}"
        debug_print(outcome["error"])
    return outcome
# --- Hauptlogik --- # Hauptlogik
tasks_for_processing_batch = [] tasks_for_processing_batch = []
all_sheet_updates = [] all_sheet_updates = []
total_processed_count = 0 total_processed_count = 0
@@ -2209,41 +2208,45 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
total_skipped_url_count = 0 total_skipped_url_count = 0
total_error_count = 0 total_error_count = 0
# Werte aus Config holen
processing_batch_size = Config.PROCESSING_BATCH_SIZE processing_batch_size = Config.PROCESSING_BATCH_SIZE
max_scraping_workers = Config.MAX_SCRAPING_WORKERS max_scraping_workers = Config.MAX_SCRAPING_WORKERS
update_batch_row_limit = Config.UPDATE_BATCH_ROW_LIMIT update_batch_row_limit = Config.UPDATE_BATCH_ROW_LIMIT
empty_values_for_skip = ["", "k.a.", "k.a. (nur cookie-banner erkannt)", "k.a. (fehler)"] # Werte, die als "leer" gelten empty_values_for_skip = ["", "k.a.", "k.a. (nur cookie-banner erkannt)", "k.a. (fehler)"]
for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1): for i in range(start_row_index_in_sheet, end_row_index_in_sheet + 1):
row_index_in_list = i - 1 row_index_in_list = i - 1
if row_index_in_list >= len(all_data): continue if row_index_in_list >= len(all_data): continue
row = all_data[row_index_in_list] row = all_data[row_index_in_list]
# --- Prüfung, ob AR schon Inhalt hat --- # --- KORRIGIERTE Prüfung, ob AR schon Inhalt hat ---
should_skip = False should_skip = False
cell_value_ar_str = "" cell_value_ar_str_lower = "FEHLER_INDEX" # Wird überschrieben, wenn Index existiert
if len(row) > rohtext_col_index: if len(row) > rohtext_col_index:
cell_value_ar_str = str(row[rohtext_col_index]).strip().lower() cell_value_ar_str_lower = str(row[rohtext_col_index]).strip().lower()
if cell_value_ar_str not in empty_values_for_skip: # Überspringen, wenn NICHT in der Liste der leeren Werte # Überspringen, wenn der Wert NICHT in der Liste der leeren Werte ist
if cell_value_ar_str_lower not in empty_values_for_skip:
should_skip = True should_skip = True
# else: Spalte zu kurz -> nicht überspringen (wird als leer behandelt) # else: Spalte existiert nicht -> nicht überspringen (gilt als leer)
log_debug = (i < start_row_index_in_sheet + 5 or i > end_row_index_in_sheet - 5 or i % 500 == 0) log_debug = (i < start_row_index_in_sheet + 5 or i > end_row_index_in_sheet - 5 or i % 500 == 0 or i in range(10,15)) # Debug-Zeilen anpassen
if log_debug: if log_debug:
debug_print(f"Zeile {i} (Website AR Check): Prüfe Inhalt Spalte {rohtext_col_letter}. Wert='{cell_value_ar_str}'. Überspringen? -> {should_skip}") debug_print(f"Zeile {i} (Website AR Check): Prüfe Inhalt Spalte {rohtext_col_letter}. Wert='{cell_value_ar_str_lower}'. Überspringen (da schon Inhalt)? -> {should_skip}")
if should_skip: if should_skip:
total_skipped_count += 1 total_skipped_count += 1
continue continue
# --- Ende AR Prüfung --- # --- Ende AR Prüfung ---
# Gültige URL Prüfung # --- Zusätzliche Prüfung: URL vorhanden? ---
website_url = row[website_col_idx] if len(row) > website_col_idx else "" website_url = row[website_col_idx] if len(row) > website_col_idx else ""
if not website_url or website_url.strip().lower() == "k.a.": if not website_url or website_url.strip().lower() == "k.a.":
# Logge nur, wenn AR tatsächlich leer war (also nicht übersprungen wurde)
# debug_print(f"Zeile {i}: AR ist leer/k.A., aber keine gültige URL vorhanden. Überspringe.")
total_skipped_url_count += 1 total_skipped_url_count += 1
continue continue
# --- Ende URL Prüfung ---
# Nur wenn AR leer/k.A. war UND eine URL existiert, Task hinzufügen
tasks_for_processing_batch.append({"row_num": i, "url": website_url}) tasks_for_processing_batch.append({"row_num": i, "url": website_url})
# --- Verarbeitungs-Batch ausführen --- # --- Verarbeitungs-Batch ausführen ---
@@ -2270,9 +2273,9 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
current_batch_processed_count = len(scraping_results) current_batch_processed_count = len(scraping_results)
total_processed_count += current_batch_processed_count total_processed_count += current_batch_processed_count
debug_print(f" Scraping für Batch beendet. {current_batch_processed_count} Ergebnisse erhalten ({total_error_count} Fehler in diesem Batch).") debug_print(f" Scraping für Batch beendet. {current_batch_processed_count} Ergebnisse erhalten.")
# --- Sheet Updates vorbereiten (NUR AR und AP) --- # Sheet Updates vorbereiten (AR und AP)
if scraping_results: if scraping_results:
current_version = Config.VERSION current_version = Config.VERSION
batch_sheet_updates = [] batch_sheet_updates = []
@@ -2286,15 +2289,15 @@ def process_website_batch(sheet_handler, start_row_index_in_sheet, end_row_index
tasks_for_processing_batch = [] # Batch leeren tasks_for_processing_batch = [] # Batch leeren
# --- Sheet Updates senden (wenn update_batch_row_limit erreicht) --- # Sheet Updates senden (wenn update_batch_row_limit erreicht)
if len(all_sheet_updates) >= update_batch_row_limit * 2: # *2 weil 2 Updates pro Zeile if len(all_sheet_updates) >= update_batch_row_limit * 2:
debug_print(f" Sende gesammelte Sheet-Updates ({len(all_sheet_updates)} Zellen)...") debug_print(f" Sende gesammelte Sheet-Updates ({len(all_sheet_updates)} Zellen)...")
success = sheet_handler.batch_update_cells(all_sheet_updates) success = sheet_handler.batch_update_cells(all_sheet_updates)
if success: debug_print(f" Sheet-Update bis Zeile {i} erfolgreich.") if success: debug_print(f" Sheet-Update bis Zeile {i} erfolgreich.")
else: debug_print(f" FEHLER beim Sheet-Update bis Zeile {i}.") else: debug_print(f" FEHLER beim Sheet-Update bis Zeile {i}.")
all_sheet_updates = [] all_sheet_updates = []
# --- Finale Sheet Updates senden --- # Finale Sheet Updates senden
if all_sheet_updates: if all_sheet_updates:
debug_print(f"Sende finale Sheet-Updates ({len(all_sheet_updates)} Zellen)...") debug_print(f"Sende finale Sheet-Updates ({len(all_sheet_updates)} Zellen)...")
sheet_handler.batch_update_cells(all_sheet_updates) sheet_handler.batch_update_cells(all_sheet_updates)