bugfix
This commit is contained in:
@@ -2835,94 +2835,94 @@ class DataProcessor:
|
|||||||
|
|
||||||
logging.info(f"Verarbeite Sheet-Zeilen {start_sheet_row} bis {end_sheet_row} für Website Scraping (Batch).")
|
logging.info(f"Verarbeite Sheet-Zeilen {start_sheet_row} bis {end_sheet_row} für Website Scraping (Batch).")
|
||||||
|
|
||||||
# Worker-Funktion für Scraping (Kann global bleiben oder private statische Methode)
|
# Worker-Funktion für Scraping (Kann global bleiben oder private statische Methode)
|
||||||
# Bleibt global, da sie keine self benötigt.
|
# Bleibt global, da sie keine self benötigt.
|
||||||
def scrape_raw_text_task(task_info):
    """Fetch the raw website text for a single sheet row.

    Used as the callable submitted to the thread pool for each scraping
    task. Exceptions raised while scraping are caught, logged and reported
    in the returned dict, so one failing URL does not abort the batch.
    A task dict that lacks a required key still raises ``KeyError``.

    Args:
        task_info: Mapping with the keys ``'row_num'`` (sheet row number)
            and ``'url'`` (website address to scrape).

    Returns:
        dict with the keys ``'row_num'`` (copied from the task),
        ``'raw_text'`` (the value returned by ``get_website_raw(url)``, or
        the placeholder ``"k.A."`` if scraping raised) and ``'error'``
        (``None`` on success, otherwise the logged error message).
    """
    row_num = task_info['row_num']
    url = task_info['url']
    raw_text = "k.A."  # placeholder that is kept when scraping fails
    error = None
    try:
        # get_website_raw is defined elsewhere in the file; the original
        # note assumes it is a global helper that performs its own retries.
        raw_text = get_website_raw(url)
    except Exception as e:
        # Report the failure to the caller instead of raising, and log it
        # from inside the worker.
        error = f"Scraping Fehler Zeile {row_num}: {e}"
        logging.error(error)
    return {"row_num": row_num, "raw_text": raw_text, "error": error}
|
||||||
|
|
||||||
|
|
||||||
tasks_for_processing_batch = []
|
tasks_for_processing_batch = []
|
||||||
all_sheet_updates = []
|
all_sheet_updates = []
|
||||||
processed_count = 0 # Zählt Zeilen, für die Task erstellt wird
|
processed_count = 0 # Zählt Zeilen, für die Task erstellt wird
|
||||||
skipped_url_count = 0
|
skipped_url_count = 0
|
||||||
|
|
||||||
processing_batch_size = Config.PROCESSING_BATCH_SIZE
|
processing_batch_size = Config.PROCESSING_BATCH_SIZE
|
||||||
max_scraping_workers = Config.MAX_SCRAPING_WORKERS
|
max_scraping_workers = Config.MAX_SCRAPING_WORKERS
|
||||||
|
|
||||||
for i in range(start_sheet_row, end_sheet_row + 1):
|
for i in range(start_sheet_row, end_sheet_row + 1):
|
||||||
row_index_in_list = i - 1
|
row_index_in_list = i - 1
|
||||||
row = all_data[row_index_in_list]
|
row = all_data[row_index_in_list]
|
||||||
|
|
||||||
# URL Prüfung (immer nötig, auch wenn AT fehlt)
|
# URL Prüfung (immer nötig, auch wenn AT fehlt)
|
||||||
website_url = row[website_col_idx] if len(row) > website_col_idx else ""
|
website_url = row[website_col_idx] if len(row) > website_col_idx else ""
|
||||||
if not website_url or website_url.strip().lower() == "k.A.":
|
if not website_url or website_url.strip().lower() == "k.A.":
|
||||||
skipped_url_count += 1
|
skipped_url_count += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Kein AT Timestamp -> Task erstellen
|
# Kein AT Timestamp -> Task erstellen
|
||||||
tasks_for_processing_batch.append({"row_num": i, "url": website_url})
|
tasks_for_processing_batch.append({"row_num": i, "url": website_url})
|
||||||
processed_count += 1
|
processed_count += 1
|
||||||
|
|
||||||
# Verarbeitungs-Batch ausführen
|
# Verarbeitungs-Batch ausführen
|
||||||
if len(tasks_for_processing_batch) >= processing_batch_size or i == end_sheet_row:
|
if len(tasks_for_processing_batch) >= processing_batch_size or i == end_sheet_row:
|
||||||
if tasks_for_processing_batch:
|
if tasks_for_processing_batch:
|
||||||
batch_start_row = tasks_for_processing_batch[0]['row_num']
|
batch_start_row = tasks_for_processing_batch[0]['row_num']
|
||||||
batch_end_row = tasks_for_processing_batch[-1]['row_num']
|
batch_end_row = tasks_for_processing_batch[-1]['row_num']
|
||||||
batch_task_count = len(tasks_for_processing_batch)
|
batch_task_count = len(tasks_for_processing_batch)
|
||||||
logging.info(f"\n--- Starte Scraping-Batch ({batch_task_count} Tasks, Zeilen {batch_start_row}-{batch_end_row}) ---")
|
logging.info(f"\n--- Starte Scraping-Batch ({batch_task_count} Tasks, Zeilen {batch_start_row}-{batch_end_row}) ---")
|
||||||
|
|
||||||
scraping_results = {} # {'row_num': raw_text}
|
scraping_results = {} # {'row_num': raw_text}
|
||||||
batch_error_count = 0
|
batch_error_count = 0
|
||||||
logging.info(f" Scrape {batch_task_count} Websites parallel (max {max_scraping_workers} worker)...")
|
logging.info(f" Scrape {batch_task_count} Websites parallel (max {max_scraping_workers} worker)...")
|
||||||
with concurrent.futures.ThreadPoolExecutor(max_workers=max_scraping_workers) as executor:
|
with concurrent.futures.ThreadPoolExecutor(max_workers=max_scraping_workers) as executor:
|
||||||
future_to_task = {executor.submit(scrape_raw_text_task, task): task for task in tasks_for_processing_batch}
|
future_to_task = {executor.submit(scrape_raw_text_task, task): task for task in tasks_for_processing_batch}
|
||||||
for future in concurrent.futures.as_completed(future_to_task):
|
for future in concurrent.futures.as_completed(future_to_task):
|
||||||
task = future_to_task[future]
|
task = future_to_task[future]
|
||||||
try:
|
try:
|
||||||
result = future.result()
|
result = future.result()
|
||||||
scraping_results[result['row_num']] = result['raw_text']
|
scraping_results[result['row_num']] = result['raw_text']
|
||||||
if result['error']: batch_error_count += 1
|
if result['error']: batch_error_count += 1
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
row_num = task['row_num']
|
row_num = task['row_num']
|
||||||
err_msg = f"Generischer Fehler Scraping Task Zeile {row_num}: {exc}"
|
err_msg = f"Generischer Fehler Scraping Task Zeile {row_num}: {exc}"
|
||||||
logging.error(err_msg)
|
logging.error(err_msg)
|
||||||
scraping_results[row_num] = "k.A. (Fehler)"
|
scraping_results[row_num] = "k.A. (Fehler)"
|
||||||
batch_error_count += 1
|
batch_error_count += 1
|
||||||
|
|
||||||
logging.info(f" Scraping für Batch beendet. {len(scraping_results)} Ergebnisse erhalten ({batch_error_count} Fehler in diesem Batch).")
|
logging.info(f" Scraping für Batch beendet. {len(scraping_results)} Ergebnisse erhalten ({batch_error_count} Fehler in diesem Batch).")
|
||||||
|
|
||||||
# Sheet Updates vorbereiten (AR und AT)
|
# Sheet Updates vorbereiten (AR und AT)
|
||||||
if scraping_results:
|
if scraping_results:
|
||||||
current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
batch_sheet_updates = []
|
batch_sheet_updates = []
|
||||||
for row_num, raw_text_res in scraping_results.items():
|
for row_num, raw_text_res in scraping_results.items():
|
||||||
batch_sheet_updates.extend([
|
batch_sheet_updates.extend([
|
||||||
{'range': f'{rohtext_col_letter}{row_num}', 'values': [[raw_text_res]]},
|
{'range': f'{rohtext_col_letter}{row_num}', 'values': [[raw_text_res]]},
|
||||||
{'range': f'{ts_col_letter}{row_num}', 'values': [[current_timestamp]]} # Setze AT Timestamp
|
{'range': f'{ts_col_letter}{row_num}', 'values': [[current_timestamp]]} # Setze AT Timestamp
|
||||||
])
|
])
|
||||||
all_sheet_updates.extend(batch_sheet_updates)
|
all_sheet_updates.extend(batch_sheet_updates)
|
||||||
|
|
||||||
# Sheet Updates senden für diesen Batch
|
# Sheet Updates senden für diesen Batch
|
||||||
if all_sheet_updates:
|
if all_sheet_updates:
|
||||||
logging.info(f" Sende Sheet-Update für {len(all_sheet_updates)} Zellen für Batch {batch_start_row}-{batch_end_row}...")
|
logging.info(f" Sende Sheet-Update für {len(all_sheet_updates)} Zellen für Batch {batch_start_row}-{batch_end_row}...")
|
||||||
success = self.sheet_handler.batch_update_cells(all_sheet_updates)
|
success = self.sheet_handler.batch_update_cells(all_sheet_updates)
|
||||||
if success: logging.info(f" Sheet-Update erfolgreich.")
|
if success: logging.info(f" Sheet-Update erfolgreich.")
|
||||||
else: logging.error(f" FEHLER beim Sheet-Update.")
|
else: logging.error(f" FEHLER beim Sheet-Update.")
|
||||||
all_sheet_updates = [] # Zurücksetzen nach Senden
|
all_sheet_updates = [] # Zurücksetzen nach Senden
|
||||||
|
|
||||||
# Pause nach jedem Batch
|
# Pause nach jedem Batch
|
||||||
logging.debug(" Warte nach Batch...")
|
logging.debug(" Warte nach Batch...")
|
||||||
time.sleep(Config.RETRY_DELAY)
|
time.sleep(Config.RETRY_DELAY)
|
||||||
|
|
||||||
# Finaler Sheet Update Batch senden (falls Reste übrig)
|
# Finaler Sheet Update Batch senden (falls Reste übrig)
|
||||||
if all_sheet_updates:
|
if all_sheet_updates:
|
||||||
logging.info(f"Sende finalen Sheet-Update ({len(all_sheet_updates)} Zellen)...")
|
logging.info(f"Sende finalen Sheet-Update ({len(all_sheet_updates)} Zellen)...")
|
||||||
self.sheet_handler.batch_update_cells(all_sheet_updates)
|
self.sheet_handler.batch_update_cells(all_sheet_updates)
|
||||||
|
|
||||||
logging.info(f"Website-Scraping Batch abgeschlossen. {processed_count} Tasks erstellt, {skipped_url_count} Zeilen ohne URL übersprungen.")
|
logging.info(f"Website-Scraping Batch abgeschlossen. {processed_count} Tasks erstellt, {skipped_url_count} Zeilen ohne URL übersprungen.")
|
||||||
|
|
||||||
|
|
||||||
# process_summarization_batch Methode
|
# process_summarization_batch Methode
|
||||||
@@ -3270,7 +3270,7 @@ class DataProcessor:
|
|||||||
except KeyError as e: logging.critical(f"FEHLER: Benötigte Spalte '{e}' fehlt."); return
|
except KeyError as e: logging.critical(f"FEHLER: Benötigte Spalte '{e}' fehlt."); return
|
||||||
except Exception as e: logging.critical(f"FEHLER beim Holen der Spaltenbuchstaben: {e}"); return
|
except Exception as e: logging.critical(f"FEHLER beim Holen der Spaltenbuchstaben: {e}"); return
|
||||||
|
|
||||||
for i, row in enumerate(data_rows): # <= for-Schleife beginnt hier
|
for i, row in enumerate(data_rows): # <= for-Schleife beginnt hier
|
||||||
row_num_in_sheet = i + header_rows + 1
|
row_num_in_sheet = i + header_rows + 1
|
||||||
|
|
||||||
if limit is not None and rows_processed_count >= limit:
|
if limit is not None and rows_processed_count >= limit:
|
||||||
@@ -3307,7 +3307,7 @@ for i, row in enumerate(data_rows): # <= for-Schleife beginnt hier
|
|||||||
|
|
||||||
# <= Hier endet die for-Schleife. Die folgenden Blöcke müssen auf dieser Ebene (derselben wie for) eingerückt sein.
|
# <= Hier endet die for-Schleife. Die folgenden Blöcke müssen auf dieser Ebene (derselben wie for) eingerückt sein.
|
||||||
|
|
||||||
if updates: # <= DIESER BLOCK muss auf derselben Ebene wie die for-Schleife beginnen
|
if updates: # <= DIESER BLOCK muss auf derselben Ebene wie die for-Schleife beginnen
|
||||||
logging.info(f"Sende Batch-Update für {len(updates)} Zellen ({rows_processed_count} Zeilen geprüft)...")
|
logging.info(f"Sende Batch-Update für {len(updates)} Zellen ({rows_processed_count} Zeilen geprüft)...")
|
||||||
success = self.sheet_handler.batch_update_cells(updates)
|
success = self.sheet_handler.batch_update_cells(updates)
|
||||||
if success:
|
if success:
|
||||||
@@ -3315,10 +3315,10 @@ if updates: # <= DIESER BLOCK muss auf derselben Ebene wie die for-Schleife begi
|
|||||||
else: # <= Dieses else gehört zum if success:
|
else: # <= Dieses else gehört zum if success:
|
||||||
logging.error(f"FEHLER beim Batch-Update.")
|
logging.error(f"FEHLER beim Batch-Update.")
|
||||||
|
|
||||||
else: # <= DIESER BLOCK gehört zum if updates:
|
else: # <= DIESER BLOCK gehört zum if updates:
|
||||||
logging.info("Keine fehlenden Websites gefunden oder keine Updates nötig.")
|
logging.info("Keine fehlenden Websites gefunden oder keine Updates nötig.")
|
||||||
|
|
||||||
logging.info(f"Modus 'website_lookup' abgeschlossen. {rows_processed_count} Zeilen geprüft.") # <= Diese Zeile gehört zur Methode, auf derselben Ebene wie das if updates:
|
logging.info(f"Modus 'website_lookup' abgeschlossen. {rows_processed_count} Zeilen geprüft.") # <= Diese Zeile gehört zur Methode, auf derselben Ebene wie das if updates:
|
||||||
|
|
||||||
|
|
||||||
# process_find_wiki_serp Methode
|
# process_find_wiki_serp Methode
|
||||||
|
|||||||
Reference in New Issue
Block a user