data_processor.py aktualisiert
This commit is contained in:
@@ -2077,9 +2077,9 @@ class DataProcessor:
|
|||||||
"""
|
"""
|
||||||
self.logger.info(f"Starte Website-Scraping & Summarizing (Batch). Limit: {limit or 'Unbegrenzt'}")
|
self.logger.info(f"Starte Website-Scraping & Summarizing (Batch). Limit: {limit or 'Unbegrenzt'}")
|
||||||
|
|
||||||
# --- 1. Tasks sammeln ---
|
|
||||||
if start_sheet_row is None:
|
if start_sheet_row is None:
|
||||||
start_data_idx = self.sheet_handler.get_start_row_index("Website Scrape Timestamp")
|
start_data_idx = self.sheet_handler.get_start_row_index("Website Scrape Timestamp")
|
||||||
|
if start_data_idx == -1: return # Fehler wurde bereits geloggt
|
||||||
start_sheet_row = start_data_idx + self.sheet_handler._header_rows + 1
|
start_sheet_row = start_data_idx + self.sheet_handler._header_rows + 1
|
||||||
|
|
||||||
if not self.sheet_handler.load_data(): return
|
if not self.sheet_handler.load_data(): return
|
||||||
@@ -2100,12 +2100,12 @@ class DataProcessor:
|
|||||||
self.logger.info("Keine Zeilen gefunden, die Website-Verarbeitung erfordern.")
|
self.logger.info("Keine Zeilen gefunden, die Website-Verarbeitung erfordern.")
|
||||||
return
|
return
|
||||||
|
|
||||||
# --- 2. Worker-Funktion definieren ---
|
|
||||||
def _scrape_worker(task):
|
def _scrape_worker(task):
|
||||||
"""Interne Worker-Funktion. Gibt IMMER ein Dictionary zurück."""
|
"""Interne Worker-Funktion. Gibt IMMER ein Dictionary zurück."""
|
||||||
company_name = task['company_name']
|
company_name = task['company_name']
|
||||||
website_url = task['url']
|
website_url = task['url']
|
||||||
|
|
||||||
|
# Initialisiere das Ergebnis-Dictionary mit Default-Werten
|
||||||
result = {'raw_text': 'k.A.', 'meta_text': 'k.A.', 'summary': 'k.A.', 'url_pruefstatus': 'URL_UNPROCESSED', 'final_url': website_url}
|
result = {'raw_text': 'k.A.', 'meta_text': 'k.A.', 'summary': 'k.A.', 'url_pruefstatus': 'URL_UNPROCESSED', 'final_url': website_url}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -2141,10 +2141,10 @@ class DataProcessor:
|
|||||||
result['url_pruefstatus'] = "URL_SCRAPE_ERROR"
|
result['url_pruefstatus'] = "URL_SCRAPE_ERROR"
|
||||||
return result
|
return result
|
||||||
|
|
||||||
# --- 3. Parallele Ausführung & Ergebnisse sammeln ---
|
|
||||||
all_updates = []
|
all_updates = []
|
||||||
now_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
now_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
|
|
||||||
|
# Stelle sicher, dass der Import am Anfang der Datei steht
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=getattr(Config, 'MAX_SCRAPING_WORKERS', 5)) as executor:
|
with ThreadPoolExecutor(max_workers=getattr(Config, 'MAX_SCRAPING_WORKERS', 5)) as executor:
|
||||||
@@ -2156,23 +2156,20 @@ class DataProcessor:
|
|||||||
try:
|
try:
|
||||||
result_dict = future.result()
|
result_dict = future.result()
|
||||||
|
|
||||||
# Stelle sicher, dass result_dict ein Dictionary ist
|
|
||||||
if not isinstance(result_dict, dict):
|
if not isinstance(result_dict, dict):
|
||||||
self.logger.error(f"Fehlerhaftes Ergebnis für Zeile {row_num}: Worker gab keinen Dictionary zurück. Bekam {type(result_dict)}.")
|
self.logger.error(f"Fehlerhaftes Ergebnis für Zeile {row_num}: Worker gab keinen Dictionary zurück. Bekam {type(result_dict)}. Überspringe Update.")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Bereite die Updates vor
|
all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("CRM Website") + 1)}{row_num}', 'values': [[result_dict.get('final_url')]]})
|
||||||
all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("CRM Website") + 1)}{row_num}', 'values': [[result_dict.get('final_url', '')]]})
|
all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Rohtext") + 1)}{row_num}', 'values': [[result_dict.get('raw_text')]]})
|
||||||
all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Rohtext") + 1)}{row_num}', 'values': [[result_dict.get('raw_text', '')]]})
|
all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Meta-Details") + 1)}{row_num}', 'values': [[result_dict.get('meta_text')]]})
|
||||||
all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Meta-Details") + 1)}{row_num}', 'values': [[result_dict.get('meta_text', '')]]})
|
all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Zusammenfassung") + 1)}{row_num}', 'values': [[result_dict.get('summary')]]})
|
||||||
all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Zusammenfassung") + 1)}{row_num}', 'values': [[result_dict.get('summary', '')]]})
|
all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("URL Prüfstatus") + 1)}{row_num}', 'values': [[result_dict.get('url_pruefstatus')]]})
|
||||||
all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("URL Prüfstatus") + 1)}{row_num}', 'values': [[result_dict.get('url_pruefstatus', '')]]})
|
|
||||||
all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Scrape Timestamp") + 1)}{row_num}', 'values': [[now_timestamp]]})
|
all_updates.append({'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Scrape Timestamp") + 1)}{row_num}', 'values': [[now_timestamp]]})
|
||||||
|
|
||||||
except Exception as e_future:
|
except Exception as e_future:
|
||||||
self.logger.error(f"Fehler beim Abrufen des Ergebnisses für Zeile {row_num}: {e_future}", exc_info=True)
|
self.logger.error(f"Fehler beim Abrufen des Ergebnisses für Zeile {row_num}: {e_future}", exc_info=True)
|
||||||
|
|
||||||
# --- 4. Finales Schreiben ---
|
|
||||||
if all_updates:
|
if all_updates:
|
||||||
self.logger.info(f"Sende Batch-Update für {len(tasks)} verarbeitete Websites...")
|
self.logger.info(f"Sende Batch-Update für {len(tasks)} verarbeitete Websites...")
|
||||||
self.sheet_handler.batch_update_cells(all_updates)
|
self.sheet_handler.batch_update_cells(all_updates)
|
||||||
|
|||||||
Reference in New Issue
Block a user