data_processor.py aktualisiert
This commit is contained in:
@@ -2096,10 +2096,9 @@ class DataProcessor:
|
|||||||
# Stellen Sie sicher, dass alle benoetigten Spalten in COLUMN_MAP
|
# Stellen Sie sicher, dass alle benoetigten Spalten in COLUMN_MAP
|
||||||
# (Block 1) vorhanden sind
|
# (Block 1) vorhanden sind
|
||||||
required_keys = [
|
required_keys = [
|
||||||
# AR, D, AP, AT, B
|
"Website Rohtext", "CRM Website", "Version", "Website Scrape Timestamp", "CRM Name", "Website Meta-Details"
|
||||||
"Website Rohtext", "CRM Website", "Version", "Website Scrape Timestamp", "CRM Name"
|
|
||||||
]
|
]
|
||||||
# Erstellen Sie ein Dictionary mit Schluesseln und Indizes
|
# Erstellen Sie ein Dictionary mit Schluesseln und den korrekten Indizes
|
||||||
col_indices = {key: COLUMN_MAP.get(key, {}).get('index') for key in required_keys}
|
col_indices = {key: COLUMN_MAP.get(key, {}).get('index') for key in required_keys}
|
||||||
|
|
||||||
# Pruefen Sie, ob alle benoetigten Schluessel in COLUMN_MAP gefunden
|
# Pruefen Sie, ob alle benoetigten Schluessel in COLUMN_MAP gefunden
|
||||||
@@ -2317,12 +2316,13 @@ class DataProcessor:
|
|||||||
batch_sheet_updates = []
|
batch_sheet_updates = []
|
||||||
# Iteriere über die Ergebnisse des finalen Batches
|
# Iteriere über die Ergebnisse des finalen Batches
|
||||||
for row_num, result_dict in scraping_results.items():
|
for row_num, result_dict in scraping_results.items():
|
||||||
# Füge Updates für Rohtext, Meta-Details, Timestamp und Version hinzu
|
# result_dict ist jetzt garantiert ein Dictionary
|
||||||
# Wir verwenden hier die zuvor ermittelten col_indices
|
batch_sheet_updates.extend([
|
||||||
batch_sheet_updates.append({'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Rohtext"] + 1)}{row_num}', 'values': [[result_dict.get('raw_text', 'k.A.')]]})
|
{'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Rohtext"] + 1)}{row_num}', 'values': [[result_dict.get('raw_text', 'k.A.')]]},
|
||||||
batch_sheet_updates.append({'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Meta-Details"] + 1)}{row_num}', 'values': [[result_dict.get('meta_details', 'k.A.')]]})
|
{'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Meta-Details"] + 1)}{row_num}', 'values': [[result_dict.get('meta_details', 'k.A.')]]},
|
||||||
batch_sheet_updates.append({'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Scrape Timestamp"] + 1)}{row_num}', 'values': [[current_timestamp]]})
|
{'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Scrape Timestamp"] + 1)}{row_num}', 'values': [[current_timestamp]]},
|
||||||
batch_sheet_updates.append({'range': f'{self.sheet_handler._get_col_letter(col_indices["Version"] + 1)}{row_num}', 'values': [[current_version]]})
|
{'range': f'{self.sheet_handler._get_col_letter(col_indices["Version"] + 1)}{row_num}', 'values': [[current_version]]}
|
||||||
|
])
|
||||||
|
|
||||||
all_sheet_updates.extend(batch_sheet_updates)
|
all_sheet_updates.extend(batch_sheet_updates)
|
||||||
|
|
||||||
@@ -2401,12 +2401,13 @@ class DataProcessor:
|
|||||||
|
|
||||||
# ANPASSUNG AN NEUE LOGIK
|
# ANPASSUNG AN NEUE LOGIK
|
||||||
for row_num, result_dict in scraping_results.items():
|
for row_num, result_dict in scraping_results.items():
|
||||||
batch_sheet_updates.extend([
|
# result_dict ist jetzt garantiert ein Dictionary
|
||||||
{'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Rohtext"] + 1)}{row_num}', 'values': [[result_dict.get('raw_text', 'k.A.')]]},
|
batch_sheet_updates.extend([
|
||||||
{'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Meta-Details"] + 1)}{row_num}', 'values': [[result_dict.get('meta_details', 'k.A.')]]},
|
{'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Rohtext"] + 1)}{row_num}', 'values': [[result_dict.get('raw_text', 'k.A.')]]},
|
||||||
{'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Scrape Timestamp"] + 1)}{row_num}', 'values': [[current_timestamp]]},
|
{'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Meta-Details"] + 1)}{row_num}', 'values': [[result_dict.get('meta_details', 'k.A.')]]},
|
||||||
{'range': f'{self.sheet_handler._get_col_letter(col_indices["Version"] + 1)}{row_num}', 'values': [[current_version]]}
|
{'range': f'{self.sheet_handler._get_col_letter(col_indices["Website Scrape Timestamp"] + 1)}{row_num}', 'values': [[current_timestamp]]},
|
||||||
])
|
{'range': f'{self.sheet_handler._get_col_letter(col_indices["Version"] + 1)}{row_num}', 'values': [[current_version]]}
|
||||||
|
])
|
||||||
all_sheet_updates.extend(batch_sheet_updates)
|
all_sheet_updates.extend(batch_sheet_updates)
|
||||||
|
|
||||||
# --- Finale Sheet Updates senden ---
|
# --- Finale Sheet Updates senden ---
|
||||||
@@ -2423,26 +2424,22 @@ class DataProcessor:
|
|||||||
|
|
||||||
def _scrape_raw_text_task(self, task_info, scrape_function):
|
def _scrape_raw_text_task(self, task_info, scrape_function):
|
||||||
"""
|
"""
|
||||||
Worker-Funktion, die von jedem Thread im ThreadPoolExecutor ausgeführt wird.
|
Worker-Funktion. Ruft gehärtete Helper auf und gibt IMMER ein Dictionary zurück.
|
||||||
Ruft die übergebene Scraping-Funktion (get_website_raw) auf.
|
|
||||||
Gibt IMMER ein Dictionary zurück, um Abstürze zu vermeiden.
|
|
||||||
"""
|
"""
|
||||||
url = task_info.get('url')
|
url = task_info.get('url')
|
||||||
row_num = task_info.get('row_num')
|
row_num = task_info.get('row_num')
|
||||||
self.logger.debug(f" -> Scrape Task gestartet für Zeile {row_num}: {url}")
|
self.logger.debug(f" -> Scrape Task gestartet für Zeile {row_num}: {url}")
|
||||||
|
|
||||||
try:
|
raw_text_result = get_website_raw(url)
|
||||||
# Hier rufen wir die gehärteten Helper-Funktionen auf
|
# Wir müssen auch die Meta-Details scrapen, da sie jetzt erwartet werden
|
||||||
raw_text_result = get_website_raw(url)
|
meta_details_result = scrape_website_details(url)
|
||||||
meta_details_result = scrape_website_details(url)
|
|
||||||
|
return {
|
||||||
# Gib immer ein Dictionary mit den Ergebnissen zurück
|
'row_num': row_num,
|
||||||
return {
|
'raw_text': raw_text_result,
|
||||||
'row_num': row_num,
|
'meta_details': meta_details_result,
|
||||||
'raw_text': raw_text_result,
|
'error': None # Fehler werden jetzt im Text zurückgegeben
|
||||||
'meta_details': meta_details_result,
|
}
|
||||||
'error': None
|
|
||||||
}
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Dieser Block ist ein zusätzliches Sicherheitsnetz, falls die Helper doch abstürzen
|
# Dieser Block ist ein zusätzliches Sicherheitsnetz, falls die Helper doch abstürzen
|
||||||
self.logger.error(f"FATALER FEHLER im Scraping Worker für Zeile {row_num}: {e}", exc_info=True)
|
self.logger.error(f"FATALER FEHLER im Scraping Worker für Zeile {row_num}: {e}", exc_info=True)
|
||||||
|
|||||||
Reference in New Issue
Block a user