From 1e3c873f906c2cc588da1d02ee7520055cbf4832 Mon Sep 17 00:00:00 2001
From: Floke
Date: Fri, 18 Jul 2025 13:34:26 +0000
Subject: [PATCH] Update data_processor.py

---
 data_processor.py | 95 +++++++++++++++++++----------------------------
 1 file changed, 39 insertions(+), 56 deletions(-)

diff --git a/data_processor.py b/data_processor.py
index 582a6233..7d18e23a 100644
--- a/data_processor.py
+++ b/data_processor.py
@@ -1984,28 +1984,24 @@ class DataProcessor:
         self.logger.info(
             f"Wikipedia verification batch finished. {processed_count} rows processed (added to batch), {skipped_count} rows skipped ({skipped_no_wiki_url} due to missing M-URL).")  # <<< CHANGED
 
-    def _scrape_raw_text_task(self, task_info, get_website_raw_func):
+    def _scrape_raw_text_task(self, task_info, scrape_function):
         """
-        Scrapes the raw text of a website in a separate thread.
+        Worker function for parallel scraping.
+        Adapted to return both the raw text and the meta details.
         """
-        logger = logging.getLogger(__name__ + ".scrape_worker")
-        row_num, url = task_info['row_num'], task_info['url']
-        raw_text, error = "k.A.", None
+        url = task_info.get('url')
+        row_num = task_info.get('row_num')
+        self.logger.debug(f" -> Scrape task started for row {row_num}: {url}")
         try:
-            raw_text = get_website_raw_func(url)
-            if isinstance(raw_text, str) and (raw_text.startswith(
-                    "k.A. (Fehler") or raw_text.startswith("FEHLER:")):
-                error = f"Scraping Fehler: {raw_text[:100]}..."
-            elif not isinstance(raw_text, str) or not raw_text.strip():
-                error = "Scraping Task Fehler: Funktion gab keinen gueltigen String zurueck."
-                raw_text = "k.A. (Extraktion fehlgeschlagen)"
+            # Call both helpers: the injected scrape function for the raw
+            # text, and scrape_website_details for the meta details.
+            raw_text = scrape_function(url)
+            meta_details = scrape_website_details(url)
+            return {'row_num': row_num, 'raw_text': raw_text,
+                    'meta_details': meta_details, 'error': None}
         except Exception as e:
-            error = f"Unerwarteter Fehler im Scraping Task Zeile {row_num}: {e}"
-            logger.error(error)
-            raw_text = "k.A. (Unerwarteter Fehler Task)"
-        return {"row_num": row_num, "raw_text": raw_text, "error": error}
+            self.logger.error(f"Error in scraping worker for row {row_num}: {e}")
+            return {'row_num': row_num, 'raw_text': f'k.A. (Fehler: {e})',
+                    'meta_details': 'k.A.', 'error': True}
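+
+    # Dispatch sketch (an assumption; the actual call site is outside this
+    # hunk): the worker above is designed to be fanned out over a thread
+    # pool, roughly like this:
+    #
+    #   from concurrent.futures import ThreadPoolExecutor, as_completed
+    #   with ThreadPoolExecutor(max_workers=max_workers) as pool:
+    #       futures = [pool.submit(self._scrape_raw_text_task, task, get_website_raw)
+    #                  for task in tasks_for_processing_batch]
+    #       for future in as_completed(futures):
+    #           result = future.result()
+    #           scraping_results[result['row_num']] = result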
 
-    def process_website_scraping_batch(
+    def process_website_scraping(
         self,
         start_sheet_row=None,
         end_sheet_row=None,
@@ -2101,7 +2097,7 @@ class DataProcessor:
             "Website Rohtext", "CRM Website", "Version",
-            "Website Scrape Timestamp", "CRM Name"
+            "Website Scrape Timestamp", "CRM Name",
+            "Website Meta-Details"  # needed below for the meta details column
         ]
         # Build a dictionary mapping the keys to their column indices
-        col_indices = {key: COLUMN_MAP.get(key) for key in required_keys}
+        col_indices = {key: COLUMN_MAP.get(key, {}).get('index') for key in required_keys}
 
         # Check whether all required keys were found in COLUMN_MAP
@@ -2112,11 +2108,11 @@
             return  # abort the method on this critical error
 
         # Determine the indices and letters for the updates (AR, AT, AP)
-        rohtext_col_idx = col_indices["Website Rohtext"]
-        website_col_idx = col_indices["CRM Website"]
-        version_col_idx = col_indices["Version"]
-        timestamp_col_idx = col_indices["Website Scrape Timestamp"]
-        name_col_idx = col_indices["CRM Name"]
+        rohtext_col_idx = col_indices.get("Website Rohtext")
+        website_col_idx = col_indices.get("CRM Website")
+        version_col_idx = col_indices.get("Version")
+        timestamp_col_idx = col_indices.get("Website Scrape Timestamp")
+        name_col_idx = col_indices.get("CRM Name")
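+        # Note (an assumption about the helper's contract): col_indices holds
+        # 0-based indices, while _get_col_letter expects a 1-based column
+        # number, hence the "+ 1" in every call below, e.g.
+        # _get_col_letter(43 + 1) -> 'AR'.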
 
         rohtext_col_letter = self.sheet_handler._get_col_letter(
             rohtext_col_idx + 1)  # Block 14 _get_col_letter
@@ -2312,31 +2308,20 @@ class DataProcessor:
                 # Collect the sheet updates (AR, AT, AP) for this batch.
                 # This now happens after the parallel processing.
-                if scraping_results:
-                    # Current timestamp and version
-                    current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                    current_version = getattr(
-                        Config, 'VERSION', 'unknown')  # Block 1 Config attribute
-                    batch_sheet_updates = []  # updates for THIS specific batch of rows
-
-                    # Iterate over the row numbers in the batch for which results
-                    # exist. Results may contain error values.
-                    for row_num, raw_text_res in scraping_results.items():
-                        # Add the updates for AR, AT and AP (uses internal helpers)
-                        # AR: raw extracted text (may also be an error value)
-                        batch_sheet_updates.append({'range': f'{rohtext_col_letter}{row_num}', 'values': [
-                            [raw_text_res]]})  # Block 1 Column Map
-                        # AT: timestamp of the scraping attempt (always set when
-                        # an attempt was made)
-                        batch_sheet_updates.append({'range': f'{timestamp_col_letter}{row_num}', 'values': [
-                            [current_timestamp]]})  # Block 1 Column Map
-                        # AP: version of the script
-                        batch_sheet_updates.append({'range': f'{version_col_letter}{row_num}', 'values': [
-                            [current_version]]})  # Block 1 Column Map
-
-                    # Collect these batch updates for the larger batch update at
-                    # the end or when the limit is reached.
-                    # update_batch_row_limit is read from Config (Block 1).
-                    all_sheet_updates.extend(batch_sheet_updates)
+                if scraping_results:
+                    current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                    current_version = getattr(Config, 'VERSION', 'unknown')
+                    batch_sheet_updates = []
+                    # Column letter for the meta details; the other letters
+                    # were computed above.
+                    meta_col_letter = self.sheet_handler._get_col_letter(
+                        col_indices["Website Meta-Details"] + 1)
+                    # Iterate over the results of this scraping batch
+                    for row_num, result_dict in scraping_results.items():
+                        # Add the updates for raw text, meta details, timestamp and version
+                        batch_sheet_updates.append({'range': f'{rohtext_col_letter}{row_num}',
+                                                    'values': [[result_dict.get('raw_text', 'k.A.')]]})
+                        batch_sheet_updates.append({'range': f'{meta_col_letter}{row_num}',
+                                                    'values': [[result_dict.get('meta_details', 'k.A.')]]})
+                        batch_sheet_updates.append({'range': f'{timestamp_col_letter}{row_num}',
+                                                    'values': [[current_timestamp]]})
+                        batch_sheet_updates.append({'range': f'{version_col_letter}{row_num}',
+                                                    'values': [[current_version]]})
+
+                    all_sheet_updates.extend(batch_sheet_updates)
 
                 # Clear the scraping batch for the next iteration
                 tasks_for_processing_batch = []
@@ -2424,14 +2409,12 @@ class DataProcessor:
                 batch_sheet_updates = []  # updates for this specific batch
                 # Iterate over the row numbers in the batch for which
                 # results exist.
-                for row_num, raw_text_res in scraping_results.items():
-                    # Add the updates for AR, AT and AP
-                    batch_sheet_updates.append({'range': f'{rohtext_col_letter}{row_num}', 'values': [
-                        [raw_text_res]]})  # Block 1 Column Map
-                    batch_sheet_updates.append({'range': f'{timestamp_col_letter}{row_num}', 'values': [
-                        [current_timestamp]]})  # Block 1 Column Map
-                    batch_sheet_updates.append({'range': f'{version_col_letter}{row_num}', 'values': [
-                        [current_version]]})  # Block 1 Column Map
+                meta_col_letter = self.sheet_handler._get_col_letter(
+                    col_indices["Website Meta-Details"] + 1)
+                for row_num, result_dict in scraping_results.items():
+                    # Add the updates for raw text, meta details, timestamp and version
+                    batch_sheet_updates.append({'range': f'{rohtext_col_letter}{row_num}',
+                                                'values': [[result_dict.get('raw_text', 'k.A.')]]})
+                    batch_sheet_updates.append({'range': f'{meta_col_letter}{row_num}',
+                                                'values': [[result_dict.get('meta_details', 'k.A.')]]})
+                    batch_sheet_updates.append({'range': f'{timestamp_col_letter}{row_num}',
+                                                'values': [[current_timestamp]]})
+                    batch_sheet_updates.append({'range': f'{version_col_letter}{row_num}',
+                                                'values': [[current_version]]})
                 # Add these updates to the global list (then sent only once)
                 all_sheet_updates.extend(batch_sheet_updates)