From a189124de4e41edb986680b9688f91bd344d4cdf Mon Sep 17 00:00:00 2001 From: Floke Date: Sat, 19 Jul 2025 18:42:41 +0000 Subject: [PATCH] data_processor.py aktualisiert --- data_processor.py | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/data_processor.py b/data_processor.py index e8b02e0d..c10ed7f4 100644 --- a/data_processor.py +++ b/data_processor.py @@ -45,7 +45,8 @@ from helpers import ( search_linkedin_contacts, is_valid_wikipedia_article_url, verify_wiki_article_chatgpt, - generate_fsm_pitch) + generate_fsm_pitch, + get_col_idx) # Klassen-Imports from google_sheet_handler import GoogleSheetHandler from wikipedia_scraper import WikipediaScraper @@ -237,7 +238,6 @@ class DataProcessor: url_pruefstatus = self._get_cell_value_safe( row_data, "URL Prüfstatus") or '' - # --- 1. Website Handling (Lookup, Scraping, Summarization, Meta) --- run_website_step = 'web' in steps_to_run website_processing_needed = self._needs_website_processing( row_data, force_reeval) @@ -247,6 +247,9 @@ class DataProcessor: grund_message = "Re-Eval" if force_reeval else "Timestamp (AJ) leer" self.logger.info( f"Zeile {row_num_in_sheet}: Fuehre WEBSITE Schritte aus (Grund: {grund_message})...") + + # WICHTIG: url_pruefstatus muss initialisiert werden, um Fehler zu vermeiden + url_pruefstatus = self._get_cell_value_safe(row_data, "URL Prüfstatus") if not website_url or website_url.lower() == "k.a.": self.logger.debug( @@ -258,7 +261,7 @@ class DataProcessor: website_url = new_website updates.append( { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["CRM Website"] + 1)}{row_num_in_sheet}', + 'range': f'{self.sheet_handler._get_col_letter(get_col_idx("CRM Website") + 1)}{row_num_in_sheet}', 'values': [ [website_url]]}) url_pruefstatus = "URL_OK_SERP" @@ -282,16 +285,17 @@ class DataProcessor: url_pruefstatus = "URL_OK_SCRAPED" website_meta_details = scrape_website_details( website_url) or "k.A. (Keine Meta-Details)" + # Verbessert: company_name für besseren Kontext übergeben website_summary = summarize_website_content( - website_raw) or "k.A. (Keine Zusammenfassung erhalten)" + website_raw, company_name) or "k.A. (Keine Zusammenfassung erhalten)" updates.append( { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Meta-Details"] + 1)}{row_num_in_sheet}', + 'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Meta-Details") + 1)}{row_num_in_sheet}', 'values': [ [website_meta_details]]}) updates.append( { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Zusammenfassung"] + 1)}{row_num_in_sheet}', + 'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Zusammenfassung") + 1)}{row_num_in_sheet}', 'values': [ [website_summary]]}) else: @@ -300,12 +304,12 @@ class DataProcessor: website_summary, website_meta_details = "k.A.", "k.A." updates.append( { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Zusammenfassung"] + 1)}{row_num_in_sheet}', + 'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Zusammenfassung") + 1)}{row_num_in_sheet}', 'values': [ [website_summary]]}) updates.append( { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Meta-Details"] + 1)}{row_num_in_sheet}', + 'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Meta-Details") + 1)}{row_num_in_sheet}', 'values': [ [website_meta_details]]}) except Exception as e_scrape_web: @@ -315,17 +319,17 @@ class DataProcessor: url_pruefstatus = "URL_SCRAPE_ERROR" updates.append( { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Zusammenfassung"] + 1)}{row_num_in_sheet}', + 'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Zusammenfassung") + 1)}{row_num_in_sheet}', 'values': [ [website_summary]]}) updates.append( { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Meta-Details"] + 1)}{row_num_in_sheet}', + 'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Meta-Details") + 1)}{row_num_in_sheet}', 'values': [ [website_meta_details]]}) updates.append( { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Rohtext"] + 1)}{row_num_in_sheet}', + 'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Rohtext") + 1)}{row_num_in_sheet}', 'values': [ [website_raw]]}) else: @@ -336,28 +340,28 @@ class DataProcessor: url_pruefstatus = "URL_MISSING" updates.append( { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Rohtext"] + 1)}{row_num_in_sheet}', + 'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Rohtext") + 1)}{row_num_in_sheet}', 'values': [ [website_raw]]}) updates.append( { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Zusammenfassung"] + 1)}{row_num_in_sheet}', + 'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Zusammenfassung") + 1)}{row_num_in_sheet}', 'values': [ [website_summary]]}) updates.append( { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Meta-Details"] + 1)}{row_num_in_sheet}', + 'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Meta-Details") + 1)}{row_num_in_sheet}', 'values': [ [website_meta_details]]}) updates.append( { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["URL Prüfstatus"] + 1)}{row_num_in_sheet}', + 'range': f'{self.sheet_handler._get_col_letter(get_col_idx("URL Prüfstatus") + 1)}{row_num_in_sheet}', 'values': [ [url_pruefstatus]]}) updates.append( { - 'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Scrape Timestamp"] + 1)}{row_num_in_sheet}', + 'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Scrape Timestamp") + 1)}{row_num_in_sheet}', 'values': [ [now_timestamp]]})