data_processor.py updated

This commit is contained in:
2025-07-19 18:42:41 +00:00
parent 56fbdded2c
commit a189124de4

View File

@@ -45,7 +45,8 @@ from helpers import (
search_linkedin_contacts,
is_valid_wikipedia_article_url,
verify_wiki_article_chatgpt,
generate_fsm_pitch)
generate_fsm_pitch,
get_col_idx)
# Klassen-Imports
from google_sheet_handler import GoogleSheetHandler
from wikipedia_scraper import WikipediaScraper
@@ -237,7 +238,6 @@ class DataProcessor:
url_pruefstatus = self._get_cell_value_safe(
row_data, "URL Prüfstatus") or ''
# --- 1. Website Handling (Lookup, Scraping, Summarization, Meta) ---
run_website_step = 'web' in steps_to_run
website_processing_needed = self._needs_website_processing(
row_data, force_reeval)
@@ -247,6 +247,9 @@ class DataProcessor:
grund_message = "Re-Eval" if force_reeval else "Timestamp (AJ) leer"
self.logger.info(
f"Zeile {row_num_in_sheet}: Fuehre WEBSITE Schritte aus (Grund: {grund_message})...")
# WICHTIG: url_pruefstatus muss initialisiert werden, um Fehler zu vermeiden
url_pruefstatus = self._get_cell_value_safe(row_data, "URL Prüfstatus")
if not website_url or website_url.lower() == "k.a.":
self.logger.debug(
@@ -258,7 +261,7 @@ class DataProcessor:
website_url = new_website
updates.append(
{
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["CRM Website"] + 1)}{row_num_in_sheet}',
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("CRM Website") + 1)}{row_num_in_sheet}',
'values': [
[website_url]]})
url_pruefstatus = "URL_OK_SERP"
@@ -282,16 +285,17 @@ class DataProcessor:
url_pruefstatus = "URL_OK_SCRAPED"
website_meta_details = scrape_website_details(
website_url) or "k.A. (Keine Meta-Details)"
# Verbessert: company_name für besseren Kontext übergeben
website_summary = summarize_website_content(
website_raw) or "k.A. (Keine Zusammenfassung erhalten)"
website_raw, company_name) or "k.A. (Keine Zusammenfassung erhalten)"
updates.append(
{
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Meta-Details"] + 1)}{row_num_in_sheet}',
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Meta-Details") + 1)}{row_num_in_sheet}',
'values': [
[website_meta_details]]})
updates.append(
{
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Zusammenfassung"] + 1)}{row_num_in_sheet}',
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Zusammenfassung") + 1)}{row_num_in_sheet}',
'values': [
[website_summary]]})
else:
@@ -300,12 +304,12 @@ class DataProcessor:
website_summary, website_meta_details = "k.A.", "k.A."
updates.append(
{
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Zusammenfassung"] + 1)}{row_num_in_sheet}',
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Zusammenfassung") + 1)}{row_num_in_sheet}',
'values': [
[website_summary]]})
updates.append(
{
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Meta-Details"] + 1)}{row_num_in_sheet}',
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Meta-Details") + 1)}{row_num_in_sheet}',
'values': [
[website_meta_details]]})
except Exception as e_scrape_web:
@@ -315,17 +319,17 @@ class DataProcessor:
url_pruefstatus = "URL_SCRAPE_ERROR"
updates.append(
{
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Zusammenfassung"] + 1)}{row_num_in_sheet}',
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Zusammenfassung") + 1)}{row_num_in_sheet}',
'values': [
[website_summary]]})
updates.append(
{
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Meta-Details"] + 1)}{row_num_in_sheet}',
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Meta-Details") + 1)}{row_num_in_sheet}',
'values': [
[website_meta_details]]})
updates.append(
{
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Rohtext"] + 1)}{row_num_in_sheet}',
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Rohtext") + 1)}{row_num_in_sheet}',
'values': [
[website_raw]]})
else:
@@ -336,28 +340,28 @@ class DataProcessor:
url_pruefstatus = "URL_MISSING"
updates.append(
{
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Rohtext"] + 1)}{row_num_in_sheet}',
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Rohtext") + 1)}{row_num_in_sheet}',
'values': [
[website_raw]]})
updates.append(
{
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Zusammenfassung"] + 1)}{row_num_in_sheet}',
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Zusammenfassung") + 1)}{row_num_in_sheet}',
'values': [
[website_summary]]})
updates.append(
{
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Meta-Details"] + 1)}{row_num_in_sheet}',
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Meta-Details") + 1)}{row_num_in_sheet}',
'values': [
[website_meta_details]]})
updates.append(
{
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["URL Prüfstatus"] + 1)}{row_num_in_sheet}',
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("URL Prüfstatus") + 1)}{row_num_in_sheet}',
'values': [
[url_pruefstatus]]})
updates.append(
{
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Scrape Timestamp"] + 1)}{row_num_in_sheet}',
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Scrape Timestamp") + 1)}{row_num_in_sheet}',
'values': [
[now_timestamp]]})