data_processor.py aktualisiert
This commit is contained in:
@@ -45,7 +45,8 @@ from helpers import (
|
||||
search_linkedin_contacts,
|
||||
is_valid_wikipedia_article_url,
|
||||
verify_wiki_article_chatgpt,
|
||||
generate_fsm_pitch)
|
||||
generate_fsm_pitch,
|
||||
get_col_idx)
|
||||
# Klassen-Imports
|
||||
from google_sheet_handler import GoogleSheetHandler
|
||||
from wikipedia_scraper import WikipediaScraper
|
||||
@@ -237,7 +238,6 @@ class DataProcessor:
|
||||
url_pruefstatus = self._get_cell_value_safe(
|
||||
row_data, "URL Prüfstatus") or ''
|
||||
|
||||
# --- 1. Website Handling (Lookup, Scraping, Summarization, Meta) ---
|
||||
run_website_step = 'web' in steps_to_run
|
||||
website_processing_needed = self._needs_website_processing(
|
||||
row_data, force_reeval)
|
||||
@@ -247,6 +247,9 @@ class DataProcessor:
|
||||
grund_message = "Re-Eval" if force_reeval else "Timestamp (AJ) leer"
|
||||
self.logger.info(
|
||||
f"Zeile {row_num_in_sheet}: Fuehre WEBSITE Schritte aus (Grund: {grund_message})...")
|
||||
|
||||
# WICHTIG: url_pruefstatus muss initialisiert werden, um Fehler zu vermeiden
|
||||
url_pruefstatus = self._get_cell_value_safe(row_data, "URL Prüfstatus")
|
||||
|
||||
if not website_url or website_url.lower() == "k.a.":
|
||||
self.logger.debug(
|
||||
@@ -258,7 +261,7 @@ class DataProcessor:
|
||||
website_url = new_website
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["CRM Website"] + 1)}{row_num_in_sheet}',
|
||||
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("CRM Website") + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[website_url]]})
|
||||
url_pruefstatus = "URL_OK_SERP"
|
||||
@@ -282,16 +285,17 @@ class DataProcessor:
|
||||
url_pruefstatus = "URL_OK_SCRAPED"
|
||||
website_meta_details = scrape_website_details(
|
||||
website_url) or "k.A. (Keine Meta-Details)"
|
||||
# Verbessert: company_name für besseren Kontext übergeben
|
||||
website_summary = summarize_website_content(
|
||||
website_raw) or "k.A. (Keine Zusammenfassung erhalten)"
|
||||
website_raw, company_name) or "k.A. (Keine Zusammenfassung erhalten)"
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Meta-Details"] + 1)}{row_num_in_sheet}',
|
||||
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Meta-Details") + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[website_meta_details]]})
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Zusammenfassung"] + 1)}{row_num_in_sheet}',
|
||||
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Zusammenfassung") + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[website_summary]]})
|
||||
else:
|
||||
@@ -300,12 +304,12 @@ class DataProcessor:
|
||||
website_summary, website_meta_details = "k.A.", "k.A."
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Zusammenfassung"] + 1)}{row_num_in_sheet}',
|
||||
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Zusammenfassung") + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[website_summary]]})
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Meta-Details"] + 1)}{row_num_in_sheet}',
|
||||
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Meta-Details") + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[website_meta_details]]})
|
||||
except Exception as e_scrape_web:
|
||||
@@ -315,17 +319,17 @@ class DataProcessor:
|
||||
url_pruefstatus = "URL_SCRAPE_ERROR"
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Zusammenfassung"] + 1)}{row_num_in_sheet}',
|
||||
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Zusammenfassung") + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[website_summary]]})
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Meta-Details"] + 1)}{row_num_in_sheet}',
|
||||
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Meta-Details") + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[website_meta_details]]})
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Rohtext"] + 1)}{row_num_in_sheet}',
|
||||
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Rohtext") + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[website_raw]]})
|
||||
else:
|
||||
@@ -336,28 +340,28 @@ class DataProcessor:
|
||||
url_pruefstatus = "URL_MISSING"
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Rohtext"] + 1)}{row_num_in_sheet}',
|
||||
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Rohtext") + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[website_raw]]})
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Zusammenfassung"] + 1)}{row_num_in_sheet}',
|
||||
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Zusammenfassung") + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[website_summary]]})
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Meta-Details"] + 1)}{row_num_in_sheet}',
|
||||
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Meta-Details") + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[website_meta_details]]})
|
||||
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["URL Prüfstatus"] + 1)}{row_num_in_sheet}',
|
||||
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("URL Prüfstatus") + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[url_pruefstatus]]})
|
||||
updates.append(
|
||||
{
|
||||
'range': f'{self.sheet_handler._get_col_letter(COLUMN_MAP["Website Scrape Timestamp"] + 1)}{row_num_in_sheet}',
|
||||
'range': f'{self.sheet_handler._get_col_letter(get_col_idx("Website Scrape Timestamp") + 1)}{row_num_in_sheet}',
|
||||
'values': [
|
||||
[now_timestamp]]})
|
||||
|
||||
|
||||
Reference in New Issue
Block a user