feat(gtm-architect): Finalize migration and implement web scraping

- Refactors the gtm-architect Dockerfile for a flat, more efficient build process.
- Implements robust web scraping via BeautifulSoup in helpers.py for URL analysis in phase1.
- Makes shared library imports (gspread, pandas, etc.) in helpers.py optional to prevent ModuleNotFoundErrors in microservices.
- Implements the main execution logic in the orchestrator to handle command-line arguments.
- Updates documentation to reflect the new architecture, scraping feature, and dependency handling.
This commit is contained in:
2026-01-03 08:43:53 +00:00
parent 2663d85ae7
commit 302a211239
7 changed files with 282 additions and 64 deletions

View File

@@ -29,11 +29,26 @@ from urllib.parse import urlparse, unquote
from difflib import SequenceMatcher
# Externe Bibliotheken
import gspread
import wikipedia
try:
import gspread
GSPREAD_AVAILABLE = True
except ImportError:
GSPREAD_AVAILABLE = False
gspread = None # Define to avoid runtime errors on reference
try:
import wikipedia
WIKIPEDIA_AVAILABLE = True
except ImportError:
WIKIPEDIA_AVAILABLE = False
wikipedia = None # Define to avoid runtime errors on reference
import requests
from bs4 import BeautifulSoup
import pandas as pd
try:
import pandas as pd
PANDAS_AVAILABLE = True
except ImportError:
PANDAS_AVAILABLE = False
pd = None # Define to avoid runtime errors on reference
# --- KI UMSCHALTUNG: Google Generative AI statt OpenAI ---
try:
@@ -44,8 +59,20 @@ except ImportError:
logging.warning("google-generativeai Bibliothek nicht gefunden. KI-Funktionen deaktiviert.")
# OpenAI Imports entfernen wir oder machen sie optional, um Verwirrung zu vermeiden
import openai
from openai.error import AuthenticationError, OpenAIError, RateLimitError, APIError, Timeout, InvalidRequestError, ServiceUnavailableError
try:
import openai
from openai.error import AuthenticationError, OpenAIError, RateLimitError, APIError, Timeout, InvalidRequestError, ServiceUnavailableError
OPENAI_AVAILABLE = True
except ImportError:
OPENAI_AVAILABLE = False
# Define dummy exception classes so the code doesn't crash if it tries to catch them
class AuthenticationError(Exception): pass
class OpenAIError(Exception): pass
class RateLimitError(Exception): pass
class APIError(Exception): pass
class Timeout(Exception): pass
class InvalidRequestError(Exception): pass
class ServiceUnavailableError(Exception): pass
from config import (Config, BRANCH_MAPPING_FILE, URL_CHECK_MARKER, USER_AGENTS, LOG_DIR)
@@ -106,11 +133,17 @@ def retry_on_failure(func):
decorator_logger.warning(f"Wiederhole Versuch {attempt + 1}/{max_retries_config} fuer '{effective_func_name}'...")
return func(*args, **kwargs)
except (gspread.exceptions.SpreadsheetNotFound, ValueError) as e: # AuthError removed from here as it might be recoverable with new key
decorator_logger.critical(f"❌ ENDGUELTIGER FEHLER bei '{effective_func_name}': Permanentes Problem erkannt. {type(e).__name__} - {str(e)[:150]}...")
raise e
except Exception as e: # Catch all to include Gemini errors
# Define permanent errors that should not be retried
permanent_errors = [ValueError]
if GSPREAD_AVAILABLE:
permanent_errors.append(gspread.exceptions.SpreadsheetNotFound)
if any(isinstance(e, error_type) for error_type in permanent_errors):
decorator_logger.critical(f"❌ ENDGUELTIGER FEHLER bei '{effective_func_name}': Permanentes Problem erkannt. {type(e).__name__} - {str(e)[:150]}...")
raise e
# Handle retryable errors
error_msg = str(e)
error_type = type(e).__name__
@@ -380,6 +413,58 @@ def generate_fsm_pitch(company_name, company_short_name, ki_branche, website_sum
def serp_website_lookup(company_name):
    """Placeholder: SERP-based website lookup is not implemented; always returns "k.A."."""
    return "k.A."
def search_linkedin_contacts(company_name, website, position_query, crm_kurzform, num_results=10):
    """Placeholder: LinkedIn contact search is not implemented; always returns an empty list."""
    return []
def get_website_raw(url, max_length=30000, verify_cert=False):
    """Placeholder: raw website fetch is not implemented; always returns "k.A."."""
    return "k.A."
def scrape_website_details(url): return "k.A." # Placeholder
def scrape_website_details(url):
    """
    Fetch a URL and extract its clean, visible text content.

    Uses ``requests`` + BeautifulSoup:
    - rejects missing/non-http(s) URLs up front,
    - only parses responses whose Content-Type is HTML,
    - strips common non-content tags (scripts, nav, forms, links, ...),
    - collapses whitespace and caps the result at ``max_len`` characters
      to limit downstream token usage.

    Args:
        url: Absolute URL starting with "http"; anything else is rejected.

    Returns:
        The extracted text, or a German human-readable error string when the
        URL is invalid, the fetch fails, or no text could be extracted.
        (Callers distinguish success/failure only by message content.)
    """
    logger = logging.getLogger(__name__)
    # Guard clause: reject None, non-strings, and non-http(s) schemes early.
    if not url or not isinstance(url, str) or not url.startswith('http'):
        logger.warning(f"Ungültige oder fehlende URL für Scraping: {url}")
        return "Keine gültige URL angegeben."
    try:
        # Use a random user-agent to avoid simple bot detection.
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        # NOTE(security): verify=False disables TLS certificate validation —
        # acceptable for best-effort scraping, but review before wider use.
        response = requests.get(url, headers=headers, timeout=getattr(Config, 'REQUEST_TIMEOUT', 15), verify=False)
        response.raise_for_status()
        # Skip anything that is not HTML (PDFs, images, JSON APIs, ...).
        if 'text/html' not in response.headers.get('Content-Type', ''):
            logger.warning(f"Inhalt der URL {url} ist kein HTML.")
            return "Die URL lieferte keinen auswertbaren HTML-Inhalt."
        soup = BeautifulSoup(response.content, 'html.parser')
        # Remove boilerplate / non-content elements before text extraction.
        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']):
            element.decompose()
        # Prefer <body> so head metadata (title, meta tags) is ignored.
        body = soup.find('body')
        if body:
            text = body.get_text(separator=' ', strip=True)
        else:
            text = soup.get_text(separator=' ', strip=True)  # Fallback for unusual HTML structures.
        # Collapse runs of whitespace into single spaces.
        text = re.sub(r'\s+', ' ', text).strip()
        # Cap the content length to a reasonable size for downstream AI usage.
        max_len = 25000
        if len(text) > max_len:
            # BUGFIX: the log previously hard-coded "12,666" although the
            # actual truncation limit is max_len — log the real limit.
            logger.info(f"Inhalt von {url} auf {max_len} Zeichen gekürzt (Original: {len(text)}).")
            text = text[:max_len]
        logger.info(f"Scraping von {url} erfolgreich. Länge: {len(text)} Zeichen.")
        return text if text else "Website-Inhalt konnte nicht extrahiert werden."
    except requests.exceptions.RequestException as e:
        logger.error(f"Fehler beim Abrufen der URL {url}: {e}")
        return f"Fehler: Die URL konnte nicht abgerufen werden. (Grund: {e.__class__.__name__})"
    except Exception as e:
        logger.error(f"Unerwarteter Fehler beim Parsen der URL {url}: {e}")
        return "Fehler: Ein unerwarteter Fehler ist beim Verarbeiten der Website aufgetreten."
def is_valid_wikipedia_article_url(url):
    """Placeholder: Wikipedia URL validation is not implemented; always returns False."""
    return False
def alignment_demo(sheet_handler):
    """Placeholder: sheet alignment demo is not implemented; does nothing and returns None."""
    return None