feat(gtm-architect): Finalize migration and implement web scraping
- Refactors the gtm-architect Dockerfile for a flat, more efficient build process.
- Implements robust web scraping via BeautifulSoup in helpers.py for URL analysis in phase1.
- Makes shared library imports (gspread, pandas, etc.) in helpers.py optional to prevent ModuleNotFoundErrors in microservices.
- Implements the main execution logic in the orchestrator to handle command-line arguments.
- Updates documentation to reflect the new architecture, scraping feature, and dependency handling.
This commit is contained in:
105
helpers.py
105
helpers.py
@@ -29,11 +29,26 @@ from urllib.parse import urlparse, unquote
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
# Externe Bibliotheken
|
||||
import gspread
|
||||
import wikipedia
|
||||
try:
|
||||
import gspread
|
||||
GSPREAD_AVAILABLE = True
|
||||
except ImportError:
|
||||
GSPREAD_AVAILABLE = False
|
||||
gspread = None # Define to avoid runtime errors on reference
|
||||
try:
|
||||
import wikipedia
|
||||
WIKIPEDIA_AVAILABLE = True
|
||||
except ImportError:
|
||||
WIKIPEDIA_AVAILABLE = False
|
||||
wikipedia = None # Define to avoid runtime errors on reference
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import pandas as pd
|
||||
try:
|
||||
import pandas as pd
|
||||
PANDAS_AVAILABLE = True
|
||||
except ImportError:
|
||||
PANDAS_AVAILABLE = False
|
||||
pd = None # Define to avoid runtime errors on reference
|
||||
|
||||
# --- KI UMSCHALTUNG: Google Generative AI statt OpenAI ---
|
||||
try:
|
||||
@@ -44,8 +59,20 @@ except ImportError:
|
||||
logging.warning("google-generativeai Bibliothek nicht gefunden. KI-Funktionen deaktiviert.")
|
||||
|
||||
# OpenAI Imports entfernen wir oder machen sie optional, um Verwirrung zu vermeiden
|
||||
import openai
|
||||
from openai.error import AuthenticationError, OpenAIError, RateLimitError, APIError, Timeout, InvalidRequestError, ServiceUnavailableError
|
||||
try:
|
||||
import openai
|
||||
from openai.error import AuthenticationError, OpenAIError, RateLimitError, APIError, Timeout, InvalidRequestError, ServiceUnavailableError
|
||||
OPENAI_AVAILABLE = True
|
||||
except ImportError:
|
||||
OPENAI_AVAILABLE = False
|
||||
# Define dummy exception classes so the code doesn't crash if it tries to catch them
|
||||
class AuthenticationError(Exception): pass
|
||||
class OpenAIError(Exception): pass
|
||||
class RateLimitError(Exception): pass
|
||||
class APIError(Exception): pass
|
||||
class Timeout(Exception): pass
|
||||
class InvalidRequestError(Exception): pass
|
||||
class ServiceUnavailableError(Exception): pass
|
||||
|
||||
from config import (Config, BRANCH_MAPPING_FILE, URL_CHECK_MARKER, USER_AGENTS, LOG_DIR)
|
||||
|
||||
@@ -106,11 +133,17 @@ def retry_on_failure(func):
|
||||
decorator_logger.warning(f"Wiederhole Versuch {attempt + 1}/{max_retries_config} fuer '{effective_func_name}'...")
|
||||
return func(*args, **kwargs)
|
||||
|
||||
except (gspread.exceptions.SpreadsheetNotFound, ValueError) as e: # AuthError removed from here as it might be recoverable with new key
|
||||
decorator_logger.critical(f"❌ ENDGUELTIGER FEHLER bei '{effective_func_name}': Permanentes Problem erkannt. {type(e).__name__} - {str(e)[:150]}...")
|
||||
raise e
|
||||
|
||||
except Exception as e: # Catch all to include Gemini errors
|
||||
# Define permanent errors that should not be retried
|
||||
permanent_errors = [ValueError]
|
||||
if GSPREAD_AVAILABLE:
|
||||
permanent_errors.append(gspread.exceptions.SpreadsheetNotFound)
|
||||
|
||||
if any(isinstance(e, error_type) for error_type in permanent_errors):
|
||||
decorator_logger.critical(f"❌ ENDGUELTIGER FEHLER bei '{effective_func_name}': Permanentes Problem erkannt. {type(e).__name__} - {str(e)[:150]}...")
|
||||
raise e
|
||||
|
||||
# Handle retryable errors
|
||||
error_msg = str(e)
|
||||
error_type = type(e).__name__
|
||||
|
||||
@@ -380,6 +413,58 @@ def generate_fsm_pitch(company_name, company_short_name, ki_branche, website_sum
|
||||
def serp_website_lookup(company_name):
    """Placeholder: SERP-based website lookup is not implemented yet; always returns "k.A."."""
    return "k.A."
|
||||
def search_linkedin_contacts(company_name, website, position_query, crm_kurzform, num_results=10):
    """Placeholder: LinkedIn contact search is not implemented yet; always returns no hits."""
    return []
|
||||
def get_website_raw(url, max_length=30000, verify_cert=False):
    """Placeholder: raw website fetch is not implemented yet; always returns "k.A."."""
    return "k.A."
|
||||
def scrape_website_details(url): return "k.A." # Placeholder
|
||||
def scrape_website_details(url):
    """
    Fetch a URL and extract clean, readable text content via requests + BeautifulSoup.

    - Strips common boilerplate tags (scripts, styles, navigation, forms, links, ...).
    - Extracts text from <body> when present, falling back to the whole document.
    - Truncates the result to ``max_len`` characters to limit token usage.

    Args:
        url: Absolute http(s) URL to scrape.

    Returns:
        The extracted plain text, or a (German) status/error message string when
        the URL is invalid, non-HTML, unreachable, or yields no text. Never raises.
    """
    logger = logging.getLogger(__name__)
    # Guard clause: reject missing, non-string, or non-http(s) URLs up front.
    if not url or not isinstance(url, str) or not url.startswith('http'):
        logger.warning(f"Ungültige oder fehlende URL für Scraping: {url}")
        return "Keine gültige URL angegeben."

    try:
        # Use a random user-agent to avoid simple bot detection
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        # NOTE(review): verify=False disables TLS certificate validation.
        # Acceptable only for best-effort scraping of arbitrary sites —
        # confirm this is intentional and never used for sensitive requests.
        response = requests.get(url, headers=headers, timeout=getattr(Config, 'REQUEST_TIMEOUT', 15), verify=False)
        response.raise_for_status()

        # Check content type to avoid parsing non-HTML content (PDFs, images, ...)
        if 'text/html' not in response.headers.get('Content-Type', ''):
            logger.warning(f"Inhalt der URL {url} ist kein HTML.")
            return "Die URL lieferte keinen auswertbaren HTML-Inhalt."

        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove boilerplate / non-content elements before text extraction
        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']):
            element.decompose()

        # Prefer the <body> so <head> metadata is ignored
        body = soup.find('body')
        if body:
            text = body.get_text(separator=' ', strip=True)
        else:
            text = soup.get_text(separator=' ', strip=True)  # Fallback for unusual HTML structures

        # Collapse runs of whitespace into single spaces
        text = re.sub(r'\s+', ' ', text).strip()

        # Limit the content length to a reasonable size to cap token usage
        max_len = 25000
        if len(text) > max_len:
            # BUGFIX: the log message previously hard-coded "14,446 Zeichen"
            # although the actual truncation limit is max_len (25000);
            # report the real length the text is cut to.
            logger.info(f"Inhalt von {url} auf {max_len} Zeichen gekürzt (Original: {len(text)}).")
            text = text[:max_len]

        logger.info(f"Scraping von {url} erfolgreich. Länge: {len(text)} Zeichen.")
        return text if text else "Website-Inhalt konnte nicht extrahiert werden."

    except requests.exceptions.RequestException as e:
        logger.error(f"Fehler beim Abrufen der URL {url}: {e}")
        return f"Fehler: Die URL konnte nicht abgerufen werden. (Grund: {e.__class__.__name__})"
    except Exception as e:
        logger.error(f"Unerwarteter Fehler beim Parsen der URL {url}: {e}")
        return "Fehler: Ein unerwarteter Fehler ist beim Verarbeiten der Website aufgetreten."
|
||||
def is_valid_wikipedia_article_url(url):
    """Placeholder: Wikipedia article URL validation is not implemented yet; always returns False."""
    return False
|
||||
def alignment_demo(sheet_handler):
    """Placeholder: alignment demo is not implemented yet; does nothing and returns None."""
    return None
|
||||
|
||||
Reference in New Issue
Block a user