feat(gtm-architect): Finalize migration and implement web scraping
- Refactors the gtm-architect Dockerfile for a flat, more efficient build process.
- Implements robust web scraping via BeautifulSoup in helpers.py for URL analysis in phase1.
- Makes shared library imports (gspread, pandas, etc.) in helpers.py optional to prevent ModuleNotFoundErrors in microservices.
- Implements the main execution logic in the orchestrator to handle command-line arguments.
- Updates documentation to reflect the new architecture, scraping feature, and dependency handling.
This commit is contained in:
105
helpers.py
105
helpers.py
@@ -29,11 +29,26 @@ from urllib.parse import urlparse, unquote
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
# Externe Bibliotheken
|
||||
import gspread
|
||||
import wikipedia
|
||||
try:
|
||||
import gspread
|
||||
GSPREAD_AVAILABLE = True
|
||||
except ImportError:
|
||||
GSPREAD_AVAILABLE = False
|
||||
gspread = None # Define to avoid runtime errors on reference
|
||||
try:
|
||||
import wikipedia
|
||||
WIKIPEDIA_AVAILABLE = True
|
||||
except ImportError:
|
||||
WIKIPEDIA_AVAILABLE = False
|
||||
wikipedia = None # Define to avoid runtime errors on reference
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import pandas as pd
|
||||
try:
|
||||
import pandas as pd
|
||||
PANDAS_AVAILABLE = True
|
||||
except ImportError:
|
||||
PANDAS_AVAILABLE = False
|
||||
pd = None # Define to avoid runtime errors on reference
|
||||
|
||||
# --- KI UMSCHALTUNG: Google Generative AI statt OpenAI ---
|
||||
try:
|
||||
@@ -44,8 +59,20 @@ except ImportError:
|
||||
logging.warning("google-generativeai Bibliothek nicht gefunden. KI-Funktionen deaktiviert.")
|
||||
|
||||
# OpenAI Imports entfernen wir oder machen sie optional, um Verwirrung zu vermeiden
|
||||
import openai
|
||||
from openai.error import AuthenticationError, OpenAIError, RateLimitError, APIError, Timeout, InvalidRequestError, ServiceUnavailableError
|
||||
try:
|
||||
import openai
|
||||
from openai.error import AuthenticationError, OpenAIError, RateLimitError, APIError, Timeout, InvalidRequestError, ServiceUnavailableError
|
||||
OPENAI_AVAILABLE = True
|
||||
except ImportError:
|
||||
OPENAI_AVAILABLE = False
|
||||
# Define dummy exception classes so the code doesn't crash if it tries to catch them
|
||||
class AuthenticationError(Exception): pass
|
||||
class OpenAIError(Exception): pass
|
||||
class RateLimitError(Exception): pass
|
||||
class APIError(Exception): pass
|
||||
class Timeout(Exception): pass
|
||||
class InvalidRequestError(Exception): pass
|
||||
class ServiceUnavailableError(Exception): pass
|
||||
|
||||
from config import (Config, BRANCH_MAPPING_FILE, URL_CHECK_MARKER, USER_AGENTS, LOG_DIR)
|
||||
|
||||
@@ -106,11 +133,17 @@ def retry_on_failure(func):
|
||||
decorator_logger.warning(f"Wiederhole Versuch {attempt + 1}/{max_retries_config} fuer '{effective_func_name}'...")
|
||||
return func(*args, **kwargs)
|
||||
|
||||
except (gspread.exceptions.SpreadsheetNotFound, ValueError) as e: # AuthError removed from here as it might be recoverable with new key
|
||||
decorator_logger.critical(f"❌ ENDGUELTIGER FEHLER bei '{effective_func_name}': Permanentes Problem erkannt. {type(e).__name__} - {str(e)[:150]}...")
|
||||
raise e
|
||||
|
||||
except Exception as e: # Catch all to include Gemini errors
|
||||
# Define permanent errors that should not be retried
|
||||
permanent_errors = [ValueError]
|
||||
if GSPREAD_AVAILABLE:
|
||||
permanent_errors.append(gspread.exceptions.SpreadsheetNotFound)
|
||||
|
||||
if any(isinstance(e, error_type) for error_type in permanent_errors):
|
||||
decorator_logger.critical(f"❌ ENDGUELTIGER FEHLER bei '{effective_func_name}': Permanentes Problem erkannt. {type(e).__name__} - {str(e)[:150]}...")
|
||||
raise e
|
||||
|
||||
# Handle retryable errors
|
||||
error_msg = str(e)
|
||||
error_type = type(e).__name__
|
||||
|
||||
@@ -380,6 +413,58 @@ def generate_fsm_pitch(company_name, company_short_name, ki_branche, website_sum
|
||||
def serp_website_lookup(company_name):
    """Placeholder: SERP-based website lookup is not implemented yet; always returns "k.A."."""
    return "k.A."
|
||||
def search_linkedin_contacts(company_name, website, position_query, crm_kurzform, num_results=10):
    """Placeholder: LinkedIn contact search is not implemented yet; always returns no hits."""
    return []
|
||||
def get_website_raw(url, max_length=30000, verify_cert=False):
    """Placeholder: raw website fetch is not implemented yet; always returns "k.A."."""
    return "k.A."
|
||||
def scrape_website_details(url): return "k.A." # Placeholder
|
||||
def scrape_website_details(url):
    """
    Fetch a URL and extract clean, readable text content via requests + BeautifulSoup.

    - Strips common boilerplate tags (scripts, styles, navigation, forms, links, ...).
    - Extracts text from <body> when present, falling back to the whole document.
    - Truncates the result to ``max_len`` characters to limit token usage.

    Args:
        url: Absolute http(s) URL to scrape.

    Returns:
        The extracted plain text, or a (German) status/error message string when
        the URL is invalid, non-HTML, unreachable, or yields no text. Never raises.
    """
    logger = logging.getLogger(__name__)
    # Guard clause: reject missing, non-string, or non-http(s) URLs up front.
    if not url or not isinstance(url, str) or not url.startswith('http'):
        logger.warning(f"Ungültige oder fehlende URL für Scraping: {url}")
        return "Keine gültige URL angegeben."

    try:
        # Use a random user-agent to avoid simple bot detection
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        # NOTE(review): verify=False disables TLS certificate validation.
        # Acceptable only for best-effort scraping of arbitrary sites —
        # confirm this is intentional and never used for sensitive requests.
        response = requests.get(url, headers=headers, timeout=getattr(Config, 'REQUEST_TIMEOUT', 15), verify=False)
        response.raise_for_status()

        # Check content type to avoid parsing non-HTML content (PDFs, images, ...)
        if 'text/html' not in response.headers.get('Content-Type', ''):
            logger.warning(f"Inhalt der URL {url} ist kein HTML.")
            return "Die URL lieferte keinen auswertbaren HTML-Inhalt."

        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove boilerplate / non-content elements before text extraction
        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']):
            element.decompose()

        # Prefer the <body> so <head> metadata is ignored
        body = soup.find('body')
        if body:
            text = body.get_text(separator=' ', strip=True)
        else:
            text = soup.get_text(separator=' ', strip=True)  # Fallback for unusual HTML structures

        # Collapse runs of whitespace into single spaces
        text = re.sub(r'\s+', ' ', text).strip()

        # Limit the content length to a reasonable size to cap token usage
        max_len = 25000
        if len(text) > max_len:
            # BUGFIX: the log message previously hard-coded "14,446 Zeichen"
            # although the actual truncation limit is max_len (25000);
            # report the real length the text is cut to.
            logger.info(f"Inhalt von {url} auf {max_len} Zeichen gekürzt (Original: {len(text)}).")
            text = text[:max_len]

        logger.info(f"Scraping von {url} erfolgreich. Länge: {len(text)} Zeichen.")
        return text if text else "Website-Inhalt konnte nicht extrahiert werden."

    except requests.exceptions.RequestException as e:
        logger.error(f"Fehler beim Abrufen der URL {url}: {e}")
        return f"Fehler: Die URL konnte nicht abgerufen werden. (Grund: {e.__class__.__name__})"
    except Exception as e:
        logger.error(f"Unerwarteter Fehler beim Parsen der URL {url}: {e}")
        return "Fehler: Ein unerwarteter Fehler ist beim Verarbeiten der Website aufgetreten."
|
||||
def is_valid_wikipedia_article_url(url):
    """Placeholder: Wikipedia article URL validation is not implemented yet; always returns False."""
    return False
|
||||
def alignment_demo(sheet_handler):
    """Placeholder: alignment demo is not implemented yet; does nothing and returns None."""
    return None
|
||||
|
||||
Reference in New Issue
Block a user