feat(gtm-architect): Finalize migration and implement web scraping

- Refactors the gtm-architect Dockerfile for a flat, more efficient build process.
- Implements robust web scraping via BeautifulSoup in helpers.py for URL analysis in phase1.
- Makes shared library imports (gspread, pandas, etc.) in helpers.py optional to prevent ModuleNotFoundErrors in microservices.
- Implements the main execution logic in the orchestrator to handle command-line arguments.
- Updates documentation to reflect the new architecture, scraping feature, and dependency handling.
This commit is contained in:
2026-01-03 08:43:53 +00:00
parent 2663d85ae7
commit 302a211239
7 changed files with 282 additions and 64 deletions

View File

@@ -42,14 +42,29 @@ Wenn die App unter einem Unterverzeichnis (z.B. `/gtm/`) läuft, findet sie ihre
});
```
### 1.4 Python Dependencies (OpenAI Version)
Das Projekt nutzt ein geteiltes `helpers.py`, das auf der alten OpenAI Python Library (v0.28.1) basiert.
* **Fehler:** `ModuleNotFoundError: No module named 'openai.error'`
* **Ursache:** `pip install openai` installiert standardmäßig v1.x, was inkompatibel ist.
* **Fix:** In `requirements.txt` zwingend die Version pinnen:
### 1.4 Python Dependencies & Shared Libraries (Critical Pitfall)
Das Projekt nutzt ein zentrales `helpers.py`, das von mehreren Services geteilt wird. Dies führt oft zu `ModuleNotFoundError`, da eine kleine App (wie `gtm-architect`) nicht alle Bibliotheken benötigt, die in `helpers.py` importiert werden (z.B. `gspread`, `pandas`).
* **Fehler:** `ModuleNotFoundError: No module named 'gspread'`
* **Ursache:** Die `gtm-architect/requirements.txt` enthält `gspread` nicht, aber `helpers.py` versucht es zu importieren.
* **Fix (in `helpers.py`):** Machen Sie "exotische" Importe optional. Dies ist die robusteste Methode, um die Kompatibilität zu wahren, ohne die `requirements.txt` kleiner Apps aufzublähen.
```python
# Beispiel in helpers.py
try:
import gspread
GSPREAD_AVAILABLE = True
except ImportError:
GSPREAD_AVAILABLE = False
gspread = None # Wichtig, damit Referenzen nicht fehlschlagen
```
* **Fix (in `requirements.txt`):** Stellen Sie sicher, dass die für die App **unmittelbar** benötigten Bibliotheken vorhanden sind. Für `gtm-architect` sind das:
```text
openai==0.28.1
# weitere deps...
google-generativeai
requests
beautifulsoup4
```
### 1.5 Python Syntax & F-Strings
Multi-Line Prompts können in Docker-Umgebungen zu **sehr hartnäckigen Syntaxfehlern** führen, selbst wenn sie lokal korrekt aussehen.
@@ -149,10 +164,14 @@ RUN npm run build
FROM python:3.11-slim
WORKDIR /app
# Node.js installieren (für Server Bridge)
RUN apt-get update && apt-get install -y --no-install-recommends nodejs npm && rm -rf /var/lib/apt/lists/*
# Node.js installieren (für Server Bridge, optimierte Methode)
RUN apt-get update && \
apt-get install -y --no-install-recommends curl ca-certificates && \
curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
apt-get install -y --no-install-recommends nodejs && \
rm -rf /var/lib/apt/lists/*
# Python Deps
# Python Deps (aus der app-spezifischen requirements.txt)
COPY mein-app-ordner/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
@@ -228,7 +247,7 @@ Achtung beim Routing. Wenn die App unter `/app/` laufen soll, muss der Trailing
- [ ] `express` in `package.json`?
- [ ] `vite.config.ts` hat `base: './'`?
- [ ] `requirements.txt` hat `openai==0.28.1`?
- [ ] `requirements.txt` enthält die korrekten (minimalen) Dependencies?
- [ ] `server.cjs` hat Timeouts (>600s)?
- [ ] `docker-compose.yml` mountet auch `helpers.py` und `config.py`?
- [ ] Leere `.db` Datei auf dem Host erstellt?

View File

@@ -9,7 +9,7 @@ und das Spalten-Mapping für das Google Sheet.
import os
import re
import openai
import logging
# ==============================================================================

View File

@@ -1,5 +1,5 @@
# Stage 1: Build the React frontend
FROM node:18-slim AS builder
FROM node:20-slim AS builder
WORKDIR /app
# Copy package.json from the subdirectory (relative to project root)
@@ -15,20 +15,22 @@ COPY gtm-architect/ ./
RUN npm run build
# Stage 2: Setup the production environment
FROM python:3.9-slim
FROM python:3.11-slim
WORKDIR /app
# Install Node.js
RUN apt-get update && apt-get install -y curl && \
curl -sL https://deb.nodesource.com/setup_18.x | bash - && \
apt-get install -y nodejs
RUN apt-get update && \
apt-get install -y --no-install-recommends curl ca-certificates && \
curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
apt-get install -y --no-install-recommends nodejs && \
rm -rf /var/lib/apt/lists/*
# Copy built frontend from builder stage
COPY --from=builder /app/dist ./gtm-architect/dist
COPY --from=builder /app/dist ./dist
# Copy backend files and application code (paths relative to project root)
COPY gtm-architect/server.cjs ./gtm-architect/
COPY gtm-architect/package.json ./gtm-architect/
COPY gtm-architect/server.cjs .
COPY gtm-architect/package.json .
COPY gtm_architect_orchestrator.py .
COPY helpers.py .
COPY config.py .
@@ -38,10 +40,10 @@ COPY gtm_db_manager.py .
# Install Python and Node.js dependencies
RUN pip install --no-cache-dir -r requirements.txt
RUN cd gtm-architect && npm install --production
RUN npm install --omit=dev
# Expose the port the server will run on
EXPOSE 3005
# Command to run the server
CMD ["node", "gtm-architect/server.cjs"]
CMD ["node", "server.cjs"]

View File

@@ -4,6 +4,8 @@ const cors = require('cors');
const path = require('path');
const fs = require('fs');
const VERSION = "1.0.0"; // Added for debugging and tracking
const app = express();
const port = 3005;
@@ -33,7 +35,7 @@ app.post('/api/run', (req, res) => {
const payloadString = JSON.stringify(payload);
const payloadBase64 = Buffer.from(payloadString).toString('base64');
const pythonScriptPath = path.join(__dirname, '../gtm_architect_orchestrator.py');
const pythonScriptPath = path.join(__dirname, 'gtm_architect_orchestrator.py');
const pythonProcess = spawn('python3', [
pythonScriptPath,
'--mode', mode,
@@ -84,6 +86,9 @@ if (fs.existsSync(staticPath)) {
}
app.listen(port, () => {
console.log(`Server listening on port ${port}`);
const server = app.listen(port, () => {
console.log(`Server listening on port ${port} (Version: ${VERSION})`);
});
server.setTimeout(600000); // 10 minutes
server.keepAliveTimeout = 610000;
server.headersTimeout = 620000;

View File

@@ -32,7 +32,7 @@ graph LR
* Funktion: Nimmt HTTP-Requests vom Frontend entgegen, decodiert den Base64-Payload, startet das Python-Skript via `spawn`, und streamt den JSON-Output zurück.
3. **Logic Core (`gtm_architect_orchestrator.py`):**
* Runtime: **Python 3.9+**.
* Runtime: **Python 3.11+**.
* Funktion: Enthält die gesamte Business-Logik und Prompt-Engineering für alle 9 Phasen.
* Abhängigkeiten: `helpers.py` (Gemini Wrapper), `gtm_db_manager.py` (Datenbank), `config.py` (Keys).
* **NEU:** Argument-Parsing über `--mode` und `--payload_base64`.
@@ -48,7 +48,7 @@ Der Orchestrator steuert die folgenden Phasen. Jeder Modus erwartet ein spezifis
| Phase | Modus | Input | Output | Beschreibung |
| :--- | :--- | :--- | :--- | :--- |
| **1** | `phase1` | Rohtext / URL | Features, Constraints | Extrahiert technische Daten & prüft Portfolio-Konflikte. |
| **1** | `phase1` | Rohtext / URL | Features, Constraints | Extrahiert technische Daten aus Text oder durch Scraping einer URL & prüft Portfolio-Konflikte. |
| **2** | `phase2` | Phase 1 Result | ICPs, Data Proxies | Identifiziert ideale Kundenprofile (Branchen). |
| **3** | `phase3` | Phase 2 Result | Whales (Firmen), Rollen | Identifiziert konkrete Zielkunden und Buying Center Rollen. |
| **4** | `phase4` | Phase 1 & 3 | Strategy Matrix | Entwickelt "Angles" und Pain-Points pro Segment. |
@@ -64,7 +64,7 @@ Der Orchestrator steuert die folgenden Phasen. Jeder Modus erwartet ein spezifis
### Docker Integration
Der Service läuft im Container `gtm-app`.
* **Build:** Multi-Stage: Node.js baut Frontend -> Python Image installiert Node & Python Deps.
* **Build:** Multi-Stage: Ein Node.js 20 Builder-Stage baut das Frontend. Das finale Python 3.11 Image installiert Node.js über das offizielle NodeSource-Repository und kopiert die Artefakte.
* **Volume Mounts (Sideloading):**
* `/app/gtm-architect`: Frontend-Build & Server-Code.
* `/app/gtm_architect_orchestrator.py`: Python-Logik.

View File

@@ -14,7 +14,7 @@ import gtm_db_manager as db_manager
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from helpers import call_gemini_flash
from helpers import call_gemini_flash, scrape_website_details
LOG_DIR = "Log_from_docker"
if not os.path.exists(LOG_DIR):
@@ -46,35 +46,92 @@ def log_and_save(project_id, step_name, data_type, content):
except Exception as e:
logging.error(f"Failed to save {data_type} to file: {e}")
def get_text_from_url(url):
    """Download *url* and return its visible text, capped at 30,000 chars.

    Best-effort scraper: any failure is logged and yields an empty string,
    never an exception.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        logging.info(f"Scraping URL: {url}")
        resp = requests.get(url, headers=browser_headers, timeout=15)
        resp.raise_for_status()
        page = BeautifulSoup(resp.content, 'html.parser')
        # Drop boilerplate / non-content markup before extracting text.
        for tag in page(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside']):
            tag.decompose()
        extracted = page.get_text(separator=' ', strip=True)
        logging.info(f"Scraping successful. Content length: {len(extracted)}")
        return extracted[:30000]
    except Exception as exc:
        logging.error(f"Scraping failed for URL {url}: {exc}")
        return ""
def get_system_instruction(lang):
    """Return the GTM-expert system prompt for the given language code.

    'de' selects the German prompt; every other value falls back to English.
    """
    german_prompt = """
Du bist ein internationaler Go-to-Market (GTM) Experte für B2B-Technologie-Unternehmen im Bereich Robotik, Facility Management und IoT.
Deine Aufgabe ist es, aus technischen Spezifikationen und Produktbeschreibungen eine umfassende GTM-Strategie zu entwickeln.
Du arbeitest strukturiert, datengetrieben und präzise. Deine Antworten sind immer klar, professionell und direkt auf den Punkt.
Wenn du JSON ausgeben sollst, gib NUR das JSON-Objekt aus, ohne umschließende Text- oder Code-Formatierungen.
Behalte während des gesamten Prozesses eine konsistente Logik bei. Alle Phasen bauen aufeinander auf.
Führe eine interne Plausibilitätsprüfung durch, bevor du eine Antwort gibst.
Verwende "Wackler Symbiosis" als internes Framework für die Analyse von Produkt-Synergien.
Nutze das "Hybrid Service Logic" Konzept, um zu bewerten, ob ein Produkt mit einer Dienstleistung kombiniert werden muss (z.B. bei hohen Wartungsanforderungen).
"""
    english_prompt = """
You are an international Go-to-Market (GTM) expert for B2B technology companies in robotics, facility management, and IoT.
Your task is to develop a comprehensive GTM strategy from technical specifications and product descriptions.
You are structured, data-driven, and precise. Your answers are always clear, professional, and to the point.
When asked to output JSON, provide ONLY the JSON object without any surrounding text or code formatting.
Maintain consistent logic throughout the process. All phases build on each other.
Perform an internal plausibility check before providing an answer.
Use "Wackler Symbiosis" as an internal framework for analyzing product synergies.
Use the "Hybrid Service Logic" concept to evaluate if a product needs to be combined with a service (e.g., due to high maintenance requirements).
"""
    return german_prompt if lang == 'de' else english_prompt
# --- ORCHESTRATOR PHASES ---
def phase1(payload):
    """Phase 1: product analysis & constraints.

    ``payload['productInput']`` may be raw text or a URL; URLs are scraped
    via ``scrape_website_details`` first. Returns the parsed model result,
    or a structured error dict when the model reply is not valid JSON.
    """
    raw_input = payload.get('productInput', '')
    lang = payload.get('lang', 'de')
    project_id = payload.get('projectId')

    # Decide whether the input is a URL (scrape it) or plain text (use as-is).
    if not raw_input.strip().startswith('http'):
        source_text = raw_input
        logging.info("Input is raw text. Analyzing directly.")
    else:
        logging.info(f"Input detected as URL. Starting scrape for: {raw_input}")
        source_text = scrape_website_details(raw_input)
        if "Fehler:" in source_text:
            # Scrape failed: let the model reason from the URL alone.
            source_text = f"Scraping der URL {raw_input} ist fehlgeschlagen. Analysiere das Produkt basierend auf der URL und deinem allgemeinen Wissen."
            logging.warning("Scraping failed. Using URL as fallback content for analysis.")

    prompt = f"""
PHASE 1: PRODUCT ANALYSIS & CONSTRAINTS
Input: "{source_text}"
Task: 1. Extract technical features. 2. Define hard constraints. 3. Check for internal portfolio conflicts (hypothetical product "Scrubber 5000").
Output JSON format ONLY: {{"features": [], "constraints": [], "conflictCheck": {{"hasConflict": false, "details": "", "relatedProduct": ""}}, "rawAnalysis": ""}}
"""
    log_and_save(project_id, "phase1", "prompt", prompt)
    reply = call_gemini_flash(prompt, system_instruction=get_system_instruction(lang), json_mode=True)
    log_and_save(project_id, "phase1", "response", reply)
    try:
        result = json.loads(reply)
    except json.JSONDecodeError:
        logging.error(f"Failed to decode JSON from Gemini response in phase1: {reply}")
        # Structured error the frontend can display directly.
        return {
            "error": "Die Antwort des KI-Modells war kein gültiges JSON. Das passiert manchmal bei hoher Auslastung. Bitte versuchen Sie es in Kürze erneut.",
            "details": reply,
        }
    db_manager.save_gtm_result(project_id, 'phase1_result', json.dumps(result))
    return result
def phase2(payload):
    """Phase 2: derive ideal customer profiles (ICPs) and data proxies.

    Builds on the phase-1 result carried in ``payload['phase1Data']``.
    Persists and returns the parsed model output.
    """
    context = payload.get('phase1Data', {})
    project_id = payload.get('projectId')
    instruction = get_system_instruction(payload.get('lang', 'de'))
    prompt = f"""
PHASE 2: IDEAL CUSTOMER PROFILE (ICP) & DATA PROXIES
Product Context: {json.dumps(context)}
Task: 1. Identify top 3 ICPs (Ideal Customer Profiles/Industries). 2. Define data proxies for identifying these ICPs online.
Output JSON format ONLY: {{"icps": [{{"name": "", "rationale": ""}}], "dataProxies": [{{"target": "", "method": ""}}]}}
"""
    log_and_save(project_id, "phase2", "prompt", prompt)
    answer = call_gemini_flash(prompt, system_instruction=instruction, json_mode=True)
    log_and_save(project_id, "phase2", "response", answer)
    parsed = json.loads(answer)
    db_manager.save_gtm_result(project_id, 'phase2_result', json.dumps(parsed))
    return parsed
def phase3(payload):
phase2_data = payload.get('phase2Data', {})
@@ -241,8 +298,58 @@ def image(payload):
return {"imageBase64": ""}
def main():
    """Main entry point of the script.

    Parses ``--mode`` and ``--payload_base64``, decodes the payload and
    dispatches to the matching phase function. The result (or a structured
    error object) is printed as JSON on stdout, which the Node server
    bridge reads; failures also exit with a non-zero status code.
    """
    parser = argparse.ArgumentParser(description="GTM Architect Orchestrator")
    parser.add_argument("--mode", required=True, help="The execution mode (e.g., phase1, phase2).")
    parser.add_argument("--payload_base64", required=True, help="The Base64 encoded JSON payload.")
    args = parser.parse_args()
    try:
        payload_str = base64.b64decode(args.payload_base64).decode('utf-8')
        payload = json.loads(payload_str)
    except ValueError as e:
        # ValueError covers all three decode failure modes: binascii.Error
        # (malformed Base64), UnicodeDecodeError (non-UTF-8 bytes — previously
        # uncaught and crashed with a traceback) and json.JSONDecodeError
        # (invalid JSON). Also avoids the non-public base64.binascii attribute.
        logging.error(f"Failed to decode payload: {e}")
        # Print error as JSON to stdout for the server to catch
        print(json.dumps({"error": "Invalid payload format.", "details": str(e)}))
        sys.exit(1)
    # Dispatch table mapping --mode values to phase implementations.
    modes = {
        "phase1": phase1,
        "phase2": phase2,
        "phase3": phase3,
        "phase4": phase4,
        "phase5": phase5,
        "phase6": phase6,
        "phase7": phase7,
        "phase8": phase8,
        "phase9": phase9,
        "translate": translate,
        "image": image,
    }
    mode_function = modes.get(args.mode)
    if not mode_function:
        logging.error(f"Invalid mode specified: {args.mode}")
        print(json.dumps({"error": f"Invalid mode: {args.mode}"}))
        sys.exit(1)
    try:
        logging.info(f"Executing mode: {args.mode}")
        result = mode_function(payload)
        # Ensure the output is always a JSON string (non-ASCII kept readable).
        print(json.dumps(result, ensure_ascii=False))
        logging.info(f"Successfully executed mode: {args.mode}")
    except Exception as e:
        logging.error(f"An error occurred during execution of mode '{args.mode}': {e}", exc_info=True)
        print(json.dumps({"error": f"An error occurred in {args.mode}.", "details": str(e)}))
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -29,11 +29,26 @@ from urllib.parse import urlparse, unquote
from difflib import SequenceMatcher
# Externe Bibliotheken
try:
import gspread
GSPREAD_AVAILABLE = True
except ImportError:
GSPREAD_AVAILABLE = False
gspread = None # Define to avoid runtime errors on reference
try:
import wikipedia
WIKIPEDIA_AVAILABLE = True
except ImportError:
WIKIPEDIA_AVAILABLE = False
wikipedia = None # Define to avoid runtime errors on reference
import requests
from bs4 import BeautifulSoup
try:
import pandas as pd
PANDAS_AVAILABLE = True
except ImportError:
PANDAS_AVAILABLE = False
pd = None # Define to avoid runtime errors on reference
# --- KI UMSCHALTUNG: Google Generative AI statt OpenAI ---
try:
@@ -44,8 +59,20 @@ except ImportError:
logging.warning("google-generativeai Bibliothek nicht gefunden. KI-Funktionen deaktiviert.")
# OpenAI Imports entfernen wir oder machen sie optional, um Verwirrung zu vermeiden
try:
import openai
from openai.error import AuthenticationError, OpenAIError, RateLimitError, APIError, Timeout, InvalidRequestError, ServiceUnavailableError
OPENAI_AVAILABLE = True
except ImportError:
OPENAI_AVAILABLE = False
# Define dummy exception classes so the code doesn't crash if it tries to catch them
class AuthenticationError(Exception): pass
class OpenAIError(Exception): pass
class RateLimitError(Exception): pass
class APIError(Exception): pass
class Timeout(Exception): pass
class InvalidRequestError(Exception): pass
class ServiceUnavailableError(Exception): pass
from config import (Config, BRANCH_MAPPING_FILE, URL_CHECK_MARKER, USER_AGENTS, LOG_DIR)
@@ -106,11 +133,17 @@ def retry_on_failure(func):
decorator_logger.warning(f"Wiederhole Versuch {attempt + 1}/{max_retries_config} fuer '{effective_func_name}'...")
return func(*args, **kwargs)
except (gspread.exceptions.SpreadsheetNotFound, ValueError) as e: # AuthError removed from here as it might be recoverable with new key
except Exception as e: # Catch all to include Gemini errors
# Define permanent errors that should not be retried
permanent_errors = [ValueError]
if GSPREAD_AVAILABLE:
permanent_errors.append(gspread.exceptions.SpreadsheetNotFound)
if any(isinstance(e, error_type) for error_type in permanent_errors):
decorator_logger.critical(f"❌ ENDGUELTIGER FEHLER bei '{effective_func_name}': Permanentes Problem erkannt. {type(e).__name__} - {str(e)[:150]}...")
raise e
except Exception as e: # Catch all to include Gemini errors
# Handle retryable errors
error_msg = str(e)
error_type = type(e).__name__
@@ -380,6 +413,58 @@ def generate_fsm_pitch(company_name, company_short_name, ki_branche, website_sum
def serp_website_lookup(company_name):
    """Placeholder: SERP-based website lookup is not implemented yet."""
    return "k.A."
def search_linkedin_contacts(company_name, website, position_query, crm_kurzform, num_results=10):
    """Placeholder: LinkedIn contact search is not implemented yet; always empty."""
    return []
def get_website_raw(url, max_length=30000, verify_cert=False):
    """Placeholder: raw website download is not implemented yet."""
    return "k.A."
def scrape_website_details(url):
    """
    Fetches and extracts clean text content from a URL using requests and BeautifulSoup.

    - Validates the URL and the response content type before parsing.
    - Removes common non-content tags (scripts, navigation, forms, links, ...).
    - Limits content length to avoid excessive token usage.

    Returns the extracted text on success. Every failure returns a German
    message prefixed with "Fehler:" — callers (e.g. phase1 in the
    orchestrator) detect failure via that substring, so the prefix is part
    of the contract.
    """
    logger = logging.getLogger(__name__)
    # BUGFIX: accept URLs with surrounding whitespace and give the invalid-URL
    # message the "Fehler:" prefix so callers recognize it as a failure.
    if not isinstance(url, str) or not url.strip().startswith('http'):
        logger.warning(f"Ungültige oder fehlende URL für Scraping: {url}")
        return "Fehler: Keine gültige URL angegeben."
    url = url.strip()
    try:
        # Use a random user-agent to avoid simple bot detection
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        # NOTE(security): verify=False deliberately disables TLS certificate
        # checks (tolerates broken/self-signed certs on scraped sites) —
        # confirm this is acceptable before pointing it at sensitive targets.
        response = requests.get(url, headers=headers, timeout=getattr(Config, 'REQUEST_TIMEOUT', 15), verify=False)
        response.raise_for_status()
        # Check content type to avoid parsing non-HTML content
        if 'text/html' not in response.headers.get('Content-Type', ''):
            logger.warning(f"Inhalt der URL {url} ist kein HTML.")
            return "Die URL lieferte keinen auswertbaren HTML-Inhalt."
        soup = BeautifulSoup(response.content, 'html.parser')
        # Remove boilerplate elements that carry no product information.
        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']):
            element.decompose()
        # Extract text from <body> so <head> metadata is ignored.
        body = soup.find('body')
        if body:
            text = body.get_text(separator=' ', strip=True)
        else:
            text = soup.get_text(separator=' ', strip=True)  # Fallback for unusual HTML structures
        # Collapse runs of whitespace into single spaces.
        text = re.sub(r'\s+', ' ', text).strip()
        # Limit the content length to a reasonable size to cap token usage.
        max_len = 25000
        if len(text) > max_len:
            # BUGFIX: log the actual limit instead of the stale hard-coded
            # "42,914" that no longer matched max_len.
            logger.info(f"Inhalt von {url} auf {max_len} Zeichen gekürzt (Original: {len(text)}).")
            text = text[:max_len]
        logger.info(f"Scraping von {url} erfolgreich. Länge: {len(text)} Zeichen.")
        return text if text else "Website-Inhalt konnte nicht extrahiert werden."
    except requests.exceptions.RequestException as e:
        logger.error(f"Fehler beim Abrufen der URL {url}: {e}")
        return f"Fehler: Die URL konnte nicht abgerufen werden. (Grund: {e.__class__.__name__})"
    except Exception as e:
        logger.error(f"Unerwarteter Fehler beim Parsen der URL {url}: {e}")
        return "Fehler: Ein unerwarteter Fehler ist beim Verarbeiten der Website aufgetreten."
def is_valid_wikipedia_article_url(url):
    """Placeholder: Wikipedia URL validation is not implemented yet; always False."""
    return False
def alignment_demo(sheet_handler):
    """Placeholder: branch-alignment demo is not implemented yet; does nothing."""
    return None