diff --git a/BUILDER_APPS_MIGRATION.md b/BUILDER_APPS_MIGRATION.md index 2e5ec088..98f470c8 100644 --- a/BUILDER_APPS_MIGRATION.md +++ b/BUILDER_APPS_MIGRATION.md @@ -42,14 +42,29 @@ Wenn die App unter einem Unterverzeichnis (z.B. `/gtm/`) läuft, findet sie ihre }); ``` -### 1.4 Python Dependencies (OpenAI Version) -Das Projekt nutzt ein geteiltes `helpers.py`, das auf der alten OpenAI Python Library (v0.28.1) basiert. -* **Fehler:** `ModuleNotFoundError: No module named 'openai.error'` -* **Ursache:** `pip install openai` installiert standardmäßig v1.x, was inkompatibel ist. -* **Fix:** In `requirements.txt` zwingend die Version pinnen: +### 1.4 Python Dependencies & Shared Libraries (Critical Pitfall) +Das Projekt nutzt ein zentrales `helpers.py`, das von mehreren Services geteilt wird. Dies führt oft zu `ModuleNotFoundError`, da eine kleine App (wie `gtm-architect`) nicht alle Bibliotheken benötigt, die in `helpers.py` importiert werden (z.B. `gspread`, `pandas`). + +* **Fehler:** `ModuleNotFoundError: No module named 'gspread'` +* **Ursache:** Die `gtm-architect/requirements.txt` enthält `gspread` nicht, aber `helpers.py` versucht es zu importieren. +* **Fix (in `helpers.py`):** Machen Sie "exotische" Importe optional. Dies ist die robusteste Methode, um die Kompatibilität zu wahren, ohne die `requirements.txt` kleiner Apps aufzublähen. + + ```python + # Beispiel in helpers.py + try: + import gspread + GSPREAD_AVAILABLE = True + except ImportError: + GSPREAD_AVAILABLE = False + gspread = None # Wichtig, damit Referenzen nicht fehlschlagen + ``` +* **Fix (in `requirements.txt`):** Stellen Sie sicher, dass die für die App **unmittelbar** benötigten Bibliotheken vorhanden sind. Für `gtm-architect` sind das: ```text - openai==0.28.1 - # weitere deps... 
+ google-generativeai + requests + beautifulsoup4 + ``` + ### 1.5 Python Syntax & F-Strings Multi-Line Prompts können in Docker-Umgebungen zu **sehr hartnäckigen Syntaxfehlern** führen, selbst wenn sie lokal korrekt aussehen. @@ -149,10 +164,14 @@ RUN npm run build FROM python:3.11-slim WORKDIR /app -# Node.js installieren (für Server Bridge) -RUN apt-get update && apt-get install -y --no-install-recommends nodejs npm && rm -rf /var/lib/apt/lists/* +# Node.js installieren (für Server Bridge, optimierte Methode) +RUN apt-get update && \ + apt-get install -y --no-install-recommends curl ca-certificates && \ + curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \ + apt-get install -y --no-install-recommends nodejs && \ + rm -rf /var/lib/apt/lists/* -# Python Deps +# Python Deps (aus der app-spezifischen requirements.txt) COPY mein-app-ordner/requirements.txt . RUN pip install --no-cache-dir -r requirements.txt @@ -228,7 +247,7 @@ Achtung beim Routing. Wenn die App unter `/app/` laufen soll, muss der Trailing - [ ] `express` in `package.json`? - [ ] `vite.config.ts` hat `base: './'`? -- [ ] `requirements.txt` hat `openai==0.28.1`? +- [ ] `requirements.txt` enthält die korrekten (minimalen) Dependencies? - [ ] `server.cjs` hat Timeouts (>600s)? - [ ] `docker-compose.yml` mountet auch `helpers.py` und `config.py`? - [ ] Leere `.db` Datei auf dem Host erstellt? diff --git a/config.py b/config.py index cdc43bb4..85b3b8e2 100644 --- a/config.py +++ b/config.py @@ -9,7 +9,7 @@ und das Spalten-Mapping für das Google Sheet. 
import os import re -import openai + import logging # ============================================================================== diff --git a/gtm-architect/Dockerfile b/gtm-architect/Dockerfile index 92921137..8d5b35ae 100644 --- a/gtm-architect/Dockerfile +++ b/gtm-architect/Dockerfile @@ -1,5 +1,5 @@ # Stage 1: Build the React frontend -FROM node:18-slim AS builder +FROM node:20-slim AS builder WORKDIR /app # Copy package.json from the subdirectory (relative to project root) @@ -15,20 +15,22 @@ COPY gtm-architect/ ./ RUN npm run build # Stage 2: Setup the production environment -FROM python:3.9-slim +FROM python:3.11-slim WORKDIR /app # Install Node.js -RUN apt-get update && apt-get install -y curl && \ - curl -sL https://deb.nodesource.com/setup_18.x | bash - && \ - apt-get install -y nodejs +RUN apt-get update && \ + apt-get install -y --no-install-recommends curl ca-certificates && \ + curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \ + apt-get install -y --no-install-recommends nodejs && \ + rm -rf /var/lib/apt/lists/* # Copy built frontend from builder stage -COPY --from=builder /app/dist ./gtm-architect/dist +COPY --from=builder /app/dist ./dist # Copy backend files and application code (paths relative to project root) -COPY gtm-architect/server.cjs ./gtm-architect/ -COPY gtm-architect/package.json ./gtm-architect/ +COPY gtm-architect/server.cjs . +COPY gtm-architect/package.json . COPY gtm_architect_orchestrator.py . COPY helpers.py . COPY config.py . @@ -38,10 +40,10 @@ COPY gtm_db_manager.py . 
# Install Python and Node.js dependencies RUN pip install --no-cache-dir -r requirements.txt -RUN cd gtm-architect && npm install --production +RUN npm install --omit=dev # Expose the port the server will run on EXPOSE 3005 # Command to run the server -CMD ["node", "gtm-architect/server.cjs"] +CMD ["node", "server.cjs"] diff --git a/gtm-architect/server.cjs b/gtm-architect/server.cjs index fdda311f..44b0d406 100644 --- a/gtm-architect/server.cjs +++ b/gtm-architect/server.cjs @@ -4,6 +4,8 @@ const cors = require('cors'); const path = require('path'); const fs = require('fs'); +const VERSION = "1.0.0"; // Added for debugging and tracking + const app = express(); const port = 3005; @@ -33,7 +35,7 @@ app.post('/api/run', (req, res) => { const payloadString = JSON.stringify(payload); const payloadBase64 = Buffer.from(payloadString).toString('base64'); - const pythonScriptPath = path.join(__dirname, '../gtm_architect_orchestrator.py'); + const pythonScriptPath = path.join(__dirname, 'gtm_architect_orchestrator.py'); const pythonProcess = spawn('python3', [ pythonScriptPath, '--mode', mode, @@ -84,6 +86,9 @@ if (fs.existsSync(staticPath)) { } -app.listen(port, () => { - console.log(`Server listening on port ${port}`); -}); \ No newline at end of file +const server = app.listen(port, () => { + console.log(`Server listening on port ${port} (Version: ${VERSION})`); +}); +server.setTimeout(600000); // 10 minutes +server.keepAliveTimeout = 610000; +server.headersTimeout = 620000; \ No newline at end of file diff --git a/gtm_architect_documentation.md b/gtm_architect_documentation.md index 33440a0b..444dd5be 100644 --- a/gtm_architect_documentation.md +++ b/gtm_architect_documentation.md @@ -32,7 +32,7 @@ graph LR * Funktion: Nimmt HTTP-Requests vom Frontend entgegen, decodiert den Base64-Payload, startet das Python-Skript via `spawn`, und streamt den JSON-Output zurück. 3. **Logic Core (`gtm_architect_orchestrator.py`):** - * Runtime: **Python 3.9+**. 
+ * Runtime: **Python 3.11+**. * Funktion: Enthält die gesamte Business-Logik und Prompt-Engineering für alle 9 Phasen. * Abhängigkeiten: `helpers.py` (Gemini Wrapper), `gtm_db_manager.py` (Datenbank), `config.py` (Keys). * **NEU:** Argument-Parsing über `--mode` und `--payload_base64`. @@ -48,7 +48,7 @@ Der Orchestrator steuert die folgenden Phasen. Jeder Modus erwartet ein spezifis | Phase | Modus | Input | Output | Beschreibung | | :--- | :--- | :--- | :--- | :--- | -| **1** | `phase1` | Rohtext / URL | Features, Constraints | Extrahiert technische Daten & prüft Portfolio-Konflikte. | +| **1** | `phase1` | Rohtext / URL | Features, Constraints | Extrahiert technische Daten aus Text oder durch Scraping einer URL & prüft Portfolio-Konflikte. | | **2** | `phase2` | Phase 1 Result | ICPs, Data Proxies | Identifiziert ideale Kundenprofile (Branchen). | | **3** | `phase3` | Phase 2 Result | Whales (Firmen), Rollen | Identifiziert konkrete Zielkunden und Buying Center Rollen. | | **4** | `phase4` | Phase 1 & 3 | Strategy Matrix | Entwickelt "Angles" und Pain-Points pro Segment. | @@ -64,7 +64,7 @@ Der Orchestrator steuert die folgenden Phasen. Jeder Modus erwartet ein spezifis ### Docker Integration Der Service läuft im Container `gtm-app`. -* **Build:** Multi-Stage: Node.js baut Frontend -> Python Image installiert Node & Python Deps. +* **Build:** Multi-Stage: Ein Node.js 20 Builder-Stage baut das Frontend. Das finale Python 3.11 Image installiert Node.js über das offizielle NodeSource-Repository und kopiert die Artefakte. * **Volume Mounts (Sideloading):** * `/app/gtm-architect`: Frontend-Build & Server-Code. * `/app/gtm_architect_orchestrator.py`: Python-Logik. 
diff --git a/gtm_architect_orchestrator.py b/gtm_architect_orchestrator.py index e27b4e37..5435992a 100644 --- a/gtm_architect_orchestrator.py +++ b/gtm_architect_orchestrator.py @@ -14,7 +14,7 @@ import gtm_db_manager as db_manager sys.path.append(os.path.dirname(os.path.abspath(__file__))) -from helpers import call_gemini_flash +from helpers import call_gemini_flash, scrape_website_details LOG_DIR = "Log_from_docker" if not os.path.exists(LOG_DIR): @@ -46,35 +46,92 @@ def log_and_save(project_id, step_name, data_type, content): except Exception as e: logging.error(f"Failed to save {data_type} to file: {e}") -def get_text_from_url(url): - try: - logging.info(f"Scraping URL: {url}") - headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'} - response = requests.get(url, headers=headers, timeout=15) - response.raise_for_status() - soup = BeautifulSoup(response.content, 'html.parser') - for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside']): - element.decompose() - text = soup.get_text(separator=' ', strip=True) - logging.info(f"Scraping successful. Content length: {len(text)}") - return text[:30000] - except Exception as e: - logging.error(f"Scraping failed for URL {url}: {e}") - return "" - def get_system_instruction(lang): - # Same as before - pass + if lang == 'de': + return """ + Du bist ein internationaler Go-to-Market (GTM) Experte für B2B-Technologie-Unternehmen im Bereich Robotik, Facility Management und IoT. + Deine Aufgabe ist es, aus technischen Spezifikationen und Produktbeschreibungen eine umfassende GTM-Strategie zu entwickeln. + Du arbeitest strukturiert, datengetrieben und präzise. Deine Antworten sind immer klar, professionell und direkt auf den Punkt. + Wenn du JSON ausgeben sollst, gib NUR das JSON-Objekt aus, ohne umschließende Text- oder Code-Formatierungen. 
+ Behalte während des gesamten Prozesses eine konsistente Logik bei. Alle Phasen bauen aufeinander auf. + Führe eine interne Plausibilitätsprüfung durch, bevor du eine Antwort gibst. + Verwende "Wackler Symbiosis" als internes Framework für die Analyse von Produkt-Synergien. + Nutze das "Hybrid Service Logic" Konzept, um zu bewerten, ob ein Produkt mit einer Dienstleistung kombiniert werden muss (z.B. bei hohen Wartungsanforderungen). + """ + else: # Default to English + return """ + You are an international Go-to-Market (GTM) expert for B2B technology companies in robotics, facility management, and IoT. + Your task is to develop a comprehensive GTM strategy from technical specifications and product descriptions. + You are structured, data-driven, and precise. Your answers are always clear, professional, and to the point. + When asked to output JSON, provide ONLY the JSON object without any surrounding text or code formatting. + Maintain consistent logic throughout the process. All phases build on each other. + Perform an internal plausibility check before providing an answer. + Use "Wackler Symbiosis" as an internal framework for analyzing product synergies. + Use the "Hybrid Service Logic" concept to evaluate if a product needs to be combined with a service (e.g., due to high maintenance requirements). + """ # --- ORCHESTRATOR PHASES --- def phase1(payload): - # ... (implementation from before) - pass + product_input = payload.get('productInput', '') + lang = payload.get('lang', 'de') + project_id = payload.get('projectId') + + # Check if input is a URL and scrape it + if product_input.strip().startswith('http'): + logging.info(f"Input detected as URL. Starting scrape for: {product_input}") + analysis_content = scrape_website_details(product_input) + if "Fehler:" in analysis_content: + # If scraping fails, use the URL itself with a note for the AI. + analysis_content = f"Scraping der URL {product_input} ist fehlgeschlagen. 
Analysiere das Produkt basierend auf der URL und deinem allgemeinen Wissen." + logging.warning("Scraping failed. Using URL as fallback content for analysis.") + else: + analysis_content = product_input + logging.info("Input is raw text. Analyzing directly.") + + sys_instr = get_system_instruction(lang) + prompt = f""" + PHASE 1: PRODUCT ANALYSIS & CONSTRAINTS + Input: "{analysis_content}" + Task: 1. Extract technical features. 2. Define hard constraints. 3. Check for internal portfolio conflicts (hypothetical product "Scrubber 5000"). + Output JSON format ONLY: {{"features": [], "constraints": [], "conflictCheck": {{"hasConflict": false, "details": "", "relatedProduct": ""}}, "rawAnalysis": ""}} + """ + log_and_save(project_id, "phase1", "prompt", prompt) + response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True) + log_and_save(project_id, "phase1", "response", response) + + try: + data = json.loads(response) + db_manager.save_gtm_result(project_id, 'phase1_result', json.dumps(data)) + return data + except json.JSONDecodeError: + logging.error(f"Failed to decode JSON from Gemini response in phase1: {response}") + # Return a structured error that the frontend can display + error_response = { + "error": "Die Antwort des KI-Modells war kein gültiges JSON. Das passiert manchmal bei hoher Auslastung. Bitte versuchen Sie es in Kürze erneut.", + "details": response + } + return error_response + def phase2(payload): - # ... (implementation from before) - pass + phase1_data = payload.get('phase1Data', {}) + lang = payload.get('lang', 'de') + project_id = payload.get('projectId') + + sys_instr = get_system_instruction(lang) + prompt = f""" + PHASE 2: IDEAL CUSTOMER PROFILE (ICP) & DATA PROXIES + Product Context: {json.dumps(phase1_data)} + Task: 1. Identify top 3 ICPs (Ideal Customer Profiles/Industries). 2. Define data proxies for identifying these ICPs online. 
+ Output JSON format ONLY: {{"icps": [{{"name": "", "rationale": ""}}], "dataProxies": [{{"target": "", "method": ""}}]}} + """ + log_and_save(project_id, "phase2", "prompt", prompt) + response = call_gemini_flash(prompt, system_instruction=sys_instr, json_mode=True) + log_and_save(project_id, "phase2", "response", response) + data = json.loads(response) + db_manager.save_gtm_result(project_id, 'phase2_result', json.dumps(data)) + return data def phase3(payload): phase2_data = payload.get('phase2Data', {}) @@ -241,8 +298,58 @@ def image(payload): return {"imageBase64": ""} def main(): - # ... (main function from before) - pass + """ + Main entry point of the script. + Parses command-line arguments to determine which phase to run. + """ + parser = argparse.ArgumentParser(description="GTM Architect Orchestrator") + parser.add_argument("--mode", required=True, help="The execution mode (e.g., phase1, phase2).") + parser.add_argument("--payload_base64", required=True, help="The Base64 encoded JSON payload.") + + args = parser.parse_args() + + try: + payload_str = base64.b64decode(args.payload_base64).decode('utf-8') + payload = json.loads(payload_str) + except (json.JSONDecodeError, base64.binascii.Error) as e: + logging.error(f"Failed to decode payload: {e}") + # Print error as JSON to stdout for the server to catch + print(json.dumps({"error": "Invalid payload format.", "details": str(e)})) + sys.exit(1) + + # Function mapping to dynamically call the correct phase + modes = { + "phase1": phase1, + "phase2": phase2, + "phase3": phase3, + "phase4": phase4, + "phase5": phase5, + "phase6": phase6, + "phase7": phase7, + "phase8": phase8, + "phase9": phase9, + "translate": translate, + "image": image, + } + + mode_function = modes.get(args.mode) + + if not mode_function: + logging.error(f"Invalid mode specified: {args.mode}") + print(json.dumps({"error": f"Invalid mode: {args.mode}"})) + sys.exit(1) + + try: + logging.info(f"Executing mode: {args.mode}") + result = 
mode_function(payload) + # Ensure the output is always a JSON string + print(json.dumps(result, ensure_ascii=False)) + logging.info(f"Successfully executed mode: {args.mode}") + + except Exception as e: + logging.error(f"An error occurred during execution of mode '{args.mode}': {e}", exc_info=True) + print(json.dumps({"error": f"An error occurred in {args.mode}.", "details": str(e)})) + sys.exit(1) if __name__ == "__main__": main() diff --git a/helpers.py b/helpers.py index e540f70e..b93527eb 100644 --- a/helpers.py +++ b/helpers.py @@ -29,11 +29,26 @@ from urllib.parse import urlparse, unquote from difflib import SequenceMatcher # Externe Bibliotheken -import gspread -import wikipedia +try: + import gspread + GSPREAD_AVAILABLE = True +except ImportError: + GSPREAD_AVAILABLE = False + gspread = None # Define to avoid runtime errors on reference +try: + import wikipedia + WIKIPEDIA_AVAILABLE = True +except ImportError: + WIKIPEDIA_AVAILABLE = False + wikipedia = None # Define to avoid runtime errors on reference import requests from bs4 import BeautifulSoup -import pandas as pd +try: + import pandas as pd + PANDAS_AVAILABLE = True +except ImportError: + PANDAS_AVAILABLE = False + pd = None # Define to avoid runtime errors on reference # --- KI UMSCHALTUNG: Google Generative AI statt OpenAI --- try: @@ -44,8 +59,20 @@ except ImportError: logging.warning("google-generativeai Bibliothek nicht gefunden. 
KI-Funktionen deaktiviert.") # OpenAI Imports entfernen wir oder machen sie optional, um Verwirrung zu vermeiden -import openai -from openai.error import AuthenticationError, OpenAIError, RateLimitError, APIError, Timeout, InvalidRequestError, ServiceUnavailableError +try: + import openai + from openai.error import AuthenticationError, OpenAIError, RateLimitError, APIError, Timeout, InvalidRequestError, ServiceUnavailableError + OPENAI_AVAILABLE = True +except ImportError: + OPENAI_AVAILABLE = False + # Define dummy exception classes so the code doesn't crash if it tries to catch them + class AuthenticationError(Exception): pass + class OpenAIError(Exception): pass + class RateLimitError(Exception): pass + class APIError(Exception): pass + class Timeout(Exception): pass + class InvalidRequestError(Exception): pass + class ServiceUnavailableError(Exception): pass from config import (Config, BRANCH_MAPPING_FILE, URL_CHECK_MARKER, USER_AGENTS, LOG_DIR) @@ -106,11 +133,17 @@ def retry_on_failure(func): decorator_logger.warning(f"Wiederhole Versuch {attempt + 1}/{max_retries_config} fuer '{effective_func_name}'...") return func(*args, **kwargs) - except (gspread.exceptions.SpreadsheetNotFound, ValueError) as e: # AuthError removed from here as it might be recoverable with new key - decorator_logger.critical(f"❌ ENDGUELTIGER FEHLER bei '{effective_func_name}': Permanentes Problem erkannt. {type(e).__name__} - {str(e)[:150]}...") - raise e - except Exception as e: # Catch all to include Gemini errors + # Define permanent errors that should not be retried + permanent_errors = [ValueError] + if GSPREAD_AVAILABLE: + permanent_errors.append(gspread.exceptions.SpreadsheetNotFound) + + if any(isinstance(e, error_type) for error_type in permanent_errors): + decorator_logger.critical(f"❌ ENDGUELTIGER FEHLER bei '{effective_func_name}': Permanentes Problem erkannt. 
{type(e).__name__} - {str(e)[:150]}...") + raise e + + # Handle retryable errors error_msg = str(e) error_type = type(e).__name__ @@ -380,6 +413,58 @@ def generate_fsm_pitch(company_name, company_short_name, ki_branche, website_sum def serp_website_lookup(company_name): return "k.A." # Placeholder def search_linkedin_contacts(company_name, website, position_query, crm_kurzform, num_results=10): return [] # Placeholder def get_website_raw(url, max_length=30000, verify_cert=False): return "k.A." # Placeholder -def scrape_website_details(url): return "k.A." # Placeholder +def scrape_website_details(url): + """ + Fetches and extracts clean text content from a URL using requests and BeautifulSoup. + - Removes common non-content tags. + - Limits content length to avoid excessive token usage. + """ + logger = logging.getLogger(__name__) + if not url or not isinstance(url, str) or not url.startswith('http'): + logger.warning(f"Ungültige oder fehlende URL für Scraping: {url}") + return "Keine gültige URL angegeben." + + try: + # Use a random user-agent to avoid simple bot detection + headers = {'User-Agent': random.choice(USER_AGENTS)} + response = requests.get(url, headers=headers, timeout=getattr(Config, 'REQUEST_TIMEOUT', 15), verify=False) + response.raise_for_status() + + # Check content type to avoid parsing non-HTML content + if 'text/html' not in response.headers.get('Content-Type', ''): + logger.warning(f"Inhalt der URL {url} ist kein HTML.") + return "Die URL lieferte keinen auswertbaren HTML-Inhalt." + + soup = BeautifulSoup(response.content, 'html.parser') + + # Gezieltes Entfernen von störenden Elementen + for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'a']): + element.decompose() + + # Extrahieren des Textes aus dem Body, um Metadaten etc. 
im Head zu ignorieren
+        body = soup.find('body')
+        if body:
+            text = body.get_text(separator=' ', strip=True)
+        else:
+            text = soup.get_text(separator=' ', strip=True) # Fallback für seltsame HTML-Strukturen
+
+        # Bereinigen von überflüssigen Leerzeichen
+        text = re.sub(r'\s+', ' ', text).strip()
+
+        # Limit the content length to a reasonable size (e.g., 25000 chars)
+        max_len = 25000
+        if len(text) > max_len:
+            logger.info(f"Inhalt von {url} auf {max_len} Zeichen gekürzt (Original: {len(text)}).")
+            text = text[:max_len]
+
+        logger.info(f"Scraping von {url} erfolgreich. Länge: {len(text)} Zeichen.")
+        return text if text else "Website-Inhalt konnte nicht extrahiert werden."
+
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Fehler beim Abrufen der URL {url}: {e}")
+        return f"Fehler: Die URL konnte nicht abgerufen werden. (Grund: {e.__class__.__name__})"
+    except Exception as e:
+        logger.error(f"Unerwarteter Fehler beim Parsen der URL {url}: {e}")
+        return "Fehler: Ein unerwarteter Fehler ist beim Verarbeiten der Website aufgetreten."
 def is_valid_wikipedia_article_url(url): return False # Placeholder
 def alignment_demo(sheet_handler): pass # Placeholder