# Brancheneinstufung2/b2b_marketing_orchestrator.py
# (last modified 2026-02-18 09:12:04 +00:00; 683 lines, 36 KiB, Python)
# -*- coding: utf-8 -*-
import argparse
import json
import sys
import logging
import os
import re
from urllib.parse import urljoin, urlparse
import requests
import time
from bs4 import BeautifulSoup
from datetime import datetime
# Logging Setup
# All diagnostics go to one appendable log file per calendar day under
# Log_from_docker/ and are mirrored to stderr. stdout is deliberately kept
# clean: main() writes the JSON result there for the calling process.
log_dir = "Log_from_docker"
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
# Date-only timestamp -> one shared file per day across runs (mode='a').
timestamp = datetime.now().strftime("%Y-%m-%d")
log_file = os.path.join(log_dir, f"{timestamp}_b2b_marketing.log")
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file, mode='a', encoding='utf-8'),
        logging.StreamHandler(sys.stderr)
    ]
)
def save_detailed_log(step_name, content_type, content):
    """Persist one full LLM artifact (prompt or raw response) to its own file.

    The file lands in the module-level ``log_dir`` and is named
    HH-MM-SS_<step>_<type>.txt so every exchange can be inspected later.
    Failures are logged but never raised (logging must not break the run).
    """
    try:
        stamp = datetime.now().strftime("%H-%M-%S")
        target = os.path.join(log_dir, f"{stamp}_{step_name}_{content_type}.txt")
        with open(target, "w", encoding="utf-8") as handle:
            handle.write(content)
        logging.info(f"Detailed log saved: {target}")
    except Exception as e:
        logging.error(f"Failed to save detailed log: {e}")
# Keywords (German + English) that mark product/solution/industry sub-pages
# worth scraping; matched against both the href and the anchor text.
LINK_KEYWORDS = ['product', 'solution', 'service', 'industrie', 'branche', 'anwendung', 'produkt', 'loesung', 'dienstleistung', 'portfolio', 'angebot']
# --- PROMPTS (Single line strings with explicit newlines for maximum compatibility) ---
# German prompt set: a shared system role plus one template per pipeline step
# (index 0 = Step 1 ... index 6 = Step 7). Placeholders such as {{language}},
# {{previous_steps_data}}, {{channels}} and {{focus_industry}} are substituted
# at runtime by start_generation()/next_step() via str.replace.
SYSTEM_PROMPT_DE = """# Systemrolle
Du bist ein **B2B-Marketing-Researcher & Copywriter**. Du analysierst eine Unternehmens-URL, identifizierst Angebot, Zielgruppen, konkrete Zielrollen, deren Painpoints sowie Gains und formulierst darauf basierend eine wertschaetzende, fachkundige Marketingbotschaft. **Antworte nur mit Ergebnissen, keine Gedankengaenge.** Belege jede Aussage mit einer konkreten Seiten-URL der analysierten Domain. Kennzeichne Unsicherheiten explizit. Deine Antwort muss immer in der Zielsprache {{language}} sein.
# Arbeitsprinzipien
1. **Quellenpriorisierung:** Produktseite -> Loesungsseite -> Branchen/Industrien/Referenzen -> Unternehmens-/Ueber-uns-Seite -> Blog/News (nur zur Verifikation).
2. **Faktenpolicy:** Nur aus der Domain der bereitgestellten URL ableiten; bei Vermutungen: als *Hypothese* kennzeichnen.
3. **B2B-Ton:** sachkundig, respektvoll, nicht marktschreierisch.
4. **Klarheit & Struktur:** Jede Stufe als Markdown-Tabelle + optionales Kurzresuemee dokumentieren.
5. **Mindestens 4 spezifische Rollen** je Zielgruppe (nicht generisch).
6. **Kompakt & nuetzlich:** Praezise Formulierungen; keine Floskeln."""
# One template per step; each instructs the model to answer ONLY with the
# step's heading + Markdown table so the response can be parsed reliably.
STEP_PROMPTS_DE = [
"""# Aufgabe
Fuehre **Schritt 1 - Angebot verstehen (WAS)** fuer das folgende Unternehmen durch.
# Eingaben
* **Unternehmens-URL:** `{{company_url}}`
* **Zielsprache der Ausgabe:** `{{language}}`
* **Region(en) / Maerkte (optional):** `{{regions}}`
* **Produkt-/Loesungsfokus (optional):** `{{focus}}`
# Anweisungen fuer Schritt 1
* Extrahiere Produkt(e)/Leistung(en), Kernfunktionen, Differenzierung, relevante Werteversprechen.
* Erstelle ein kurzes Resuemee (max. 4 Bulletpoints) der wichtigsten Erkenntnisse.
* **Output:** Tabelle mit Spalten: *Produkt/Loesung | Beschreibung (1-2 Saetze) | Kernfunktionen | Differenzierung | Primaere Quelle (URL)*.
* **Format-Anforderung:** Antworte NUR mit den Ergebnissen fuer diesen einen Schritt. Deine Antwort muss mit der Ueberschrift \"## Schritt 1: Angebot (WAS)\" beginnen und das Kurzresuemee sowie die Markdown-Tabelle enthalten. Gib keine weiteren Erklaerungen ab.""",
"""# Aufgabe
Fuehre nun **Schritt 2 - Zielgruppen (WER - Unternehmen)** durch.
# Kontext: Validierte Ergebnisse aus vorherigen Schritten
{{previous_steps_data}}
# Anweisungen fuer Schritt 2
* Identifiziere B2B-Zielsegmente (Branchen/Unternehmensarten/Groessen/Regionen) basierend auf dem gegebenen Angebot.
* **Output:** Tabelle: *Zielbranche/Segment | Typische Unternehmensmerkmale | Region(en) | Relevanzbeleg (URL)*.
* **Format-Anforderung:** Antworte NUR mit den Ergebnissen fuer diesen einen Schritt. Deine Antwort muss mit der Ueberschrift \"## Schritt 2: Zielgruppen (Unternehmen)\" beginnen und die Markdown-Tabelle enthalten.""",
"""# Aufgabe
Fuehre nun **Schritt 3 - Zielpersonen/Rollen (WER - Personen)** durch.
# Kontext: Validierte Ergebnisse aus vorherigen Schritten
{{previous_steps_data}}
# Anweisungen fuer Schritt 3
* Fuer jede Zielbranche: mind. 4 **spezifische** Rollen mit Verantwortungsbereich und Kaufbeteiligung (E, I, D, U nach RACI-Logik). Erfinde **keine** Personen; leite Rollen logisch aus Problem-/Prozessbezug ab.
* **Output:** Tabelle: *Rolle (praezise) | Verantwortungsbereich | Warum relevant fuer Produkt | Kaufbeteiligung (E/I/D/U) | Quelle/Indiz (URL oder Referenz)*.
* **Format-Anforderung:** Antworte NUR mit den Ergebnissen fuer diesen einen Schritt. Deine Antwort muss mit der Ueberschrift \"## Schritt 3: Zielpersonen (Rollen)\" beginnen.""",
"""# Aufgabe
Fuehre nun **Schritt 4 - Painpoints je Rolle (WARUM)** durch.
# Kontext: Validierte Ergebnisse aus vorherigen Schritten
{{previous_steps_data}}
# Anweisungen fuer Schritt 4
* Formuliere pro Rolle 3-5 konkrete Painpoints (Beobachtungen, keine Features).
* Tagge jeden Painpoint mit einer Kategorie: **Kosten | Zeit | Risiko | Compliance | Qualitaet | Mitarbeiterbindung.**
* Fuege eine **Impact-Schaetzung (EUR, h, %)** als Hypothese hinzu.
* **Output:** Tabelle: *Rolle | Painpoint (konkret, messbar/operativ) | Kategorie | Auswirkung (Kosten, Risiko, Zeit) | Impact-Schaetzung (EUR, h, %) | Dringlichkeit (hoch/mittel/niedrig) | Quelle/Indiz (URL)*.
* **Format-Anforderung:** Antworte NUR mit den Ergebnissen fuer diesen einen Schritt. Deine Antwort muss mit der Ueberschrift \"## Schritt 4: Painpoints je Rolle\" beginnen.""",
"""# Aufgabe
Fuehre nun **Schritt 5 - Gains & Nutzen je Rolle (WARUM wechseln)** durch.
# Kontext: Validierte Ergebnisse aus vorherigen Schritten
{{previous_steps_data}}
# Anweisungen fuer Schritt 5
* Basierend auf den identifizierten Painpoints, formuliere pro Rolle 2-3 konkrete Gains (Vorteile/Nutzen).
* Quantifiziere den Nutzen als Hypothese (z.B. Einsparung in EUR, Zeitgewinn in h, Effizienzsteigerung in %).
* **Output:** Tabelle: *Rolle | Gain (konkreter Nutzen) | Quantifizierung (Hypothese in EUR, h, %) | Quelle/Indiz (URL)*.
* **Format-Anforderung:** Antworte NUR mit den Ergebnissen fuer diesen einen Schritt. Deine Antwort muss mit der Ueberschrift \"## Schritt 5: Gains & Nutzen je Rolle\" beginnen.""",
"""# Aufgabe
Fuehre nun **Schritt 6 - Marketingbotschaft (WIE sprechen)** durch.
# Kontext: Validierte Ergebnisse aus vorherigen Schritten
{{previous_steps_data}}
# Eingaben fuer diesen Schritt
* **Gewuenschte Kanaele fuer die Botschaft:** `{{channels}}`
# Anweisungen fuer Schritt 6: Chain-of-Thought-Analyse & Texterstellung
**FOKUS:** Erstelle die Botschaften **AUSSCHLIESSLICH** fuer die vorgegebene **Fokus-Branche: {{focus_industry}}**.
Ignoriere alle anderen Branchen. Dein Ziel ist es, fuer JEDE Rolle innerhalb dieser EINEN Branche eine spezifische Botschaft zu entwickeln.
Fuehre fuer jede **[Rolle]** innerhalb der **[Fokus-Branche: {{focus_industry}}]** den folgenden Denkprozess durch:
1. **Schritt 6.1 (Analyse): Produkt-Rollen-Fit.**
* Welches Produkt/welche Loesung aus der "Angebot"-Tabelle (Schritt 1) ist fuer die **[Rolle]** am relevantesten?
2. **Schritt 6.2 (Analyse): Branchen-Use-Case.**
* Was sind 1-2 typische Anwendungsfaelle fuer das ausgewaehlte Produkt in der **{{focus_industry}}**? Was macht die **[Rolle]** damit konkret?
3. **Schritt 6.3 (Analyse): Nutzen-Quantifizierung.**
* Betrachte die Painpoints (Schritt 4) und Gains (Schritt 5) fuer die **[Rolle]**.
* Leite daraus einen konkreten, fuer die **[Rolle]** relevanten KPI ab.
4. **Schritt 6.4 (Synthese): Botschaft formulieren.**
* Synthetisiere die Erkenntnisse aus 6.1-6.3 zu einer praegnanten Kernbotschaft (2-3 Saetze) nach der Struktur: **Beobachtung (Problem) -> Niedrigschwellige Loesungsidee -> Produkt-Bruecke -> Quantifizierter Nutzen.**
* Erstelle Varianten dieser Botschaft fuer die Kanaele: {{channels}}.
# Output Format
Erstelle ONLY die finale Markdown-Tabelle.
* **Table Columns:** *Fokus-Branche | Rolle | Kernbotschaft (2-3 sentences) | {{channels}}*.
* **Requirement:** Your response must start with the heading \"## Schritt 6: Botschaften\" and contain ONLY die vollstaendige Markdown-Tabelle.""",
"""# Aufgabe
Fuehre **Schritt 7 - Customer Journey & Buying Center** durch.
# Kontext: Validierte Ergebnisse aus vorherigen Schritten
{{previous_steps_data}}
# Fokus
Beziehe dich auf die **Fokus-Branche: {{focus_industry}}**.
# Anweisungen fuer Schritt 7
* Analysiere die Kaufreise ("Journey") vom ersten Trigger bis zum Vertrag.
* Identifiziere fuer jede Phase die **Dynamik im Buying Center**: Wer treibt an (Champion), wer bremst oder prueft (Gatekeeper/Evaluator), wer entscheidet (Decider)?
* Gehe besonders auf **technische und organisatorische Barrieren** ein (z.B. IT-Sicherheit, Schnittstellen wie Aufzugssteuerung, Prozessintegration).
* Definiere **konkrete Assets**, die die jeweilige Rolle in dieser Phase benoetigt, um Einwaende zu entkraeften oder interne Mitstreiter zu ueberzeugen (z.B. "API-Dokumentation fuer Aufzugsbauer", "ROI-Rechner fuer CFO", "Sicherheits-Whitepaper").
* **Output:** Erstelle eine Markdown-Tabelle mit exakt diesen Spalten: *Phase | Rolle | Funktion (Buying Center) | Zentrale Frage / Beduerfnis | Moeglicher Deal-Breaker | Benoetigtes Asset / Format*.
* **Format-Anforderung:** Antworte NUR mit den Ergebnissen fuer diesen einen Schritt. Deine Antwort muss mit der Ueberschrift \"## Schritt 7: Customer Journey\" beginnen."""
]
# English prompt set — mirrors the German templates one-to-one (same
# placeholders, same step order, same "heading + table only" output contract).
SYSTEM_PROMPT_EN = """# System Role
You are a **B2B Marketing Researcher & Copywriter**. You analyze a company URL, identify the offer, target groups, specific target roles, their pain points and gains, and based on this, you formulate an appreciative, expert marketing message. **Answer only with results, no thought processes.** Support every statement with a specific page URL from the analyzed domain. Explicitly mark uncertainties. Your response must always be in the target language {{language}}.
# Working Principles
1. **Source Prioritization:** Product Page -> Solutions Page -> Industries/References -> Company/About Us Page -> Blog/News (for verification only).
2. **Fact Policy:** Only derive from the domain of the provided URL; for assumptions, mark them as a *hypothesis*.
3. B2B Tone: Knowledgeable, respectful, not salesy.
4. Clarity & Structure: Document each stage as a Markdown table + an optional short summary.
5. At least 4 specific roles per target group (not generic).
6. Concise & Useful: Precise wording; no clichés."""
# One template per step (index 0 = Step 1 ... index 6 = Step 7).
STEP_PROMPTS_EN = [
"""# Task
Perform **Step 1 - Understand the Offer (WHAT)** for the following company.
# Inputs
* **Company URL:** `{{company_url}}`
* **Target Language of Output:** `{{language}}`
* **Region(s) / Markets (optional):** `{{regions}}`
* **Product/Solution Focus (optional):** `{{focus}}`
# Instructions for Step 1
* Extract product(s)/service(s), core features, differentiation, and relevant value propositions.
* Create a short summary (max. 4 bullet points) of the key findings.
* **Output:** Table with columns: *Product/Solution | Description (1-2 sentences) | Core Features | Differentiation | Primary Source (URL)*.
* **Format Requirement:** Respond ONLY with the results for this single step. Your response must start with the heading \"## Step 1: Offer (WHAT)\" and include the short summary and the Markdown table. Do not provide any other explanations.""",
"""# Task
Now perform **Step 2 - Target Groups (WHO - Companies)**.
# Context: Validated results from previous steps
{{previous_steps_data}}
# Instructions for Step 2
* Identify B2B target segments (industries/company types/sizes/regions) based on the given offer.
* **Output:** Table: *Target Industry/Segment | Typical Company Characteristics | Region(s) | Proof of Relevance (URL)*.
* **Format Requirement:** Respond ONLY with the results for this single step. Your response must start with the heading \"## Step 2: Target Groups (Companies)\" and include the Markdown table.""",
"""# Task
Now perform **Step 3 - Personas/Roles (WHO - People)**.
# Context: Validated results from previous steps
{{previous_steps_data}}
# Instructions for Step 3
* For each target industry: at least 4 **specific** roles with their area of responsibility and involvement in purchasing (E, I, D, U based on RACI logic). Do **not** invent people; logically derive roles from problem/process context.
* **Output:** Table: *Role (precise) | Area of Responsibility | Why relevant for the product | Buying Involvement (E/I/D/U) | Source/Indication (URL or reference)*.
* **Format Requirement:** Respond ONLY with the results for this single step. Your response must start with the heading \"## Step 3: Personas (Roles)\".""",
"""# Task
Now perform **Step 4 - Pain Points per Role (WHY)**.
# Context: Validated results from previous steps
{{previous_steps_data}}
# Instructions for Step 4
* For each role, formulate 3-5 specific pain points (observations, not features).
* Tag each pain point with a category: **Cost | Time | Risk | Compliance | Quality | Employee Retention.**
* Add an **Impact Estimate (EUR, h, %)** as a hypothesis.
* **Output:** Table: *Role | Pain Point (specific, measurable/operational) | Category | Impact (Cost, Risk, Time) | Impact Estimate (EUR, h, %) | Urgency (high/medium/low) | Source/Indication (URL)*.
* **Format Requirement:** Respond ONLY with the results for this single step. Your response must start with the heading \"## Step 4: Pain Points per Role\".""",
"""# Task
Now perform **Step 5 - Gains & Benefits per Role (WHY switch)**.
# Context: Validated results from previous steps
{{previous_steps_data}}
# Instructions for Step 5
* Based on the identified pain points, formulate 2-3 concrete gains (advantages/benefits) for each role.
* Quantify the benefit as a hypothesis (e.g., savings in EUR, time gained in h, efficiency increase in %).
* **Output:** Table: *Role | Gain (specific benefit) | Quantification (Hypothesis in EUR, h, %) | Source/Indication (URL)*.
* **Format Requirement:** Respond ONLY with the results for this single step. Your response must start with the heading \"## Step 5: Gains & Benefits per Role\".""",
"""# Task
Now perform **Step 6 - Marketing Message (HOW to speak)**.
# Context: Validated results from previous steps
{{previous_steps_data}}
# Inputs for this step
* **Desired channels for the message:** `{{channels}}`
# Instructions for Step 6: Chain-of-Thought Analysis & Copywriting
**FOCUS:** Create messages **EXCLUSIVELY** for the provided **Focus Industry: {{focus_industry}}**.
Ignore all other industries. Your goal is to create a specific message for EACH role within this ONE industry.
For each **[Role]** within the **[Focus Industry: {{focus_industry}}]**, perform the following thought process:
1. **Step 6.1 (Analysis): Product-Role Fit.**
* Which product/solution from the "Offer" table (Step 1) is most relevant for the **[Role]**?
2. **Step 6.2 (Analysis): Industry Use Case.**
* What are 1-2 typical use cases for the selected product in the **[Focus Industry]**? What does the **[Role]** actually do with it?
3. **Step 6.3 (Analysis): Benefit Quantification.**
* Look at the Pain Points (Step 4) and Gains (Step 5) for the **[Role]**.
* Derive a concrete KPI relevant to the **[Role]**.
4. **Step 6.4 (Synthesis): Formulate Message.**
* Synthesize the findings from 6.1-6.3 into a concise core message (2-3 sentences) following the structure: **Observation (Problem) -> Low-threshold Solution Idea -> Product Bridge -> Quantified Benefit.**
* Create variants of this message for the channels: {{channels}}.
# Output Format
Create ONLY the final Markdown table.
* **Table Columns:** *Focus Industry | Role | Core Message (2-3 sentences) | {{channels}}*.
* **Requirement:** Your response must start with the heading \"## Step 6: Messages\" and contain ONLY the complete Markdown table.""",
"""# Task
Perform **Step 7 - Customer Journey & Buying Center**.
# Context: Validated results from previous steps
{{previous_steps_data}}
# Focus
Refer to the **Focus Industry: {{focus_industry}}**.
# Instructions for Step 7
* Analyze the purchase journey ("Journey") from the first trigger to the contract.
* Identify the **Buying Center dynamics** for each phase: Who drives it (Champion), who slows it down or audits (Gatekeeper/Evaluator), who decides (Decider)?
* Focus specifically on **technical and organizational barriers** (e.g., IT security, interfaces like elevator control, process integration).
* Define **concrete assets** that each role needs in this phase to invalidate objections or convince internal stakeholders (e.g., "API documentation for elevator manufacturers", "ROI calculator for CFO", "Security Whitepaper").
* **Output:** Create a Markdown table with exactly these columns: *Phase | Role | Function (Buying Center) | Key Question / Need | Potential Deal-Breaker | Needed Asset / Format*.
* **Format Requirement:** Respond ONLY with the results for this single step. Your response must start with the heading \"## Step 7: Customer Journey\"."""
]
# Per-language bundle: system prompt, step templates, canonical step titles
# (used both for rendering context and for matching response headings) and
# the static summary text injected for Step 1 results.
PROMPTS = {
    'de': {
        'SYSTEM_PROMPT': SYSTEM_PROMPT_DE,
        'STEP_PROMPTS': STEP_PROMPTS_DE,
        'STEP_TITLES': {
            'offer': 'Schritt 1: Angebot (WAS)',
            'targetGroups': 'Schritt 2: Zielgruppen (WER - Unternehmen)',
            'personas': 'Schritt 3: Zielpersonen/Rollen (WER - Personen)',
            'painPoints': 'Schritt 4: Painpoints je Rolle (WARUM)',
            'gains': 'Schritt 5: Gains & Nutzen je Rolle (WARUM wechseln)',
            'messages': 'Schritt 6: Marketingbotschaften je Segment & Rolle (WIE sprechen)',
            'customerJourney': 'Schritt 7: Customer Journey & Buying Center',
        },
        'SUMMARY_TITLE': 'Kurzresuemee:',
        'SUMMARY_TEXT_FOR_STEP1': [
            "Die Angebotsanalyse wurde erfolgreich auf Basis der Website-Inhalte generiert.",
            "Dies ist der erste Schritt des Prozesses, der vom neuen Python-Backend ausgefuehrt wird."
        ]
    },
    'en': {
        'SYSTEM_PROMPT': SYSTEM_PROMPT_EN,
        'STEP_PROMPTS': STEP_PROMPTS_EN,
        'STEP_TITLES': {
            'offer': 'Step 1: Offer (WHAT)',
            'targetGroups': 'Step 2: Target Groups (WHO - Companies)',
            'personas': 'Step 3: Personas/Roles (WHO - People)',
            'painPoints': 'Step 4: Pain Points per Role (WHY)',
            'gains': 'Step 5: Gains & Benefits per Role (WHY switch)',
            'messages': 'Step 6: Marketing Messages per Segment & Role (HOW to speak)',
            'customerJourney': 'Step 7: Customer Journey & Buying Center',
        },
        'SUMMARY_TITLE': 'Summary:',
        'SUMMARY_TEXT_FOR_STEP1': [
            "The offer analysis has been successfully generated based on website content.",
            "This is the first step of the process, executed by the new Python backend."
        ]
    }
}
# --- API & SCRAPING HELPERS ---
def load_api_key():
    """Read the Gemini API key from ``gemini_api_key.txt`` in the CWD.

    Returns the whitespace-stripped key string, or None when the file is
    missing (an error is logged so the caller can fail with a clear message).
    """
    try:
        # Explicit UTF-8 avoids platform-dependent default encodings when
        # reading the key file.
        with open("gemini_api_key.txt", "r", encoding="utf-8") as f:
            return f.read().strip()
    except FileNotFoundError:
        logging.error("API key file 'gemini_api_key.txt' not found.")
        return None
def call_gemini_api(prompt, api_key, retries=3):
    """Send *prompt* to the Gemini 2.5 Flash generateContent REST endpoint.

    Retries up to *retries* attempts with linear backoff (5s, 10s, ...) on
    server-side HTTP errors (500/502/503/504) and on any connection-level
    failure; other HTTP errors are logged and re-raised immediately.
    Returns the first candidate's text, or "" when the response body has
    an unexpected shape.
    """
    url = f"https://generativelanguage.googleapis.com/v1/models/gemini-2.5-flash:generateContent?key={api_key}"
    headers = {'Content-Type': 'application/json'}
    payload = {"contents": [{"parts": [{"text": prompt}]}]}
    for attempt in range(retries):
        try:
            # Increased timeout to 600s (10 minutes) for complex Step 6 generation
            response = requests.post(url, headers=headers, json=payload, timeout=600)
            response.raise_for_status()
            result = response.json()
            if 'candidates' in result and result['candidates']:
                candidate = result['candidates'][0]
                if 'content' in candidate and 'parts' in candidate['content']:
                    return candidate['content']['parts'][0]['text']
            # HTTP call succeeded but no usable candidate text was returned.
            logging.warning(f"Unexpected API response structure: {result}")
            return ""
        except requests.exceptions.HTTPError as e:
            # Retry on server errors (500, 502, 503, 504)
            if e.response.status_code in [500, 502, 503, 504] and attempt < retries - 1:
                wait_time = (attempt + 1) * 5
                logging.warning(f"API Error {e.response.status_code}. Retrying in {wait_time}s...")
                time.sleep(wait_time)
                continue
            # Client errors (4xx) and exhausted retries propagate to the caller.
            logging.error(f"Error calling Gemini API: {e}")
            raise
        except Exception as e:
            # Retry on connection errors (timeouts, DNS failures, resets, ...)
            if attempt < retries - 1:
                wait_time = (attempt + 1) * 5
                logging.warning(f"API Connection Error: {e}. Retrying in {wait_time}s...")
                time.sleep(wait_time)
                continue
            logging.error(f"Final Error calling Gemini API: {e}")
            raise
def get_text_from_url(url):
    """Fetch *url* and return its <body> as simplified, attribute-stripped HTML.

    Structural tags (headings, lists, links) are kept so the LLM can infer
    product groupings, while scripts, navigation chrome and all attributes
    except <a href> are removed to save tokens. Returns "" on any fetch or
    parse failure (best-effort scraping).
    """
    try:
        # Browser-like UA: some sites block the default python-requests agent.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')
        # 1. Remove specific noise tags (including header/footer to avoid navigation links)
        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'meta', 'link']):
            element.decompose()
        # 2. Clean attributes but keep structure and HREF
        for tag in soup.find_all(True):
            # We want to keep the tag (e.g. <h1>, <a>), but clean attributes.
            # Copy attrs first: deleting while iterating the live dict would break.
            current_attrs = dict(tag.attrs)
            for attr in current_attrs:
                # Keep 'href' for links so the LLM can extract the source URL
                if tag.name == 'a' and attr == 'href':
                    continue
                # Remove everything else (class, id, style, onclick, etc.)
                del tag[attr]
        # 3. Return the HTML structure (body only if possible)
        body = soup.find('body')
        if body:
            html_content = str(body)
        else:
            html_content = str(soup)
        # 4. Minimize whitespace to save tokens (remove empty lines)
        lines = [line.strip() for line in html_content.split('\n') if line.strip()]
        return "\n".join(lines)
    except Exception as e:
        logging.warning(f"Could not fetch or read URL {url}: {e}")
        return ""
def find_relevant_links(base_url):
    """Collect up to 10 same-domain links on *base_url* worth scraping.

    A link qualifies when its href or anchor text contains one of
    LINK_KEYWORDS. Returns absolute URLs; [] on any fetch/parse failure.

    BUGFIX: the previous implementation truncated ``list(set(...))[:10]``,
    so the 10 links kept varied between runs (set iteration order is
    arbitrary). Document order is now preserved with an order-keeping
    dedup, making the selection deterministic.
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(base_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')
        base_netloc = urlparse(base_url).netloc
        relevant_links = []
        seen = set()
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            link_text = a_tag.get_text(strip=True).lower()
            if any(keyword in href.lower() or keyword in link_text for keyword in LINK_KEYWORDS):
                abs_url = urljoin(base_url, href)
                # Stay on the same domain; skip duplicates but keep first-seen order.
                if urlparse(abs_url).netloc == base_netloc and abs_url not in seen:
                    seen.add(abs_url)
                    relevant_links.append(abs_url)
        return relevant_links[:10]
    except Exception as e:
        logging.warning(f"Could not scrape base URL {base_url} for links: {e}")
        return []
def clean_llm_response(text):
    """Normalize an LLM reply before Markdown parsing.

    Collapses long runs of spaces/tabs, strips non-printable characters
    (keeping \\n, \\r, \\t) and trims whitespace trailing a line-ending
    table pipe. Falsy input yields "".
    """
    if not text:
        return ""
    # Collapse runs of five or more spaces/tabs into a single space.
    collapsed = re.sub(r'[ \t]{5,}', ' ', text)
    # Drop control/non-printable characters, keeping ordinary whitespace.
    kept = []
    for ch in collapsed:
        if ch.isprintable() or ch in "\n\r\t":
            kept.append(ch)
    sanitized = "".join(kept)
    # Tidy trailing whitespace after a pipe at end-of-line (table artifact).
    tidied = re.sub(r'\|\s*$', '|', sanitized, flags=re.MULTILINE)
    return tidied.strip()
def parse_markdown_table(markdown_text):
    """Extract the first Markdown table from an LLM response.

    Returns {"headers": [...], "rows": [[...], ...]} with **bold** markers
    stripped from every cell and each row padded/truncated to the header
    width. Empty lists are returned when no table can be found.
    """
    bold_re = re.compile(r'\*+([^*]+)\*+')

    def _strip_cell(cell):
        # Trim whitespace and remove Markdown emphasis markers.
        return bold_re.sub(r'\1', cell.strip()).strip()

    def _is_separator(line):
        # A separator row (e.g. |---|:---:|) contains '---' and nothing
        # alphanumeric once |, -, space and : are removed.
        residue = line.replace('|', '').replace('-', '').replace(' ', '').replace(':', '')
        return '---' in line and not re.search(r'[a-zA-Z0-9]', residue)

    # Sanitize input first
    markdown_text = clean_llm_response(markdown_text)
    lines = [line.strip() for line in markdown_text.strip().split('\n') if line.strip()]
    # 1. Keep only lines that look like table rows (start and end with |).
    table_lines = [line for line in lines if line.startswith('|') and line.endswith('|')]
    if not table_lines:
        return {"headers": [], "rows": []}
    # 2. Find the separator line (|---|---|...) to locate the header row.
    separator_index = -1
    for i, line in enumerate(table_lines):
        if _is_separator(line):
            separator_index = i
            break
    if separator_index == -1:
        # No separator found; LLMs usually emit one, so assume the first
        # table line is the header.
        header_line = table_lines[0]
        data_start = 1
    else:
        # Separator found. Header is the line before it.
        if separator_index == 0:
            return {"headers": [], "rows": []}
        header_line = table_lines[separator_index - 1]
        data_start = separator_index + 1
    # 3. Extract and clean headers (empty cells from outer pipes are dropped).
    headers = [_strip_cell(h) for h in header_line.split('|') if h.strip()]
    if not headers:
        return {"headers": [], "rows": []}
    # 4. Extract and clean data rows.
    rows = []
    for line in table_lines[data_start:]:
        # BUGFIX: stray separator lines inside the data region previously
        # passed the `any(cells)` filter and were emitted as rows of '---';
        # skip them explicitly.
        if _is_separator(line):
            continue
        cells = [_strip_cell(c) for c in line.split('|')]
        # The outer pipes produce empty first/last elements from split().
        if line.startswith('|'):
            cells = cells[1:]
        if line.endswith('|'):
            cells = cells[:-1]
        # Pad or truncate the row to match the header length.
        if len(cells) < len(headers):
            cells.extend([''] * (len(headers) - len(cells)))
        elif len(cells) > len(headers):
            cells = cells[:len(headers)]
        if any(cells):
            rows.append(cells)
    return {"headers": headers, "rows": rows}
def format_context_for_prompt(analysis_data, language):
    """Render every completed step as Markdown for {{previous_steps_data}}.

    Walks the pipeline steps in canonical order and, for each one present
    in *analysis_data*, emits its localized title, optional summary bullets
    and the result table.
    """
    lang_prompts = PROMPTS[language]
    titles = lang_prompts['STEP_TITLES']
    ordered_keys = ['offer', 'targetGroups', 'personas', 'painPoints', 'gains', 'messages', 'customerJourney']
    parts = []
    for idx, key in enumerate(ordered_keys):
        data = analysis_data.get(key)
        if not data:
            continue
        parts.append(f"\n\n## {titles.get(key, f'Step {idx + 1}')}\n\n")
        summary = data.get('summary')
        if summary:
            bullets = "\n".join(f"* {item}" for item in summary)
            parts.append(f"**{lang_prompts['SUMMARY_TITLE']}**\n{bullets}\n\n")
        headers = data.get('headers', [])
        rows = data.get('rows', [])
        if headers and rows:
            parts.append(f"| {' | '.join(headers)} |\n")
            parts.append(f"| {' | '.join(['---'] * len(headers))} |\n")
            for row in rows:
                # Pad short rows so every line has the full column count.
                filled = row + [''] * (len(headers) - len(row))
                parts.append(f"| {' | '.join(filled)} |\n")
            parts.append("\n")
    return "".join(parts)
# --- CORE LOGIC ---
def start_generation(url, language, regions, focus):
    """Run pipeline Step 1: scrape the company site and derive the offer table.

    Scrapes the main URL plus up to 10 keyword-matched sub-pages, feeds the
    simplified HTML (tagged with SOURCE_URL markers) to Gemini and parses
    the returned Markdown table.

    Returns a dict with the CLI inputs under '_initial_inputs' (consumed
    later by next_step) and the parsed table under 'offer'. Raises
    ValueError when the API key is missing and RuntimeError when nothing
    could be scraped.
    """
    logging.info(f"Starting Step 1 for URL: {url} in language: {language}")
    api_key = load_api_key()
    if not api_key: raise ValueError("Gemini API key is missing.")
    # 1. Scraping Strategy: Main Page + Relevant Sub-pages
    urls_to_scrape = sorted(list(set([url] + find_relevant_links(url))))
    grounding_text = ""
    logging.info(f"Identified {len(urls_to_scrape)} pages to scrape.")
    for u in urls_to_scrape:
        logging.info(f" - Scraping: {u}")
        text_content = get_text_from_url(u)
        if text_content:
            # Inject SOURCE_URL marker for the LLM
            grounding_text += f"SOURCE_URL: {u}\nCONTENT (Simplified HTML):\n{text_content}\n\n{'='*50}\n\n"
    if not grounding_text.strip(): raise RuntimeError(f"Failed to scrape content from {url}")
    current_prompts = PROMPTS[language]
    system_instruction = current_prompts['SYSTEM_PROMPT'].replace('{{language}}', language)
    # Updated Prompt: Removed length limit and added instruction for SOURCE_URL
    grounded_offer_prompt = f"{system_instruction}\n\n# TASK\nAnalyze the provided website content to understand the company's offerings. Your response MUST be a Markdown table.\n\n# CONTEXT\n- Website Content: The input provided is **Simplified HTML**. Use the structure (e.g. <h1>-<h6> headers, <ul> lists, <div> groupings) to identify distinct products or services.\n- **Content Data (with SOURCE_URL markers):** \n```html\n{grounding_text}\n```\n- Target Language: {language}\n- Company URL: {url}\n- Focus: {focus or 'N/A'}\n- Regions: {regions or 'N/A'}\n\n# INSTRUCTIONS\n1. Identify products/services by looking for recurring HTML patterns (e.g. a Header followed by a description and a 'Learn More' link).\n2. Create Markdown table: Produkt/Loesung | Beschreibung (1-2 Saetze) | Kernfunktionen | Differenzierung | Primaere Quelle (URL)\n3. **IMPORTANT:** For the 'Primaere Quelle (URL)' column, look for the `<a href='...'>` tag NEAREST to the product description. Combine it with the `SOURCE_URL` if it's a relative link. Do not just link the homepage.\n4. Response must be ONLY the table starting with '## {current_prompts['STEP_TITLES']['offer']}'."
    # Log the full prompt (Input)
    save_detailed_log("step1_offer", "prompt", grounded_offer_prompt)
    response_text = call_gemini_api(grounded_offer_prompt, api_key)
    # Log the full response (Output)
    save_detailed_log("step1_offer", "response", response_text)
    step1_title = current_prompts['STEP_TITLES']['offer']
    # Flexible header matching: accept any "## Schritt 1"/"## Step 1" heading
    # and parse only the text after it.
    title_match = re.search(rf'^##\s*(?:Schritt|Step)\s*1.*$', response_text, re.IGNORECASE | re.MULTILINE)
    content = response_text[title_match.end():].strip() if title_match else response_text
    table_data = parse_markdown_table(content)
    return {
        "_initial_inputs": {"url": url, "language": language, "regions": regions, "focus": focus},
        "offer": {"summary": current_prompts['SUMMARY_TEXT_FOR_STEP1'], "headers": table_data['headers'], "rows": table_data['rows']}
    }
def next_step(language, context_file, generation_step, channels, focus_industry=None):
    """Run pipeline step *generation_step* (2-7) using prior results as context.

    Loads the accumulated analysis JSON from *context_file*, renders it as
    Markdown context, fills the step's prompt template and parses the
    model's reply. Returns {step_key: {"summary": [...], "headers": [...],
    "rows": [...]}} for merging into the analysis JSON by the caller.
    """
    logging.info(f"Starting Step {generation_step} in language: {language}")
    api_key = load_api_key()
    if not api_key: raise ValueError("Gemini API key is missing.")
    with open(context_file, 'r', encoding='utf-8') as f: analysis_data = json.load(f)
    current_prompts = PROMPTS[language]
    system_instruction = current_prompts['SYSTEM_PROMPT'].replace('{{language}}', language)
    # Templates are 0-indexed; generation_step is 1-indexed.
    step_prompt_template = current_prompts['STEP_PROMPTS'][generation_step - 1]
    previous_steps_markdown = format_context_for_prompt(analysis_data, language)
    prompt = step_prompt_template.replace('{{previous_steps_data}}', previous_steps_markdown)
    if '{{channels}}' in prompt: prompt = prompt.replace('{{channels}}', channels or 'LinkedIn, Kaltmail, Landingpage')
    # Inject focus industry if provided (for Step 6)
    if '{{focus_industry}}' in prompt:
        prompt = prompt.replace('{{focus_industry}}', focus_industry or 'Primary Industry')
    initial_inputs = analysis_data.get('_initial_inputs', {})
    # Helper to safely get string values even if they are None/null in the JSON
    def get_safe(key):
        val = initial_inputs.get(key)
        return str(val) if val is not None else 'N/A'
    prompt = prompt.replace('{{company_url}}', get_safe('url')).replace('{{language}}', language).replace('{{regions}}', get_safe('regions')).replace('{{focus}}', get_safe('focus'))
    full_prompt = f"{system_instruction}\n\n{prompt}"
    # Log the full prompt
    save_detailed_log(f"step{generation_step}", "prompt", full_prompt)
    response_text = call_gemini_api(full_prompt, api_key)
    # Log the full response
    save_detailed_log(f"step{generation_step}", "response", response_text)
    step_key = ['offer', 'targetGroups', 'personas', 'painPoints', 'gains', 'messages', 'customerJourney'][generation_step - 1]
    expected_title = current_prompts['STEP_TITLES'][step_key]
    # Flexible header matching: accept any "## Schritt N"/"## Step N" heading
    # and parse only the text after it.
    title_match = re.search(rf'^##\s*(?:Schritt|Step)\s*{generation_step}.*$', response_text, re.IGNORECASE | re.MULTILINE)
    content = response_text[title_match.end():].strip() if title_match else response_text
    table_data = parse_markdown_table(content)
    # Fixed Regex: Added proper grouping (?: ... ) around the stop tokens
    summary_match = re.search(r'\*\*(?:Kurzresuemee|Summary).*?:\*\*\s*([\s\S]*?)(?:\| ---|## (?:Schritt|Step))', response_text, re.IGNORECASE)
    # Strip bullet/number prefixes from each non-empty summary line.
    summary = [re.sub(r'^\*\s*|^-\s*|^\d+\.\s*', '', s.strip()) for s in summary_match[1].split('\n') if s.strip()] if summary_match else []
    return {step_key: {"summary": summary, "headers": table_data['headers'], "rows": table_data['rows']}}
def enrich_product(product_name, product_url, language):
    """Generate one offer-table row for a single product via Gemini.

    Optionally grounds the prompt with (truncated) content scraped from
    *product_url*. Returns a list of cell strings. *language* is accepted
    for CLI symmetry but is not currently used by the prompt.

    Raises ValueError when the API key is missing.
    """
    logging.info(f"Enriching product: {product_name} ({product_url})")
    api_key = load_api_key()
    if not api_key: raise ValueError("Gemini API key is missing.")
    grounding_text = ""
    if product_url:
        grounding_text = get_text_from_url(product_url)
    prompt_text = f"""
# ANWEISUNG
Du bist ein B2B-Marketing-Analyst. Deine Aufgabe ist es, die Daten für EIN Produkt zu generieren.
Basierend auf dem Produktnamen und (optional) dem Inhalt der Produkt-URL, fülle die Spalten einer Markdown-Tabelle aus.
Die Ausgabe MUSS eine einzelne, kommaseparierte Zeile sein, die in eine Tabelle passt. KEINE Header, KEIN Markdown, nur die Werte.
# PRODUKT
- Name: "{product_name}"
- URL-Inhalt: "{grounding_text[:3000]}..."
# SPALTEN
Produkt/Lösung | Beschreibung (1-2 Sätze) | Kernfunktionen | Differenzierung | Primäre Quelle (URL)
# BEISPIEL-OUTPUT
Saugroboter NR1500,Ein professioneller Saugroboter für große Büroflächen.,Autonome Navigation;Intelligente Kartierung;Lange Akkulaufzeit,Fokus auf B2B-Markt;Datenschutzkonform,https://nexaro.com/products/nr1500
# DEINE AUFGABE
Erstelle jetzt die kommaseparierte Zeile für das Produkt "{product_name}".
"""
    response_text = call_gemini_api(prompt_text, api_key)
    line = response_text.strip().strip('`').strip()
    # BUGFIX: the reply was split naively on ',' although product
    # descriptions routinely contain commas and the column spec above uses
    # pipes. If the model answered as a (Markdown) pipe-separated row,
    # split on '|' instead; otherwise fall back to the comma format.
    if '|' in line:
        return [cell.strip() for cell in line.strip('|').split('|')]
    return [cell.strip() for cell in line.split(',')]
def main():
    """CLI entry point.

    Dispatches --mode to one of the pipeline functions and writes the
    result as JSON to stdout (logging goes to file/stderr). On any failure
    an {"error": ...} object is written and the process exits with code 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', required=True)
    parser.add_argument('--url')
    parser.add_argument('--focus')
    parser.add_argument('--regions')
    parser.add_argument('--context_file')
    parser.add_argument('--generation_step', type=int)
    parser.add_argument('--channels')
    parser.add_argument('--language', required=True)
    parser.add_argument('--focus_industry')  # Focus industry for steps 6/7
    parser.add_argument('--product_name')
    parser.add_argument('--product_url')
    args = parser.parse_args()
    try:
        if args.mode == 'start_generation':
            result = start_generation(args.url, args.language, args.regions, args.focus)
        elif args.mode == 'next_step':
            result = next_step(args.language, args.context_file, args.generation_step, args.channels, args.focus_industry)
        elif args.mode == 'enrich_product':
            result = enrich_product(args.product_name, args.product_url, args.language)
        else:
            # BUGFIX: an unknown mode previously left `result` unbound and
            # surfaced as a confusing NameError; fail explicitly instead.
            raise ValueError(f"Unknown mode: {args.mode}")
        sys.stdout.write(json.dumps(result, ensure_ascii=False))
    except Exception as e:
        logging.error(f"Error: {e}", exc_info=True)
        sys.stdout.write(json.dumps({"error": str(e)}, ensure_ascii=False))
        sys.exit(1)
if __name__ == '__main__':
    main()