# -*- coding: utf-8 -*- import argparse import json import sys import logging import os import re from urllib.parse import urljoin, urlparse import requests import time from bs4 import BeautifulSoup from datetime import datetime # Logging Setup log_dir = "Log_from_docker" if not os.path.exists(log_dir): os.makedirs(log_dir) timestamp = datetime.now().strftime("%Y-%m-%d") log_file = os.path.join(log_dir, f"{timestamp}_b2b_marketing.log") logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler(log_file, mode='a', encoding='utf-8'), logging.StreamHandler(sys.stderr) ] ) def save_detailed_log(step_name, content_type, content): """Saves detailed logs (prompts, raw responses) to separate files for inspection.""" try: ts = datetime.now().strftime("%H-%M-%S") filename = f"{ts}_{step_name}_{content_type}.txt" filepath = os.path.join(log_dir, filename) with open(filepath, "w", encoding="utf-8") as f: f.write(content) logging.info(f"Detailed log saved: {filepath}") except Exception as e: logging.error(f"Failed to save detailed log: {e}") LINK_KEYWORDS = ['product', 'solution', 'service', 'industrie', 'branche', 'anwendung', 'produkt', 'loesung', 'dienstleistung', 'portfolio', 'angebot'] # --- PROMPTS (Single line strings with explicit newlines for maximum compatibility) --- SYSTEM_PROMPT_DE = """# Systemrolle Du bist ein **B2B-Marketing-Researcher & Copywriter**. Du analysierst eine Unternehmens-URL, identifizierst Angebot, Zielgruppen, konkrete Zielrollen, deren Painpoints sowie Gains und formulierst darauf basierend eine wertschaetzende, fachkundige Marketingbotschaft. **Antworte nur mit Ergebnissen, keine Gedankengaenge.** Belege jede Aussage mit einer konkreten Seiten-URL der analysierten Domain. Kennzeichne Unsicherheiten explizit. Deine Antwort muss immer in der Zielsprache {{language}} sein. # Arbeitsprinzipien 1. **Quellenpriorisierung:** Produktseite -> Loesungsseite -> Branchen/Industrien/Referenzen -> Unternehmens-/Ueber-uns-Seite -> Blog/News (nur zur Verifikation). 2. **Faktenpolicy:** Nur aus der Domain der bereitgestellten URL ableiten; bei Vermutungen: als *Hypothese* kennzeichnen. 3. **B2B-Ton:** sachkundig, respektvoll, nicht marktschreierisch. 4. **Klarheit & Struktur:** Jede Stufe als Markdown-Tabelle + optionales Kurzresuemee dokumentieren. 5. **Mindestens 4 spezifische Rollen** je Zielgruppe (nicht generisch). 6. **Kompakt & nuetzlich:** Praezise Formulierungen; keine Floskeln.""" STEP_PROMPTS_DE = [ """# Aufgabe Fuehre **Schritt 1 - Angebot verstehen (WAS)** fuer das folgende Unternehmen durch. # Eingaben * **Unternehmens-URL:** `{{company_url}}` * **Zielsprache der Ausgabe:** `{{language}}` * **Region(en) / Maerkte (optional):** `{{regions}}` * **Produkt-/Loesungsfokus (optional):** `{{focus}}` # Anweisungen fuer Schritt 1 * Extrahiere Produkt(e)/Leistung(en), Kernfunktionen, Differenzierung, relevante Werteversprechen. * Erstelle ein kurzes Resuemee (max. 4 Bulletpoints) der wichtigsten Erkenntnisse. * **Output:** Tabelle mit Spalten: *Produkt/Loesung | Beschreibung (1-2 Saetze) | Kernfunktionen | Differenzierung | Primaere Quelle (URL)*. * **Format-Anforderung:** Antworte NUR mit den Ergebnissen fuer diesen einen Schritt. Deine Antwort muss mit der Ueberschrift \"## Schritt 1: Angebot (WAS)\" beginnen und das Kurzresuemee sowie die Markdown-Tabelle enthalten. Gib keine weiteren Erklaerungen ab.""", """# Aufgabe Fuehre nun **Schritt 2 - Zielgruppen (WER - Unternehmen)** durch. # Kontext: Validierte Ergebnisse aus vorherigen Schritten {{previous_steps_data}} # Anweisungen fuer Schritt 2 * Identifiziere B2B-Zielsegmente (Branchen/Unternehmensarten/Groessen/Regionen) basierend auf dem gegebenen Angebot. * **Output:** Tabelle: *Zielbranche/Segment | Typische Unternehmensmerkmale | Region(en) | Relevanzbeleg (URL)*. * **Format-Anforderung:** Antworte NUR mit den Ergebnissen fuer diesen einen Schritt. Deine Antwort muss mit der Ueberschrift \"## Schritt 2: Zielgruppen (Unternehmen)\" beginnen und die Markdown-Tabelle enthalten.""", """# Aufgabe Fuehre nun **Schritt 3 - Zielpersonen/Rollen (WER - Personen)** durch. # Kontext: Validierte Ergebnisse aus vorherigen Schritten {{previous_steps_data}} # Anweisungen fuer Schritt 3 * Fuer jede Zielbranche: mind. 4 **spezifische** Rollen mit Verantwortungsbereich und Kaufbeteiligung (E, I, D, U nach RACI-Logik). Erfinde **keine** Personen; leite Rollen logisch aus Problem-/Prozessbezug ab. * **Output:** Tabelle: *Rolle (praezise) | Verantwortungsbereich | Warum relevant fuer Produkt | Kaufbeteiligung (E/I/D/U) | Quelle/Indiz (URL oder Referenz)*. * **Format-Anforderung:** Antworte NUR mit den Ergebnissen fuer diesen einen Schritt. Deine Antwort muss mit der Ueberschrift \"## Schritt 3: Zielpersonen (Rollen)\" beginnen.""", """# Aufgabe Fuehre nun **Schritt 4 - Painpoints je Rolle (WARUM)** durch. # Kontext: Validierte Ergebnisse aus vorherigen Schritten {{previous_steps_data}} # Anweisungen fuer Schritt 4 * Formuliere pro Rolle 3-5 konkrete Painpoints (Beobachtungen, keine Features). * Tagge jeden Painpoint mit einer Kategorie: **Kosten | Zeit | Risiko | Compliance | Qualitaet | Mitarbeiterbindung.** * Fuege eine **Impact-Schaetzung (EUR, h, %)** als Hypothese hinzu. * **Output:** Tabelle: *Rolle | Painpoint (konkret, messbar/operativ) | Kategorie | Auswirkung (Kosten, Risiko, Zeit) | Impact-Schaetzung (EUR, h, %) | Dringlichkeit (hoch/mittel/niedrig) | Quelle/Indiz (URL)*. * **Format-Anforderung:** Antworte NUR mit den Ergebnissen fuer diesen einen Schritt. Deine Antwort muss mit der Ueberschrift \"## Schritt 4: Painpoints je Rolle\" beginnen.""", """# Aufgabe Fuehre nun **Schritt 5 - Gains & Nutzen je Rolle (WARUM wechseln)** durch. # Kontext: Validierte Ergebnisse aus vorherigen Schritten {{previous_steps_data}} # Anweisungen fuer Schritt 5 * Basierend auf den identifizierten Painpoints, formuliere pro Rolle 2-3 konkrete Gains (Vorteile/Nutzen). * Quantifiziere den Nutzen als Hypothese (z.B. Einsparung in EUR, Zeitgewinn in h, Effizienzsteigerung in %). * **Output:** Tabelle: *Rolle | Gain (konkreter Nutzen) | Quantifizierung (Hypothese in EUR, h, %) | Quelle/Indiz (URL)*. * **Format-Anforderung:** Antworte NUR mit den Ergebnissen fuer diesen einen Schritt. Deine Antwort muss mit der Ueberschrift \"## Schritt 5: Gains & Nutzen je Rolle\" beginnen.""", """# Aufgabe Fuehre nun **Schritt 6 - Marketingbotschaft (WIE sprechen)** durch. # Kontext: Validierte Ergebnisse aus vorherigen Schritten {{previous_steps_data}} # Eingaben fuer diesen Schritt * **Gewuenschte Kanaele fuer die Botschaft:** `{{channels}}` # Anweisungen fuer Schritt 6: Chain-of-Thought-Analyse & Texterstellung **FOKUS:** Erstelle die Botschaften **AUSSCHLIESSLICH** fuer die vorgegebene **Fokus-Branche: {{focus_industry}}**. Ignoriere alle anderen Branchen. Dein Ziel ist es, fuer JEDE Rolle innerhalb dieser EINEN Branche eine spezifische Botschaft zu entwickeln. Fuehre fuer jede **[Rolle]** innerhalb der **[Fokus-Branche: {{focus_industry}}]** den folgenden Denkprozess durch: 1. **Schritt 6.1 (Analyse): Produkt-Rollen-Fit.** * Welches Produkt/welche Loesung aus der "Angebot"-Tabelle (Schritt 1) ist fuer die **[Rolle]** am relevantesten? 2. **Schritt 6.2 (Analyse): Branchen-Use-Case.** * Was sind 1-2 typische Anwendungsfaelle fuer das ausgewaehlte Produkt in der **{{focus_industry}}**? Was macht die **[Rolle]** damit konkret? 3. **Schritt 6.3 (Analyse): Nutzen-Quantifizierung.** * Betrachte die Painpoints (Schritt 4) und Gains (Schritt 5) fuer die **[Rolle]**. * Leite daraus einen konkreten, fuer die **[Rolle]** relevanten KPI ab. 4. **Schritt 6.4 (Synthese): Botschaft formulieren.** * Synthetisiere die Erkenntnisse aus 6.1-6.3 zu einer praegnanten Kernbotschaft (2-3 Saetze) nach der Struktur: **Beobachtung (Problem) -> Niedrigschwellige Loesungsidee -> Produkt-Bruecke -> Quantifizierter Nutzen.** * Erstelle Varianten dieser Botschaft fuer die Kanaele: {{channels}}. # Output Format Erstelle ONLY die finale Markdown-Tabelle. * **Table Columns:** *Fokus-Branche | Rolle | Kernbotschaft (2-3 sentences) | {{channels}}*. * **Requirement:** Your response must start with the heading \"## Schritt 6: Botschaften\" and contain ONLY die vollstaendige Markdown-Tabelle.""", """# Aufgabe Fuehre **Schritt 7 - Customer Journey & Buying Center** durch. # Kontext: Validierte Ergebnisse aus vorherigen Schritten {{previous_steps_data}} # Fokus Beziehe dich auf die **Fokus-Branche: {{focus_industry}}**. # Anweisungen fuer Schritt 7 * Analysiere die Kaufreise ("Journey") vom ersten Trigger bis zum Vertrag. * Identifiziere fuer jede Phase die **Dynamik im Buying Center**: Wer treibt an (Champion), wer bremst oder prueft (Gatekeeper/Evaluator), wer entscheidet (Decider)? * Gehe besonders auf **technische und organisatorische Barrieren** ein (z.B. IT-Sicherheit, Schnittstellen wie Aufzugssteuerung, Prozessintegration). * Definiere **konkrete Assets**, die die jeweilige Rolle in dieser Phase benoetigt, um Einwaende zu entkraeften oder interne Mitstreiter zu ueberzeugen (z.B. "API-Dokumentation fuer Aufzugsbauer", "ROI-Rechner fuer CFO", "Sicherheits-Whitepaper"). * **Output:** Erstelle eine Markdown-Tabelle mit exakt diesen Spalten: *Phase | Rolle | Funktion (Buying Center) | Zentrale Frage / Beduerfnis | Moeglicher Deal-Breaker | Benoetigtes Asset / Format*. * **Format-Anforderung:** Antworte NUR mit den Ergebnissen fuer diesen einen Schritt. Deine Antwort muss mit der Ueberschrift \"## Schritt 7: Customer Journey\" beginnen.""" ] SYSTEM_PROMPT_EN = """# System Role You are a **B2B Marketing Researcher & Copywriter**. You analyze a company URL, identify the offer, target groups, specific target roles, their pain points and gains, and based on this, you formulate an appreciative, expert marketing message. **Answer only with results, no thought processes.** Support every statement with a specific page URL from the analyzed domain. Explicitly mark uncertainties. Your response must always be in the target language {{language}}. # Working Principles 1. **Source Prioritization:** Product Page -> Solutions Page -> Industries/References -> Company/About Us Page -> Blog/News (for verification only). 2. **Fact Policy:** Only derive from the domain of the provided URL; for assumptions, mark them as a *hypothesis*. 3. B2B Tone: Knowledgeable, respectful, not salesy. 4. Clarity & Structure: Document each stage as a Markdown table + an optional short summary. 5. At least 4 specific roles per target group (not generic). 6. Concise & Useful: Precise wording; no clichés.""" STEP_PROMPTS_EN = [ """# Task Perform **Step 1 - Understand the Offer (WHAT)** for the following company. # Inputs * **Company URL:** `{{company_url}}` * **Target Language of Output:** `{{language}}` * **Region(s) / Markets (optional):** `{{regions}}` * **Product/Solution Focus (optional):** `{{focus}}` # Instructions for Step 1 * Extract product(s)/service(s), core features, differentiation, and relevant value propositions. * Create a short summary (max. 4 bullet points) of the key findings. * **Output:** Table with columns: *Product/Solution | Description (1-2 sentences) | Core Features | Differentiation | Primary Source (URL)*. * **Format Requirement:** Respond ONLY with the results for this single step. Your response must start with the heading \"## Step 1: Offer (WHAT)\" and include the short summary and the Markdown table. Do not provide any other explanations.""", """# Task Now perform **Step 2 - Target Groups (WHO - Companies)**. # Context: Validated results from previous steps {{previous_steps_data}} # Instructions for Step 2 * Identify B2B target segments (industries/company types/sizes/regions) based on the given offer. * **Output:** Table: *Target Industry/Segment | Typical Company Characteristics | Region(s) | Proof of Relevance (URL)*. * **Format Requirement:** Respond ONLY with the results for this single step. Your response must start with the heading \"## Step 2: Target Groups (Companies)\" and include the Markdown table.""", """# Task Now perform **Step 3 - Personas/Roles (WHO - People)**. # Context: Validated results from previous steps {{previous_steps_data}} # Instructions for Step 3 * For each target industry: at least 4 **specific** roles with their area of responsibility and involvement in purchasing (E, I, D, U based on RACI logic). Do **not** invent people; logically derive roles from problem/process context. * **Output:** Table: *Role (precise) | Area of Responsibility | Why relevant for the product | Buying Involvement (E/I/D/U) | Source/Indication (URL or reference)*. * **Format Requirement:** Respond ONLY with the results for this single step. Your response must start with the heading \"## Step 3: Personas (Roles)\".""", """# Task Now perform **Step 4 - Pain Points per Role (WHY)**. # Context: Validated results from previous steps {{previous_steps_data}} # Instructions for Step 4 * For each role, formulate 3-5 specific pain points (observations, not features). * Tag each pain point with a category: **Cost | Time | Risk | Compliance | Quality | Employee Retention.** * Add an **Impact Estimate (EUR, h, %)** as a hypothesis. * **Output:** Table: *Role | Pain Point (specific, measurable/operational) | Category | Impact (Cost, Risk, Time) | Impact Estimate (EUR, h, %) | Urgency (high/medium/low) | Source/Indication (URL)*. * **Format Requirement:** Respond ONLY with the results for this single step. Your response must start with the heading \"## Step 4: Pain Points per Role\".""", """# Task Now perform **Step 5 - Gains & Benefits per Role (WHY switch)**. # Context: Validated results from previous steps {{previous_steps_data}} # Instructions for Step 5 * Based on the identified pain points, formulate 2-3 concrete gains (advantages/benefits) for each role. * Quantify the benefit as a hypothesis (e.g., savings in EUR, time gained in h, efficiency increase in %). * **Output:** Table: *Role | Gain (specific benefit) | Quantification (Hypothesis in EUR, h, %) | Source/Indication (URL)*. * **Format Requirement:** Respond ONLY with the results for this single step. Your response must start with the heading \"## Step 5: Gains & Benefits per Role\".""", """# Task Now perform **Step 6 - Marketing Message (HOW to speak)**. # Context: Validated results from previous steps {{previous_steps_data}} # Inputs for this step * **Desired channels for the message:** `{{channels}}` # Instructions for Step 6: Chain-of-Thought Analysis & Copywriting **FOCUS:** Create messages **EXCLUSIVELY** for the provided **Focus Industry: {{focus_industry}}**. Ignore all other industries. Your goal is to create a specific message for EACH role within this ONE industry. For each **[Role]** within the **[Focus Industry: {{focus_industry}}]**, perform the following thought process: 1. **Step 6.1 (Analysis): Product-Role Fit.** * Which product/solution from the "Offer" table (Step 1) is most relevant for the **[Role]**? 2. **Step 6.2 (Analysis): Industry Use Case.** * What are 1-2 typical use cases for the selected product in the **[Focus Industry]**? What does the **[Role]** actually do with it? 3. **Step 6.3 (Analysis): Benefit Quantification.** * Look at the Pain Points (Step 4) and Gains (Step 5) for the **[Role]**. * Derive a concrete KPI relevant to the **[Role]**. 4. **Step 6.4 (Synthesis): Formulate Message.** * Synthesize the findings from 6.1-6.3 into a concise core message (2-3 sentences) following the structure: **Observation (Problem) -> Low-threshold Solution Idea -> Product Bridge -> Quantified Benefit.** * Create variants of this message for the channels: {{channels}}. # Output Format Create ONLY the final Markdown table. * **Table Columns:** *Focus Industry | Role | Core Message (2-3 sentences) | {{channels}}*. * **Requirement:** Your response must start with the heading \"## Step 6: Messages\" and contain ONLY the complete Markdown table.""", """# Task Perform **Step 7 - Customer Journey & Buying Center**. # Context: Validated results from previous steps {{previous_steps_data}} # Focus Refer to the **Focus Industry: {{focus_industry}}**. # Instructions for Step 7 * Analyze the purchase journey ("Journey") from the first trigger to the contract. * Identify the **Buying Center dynamics** for each phase: Who drives it (Champion), who slows it down or audits (Gatekeeper/Evaluator), who decides (Decider)? * Focus specifically on **technical and organizational barriers** (e.g., IT security, interfaces like elevator control, process integration). * Define **concrete assets** that each role needs in this phase to invalidate objections or convince internal stakeholders (e.g., "API documentation for elevator manufacturers", "ROI calculator for CFO", "Security Whitepaper"). * **Output:** Create a Markdown table with exactly these columns: *Phase | Role | Function (Buying Center) | Key Question / Need | Potential Deal-Breaker | Needed Asset / Format*. * **Format Requirement:** Respond ONLY with the results for this single step. Your response must start with the heading \"## Step 7: Customer Journey\".""" ] PROMPTS = { 'de': { 'SYSTEM_PROMPT': SYSTEM_PROMPT_DE, 'STEP_PROMPTS': STEP_PROMPTS_DE, 'STEP_TITLES': { 'offer': 'Schritt 1: Angebot (WAS)', 'targetGroups': 'Schritt 2: Zielgruppen (WER - Unternehmen)', 'personas': 'Schritt 3: Zielpersonen/Rollen (WER - Personen)', 'painPoints': 'Schritt 4: Painpoints je Rolle (WARUM)', 'gains': 'Schritt 5: Gains & Nutzen je Rolle (WARUM wechseln)', 'messages': 'Schritt 6: Marketingbotschaften je Segment & Rolle (WIE sprechen)', 'customerJourney': 'Schritt 7: Customer Journey & Buying Center', }, 'SUMMARY_TITLE': 'Kurzresuemee:', 'SUMMARY_TEXT_FOR_STEP1': [ "Die Angebotsanalyse wurde erfolgreich auf Basis der Website-Inhalte generiert.", "Dies ist der erste Schritt des Prozesses, der vom neuen Python-Backend ausgefuehrt wird." ] }, 'en': { 'SYSTEM_PROMPT': SYSTEM_PROMPT_EN, 'STEP_PROMPTS': STEP_PROMPTS_EN, 'STEP_TITLES': { 'offer': 'Step 1: Offer (WHAT)', 'targetGroups': 'Step 2: Target Groups (WHO - Companies)', 'personas': 'Step 3: Personas/Roles (WHO - People)', 'painPoints': 'Step 4: Pain Points per Role (WHY)', 'gains': 'Step 5: Gains & Benefits per Role (WHY switch)', 'messages': 'Step 6: Marketing Messages per Segment & Role (HOW to speak)', 'customerJourney': 'Step 7: Customer Journey & Buying Center', }, 'SUMMARY_TITLE': 'Summary:', 'SUMMARY_TEXT_FOR_STEP1': [ "The offer analysis has been successfully generated based on website content.", "This is the first step of the process, executed by the new Python backend." ] } } # --- API & SCRAPING HELPERS --- def load_api_key(): api_key = os.getenv("GEMINI_API_KEY") if not api_key: logging.error("GEMINI_API_KEY environment variable not found.") return None return api_key def call_gemini_api(prompt, api_key, retries=3): url = f"https://generativelanguage.googleapis.com/v1/models/gemini-2.5-flash:generateContent?key={api_key}" headers = {'Content-Type': 'application/json'} payload = {"contents": [{"parts": [{"text": prompt}]}]} for attempt in range(retries): try: # Increased timeout to 600s (10 minutes) for complex Step 6 generation response = requests.post(url, headers=headers, json=payload, timeout=600) response.raise_for_status() result = response.json() if 'candidates' in result and result['candidates']: candidate = result['candidates'][0] if 'content' in candidate and 'parts' in candidate['content']: return candidate['content']['parts'][0]['text'] logging.warning(f"Unexpected API response structure: {result}") return "" except requests.exceptions.HTTPError as e: # Retry on server errors (500, 502, 503, 504) if e.response.status_code in [500, 502, 503, 504] and attempt < retries - 1: wait_time = (attempt + 1) * 5 logging.warning(f"API Error {e.response.status_code}. Retrying in {wait_time}s...") time.sleep(wait_time) continue logging.error(f"Error calling Gemini API: {e}") raise except Exception as e: # Retry on connection errors if attempt < retries - 1: wait_time = (attempt + 1) * 5 logging.warning(f"API Connection Error: {e}. Retrying in {wait_time}s...") time.sleep(wait_time) continue logging.error(f"Final Error calling Gemini API: {e}") raise def get_text_from_url(url): try: headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'} response = requests.get(url, headers=headers, timeout=10) response.raise_for_status() soup = BeautifulSoup(response.content, 'lxml') # 1. Remove specific noise tags (including header/footer to avoid navigation links) for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'meta', 'link']): element.decompose() # 2. Clean attributes but keep structure and HREF for tag in soup.find_all(True): # We want to keep the tag (e.g.

, ), but clean attributes current_attrs = dict(tag.attrs) for attr in current_attrs: # Keep 'href' for links so the LLM can extract the source URL if tag.name == 'a' and attr == 'href': continue # Remove everything else (class, id, style, onclick, etc.) del tag[attr] # 3. Return the HTML structure (body only if possible) body = soup.find('body') if body: html_content = str(body) else: html_content = str(soup) # 4. Minimize whitespace to save tokens (remove empty lines) lines = [line.strip() for line in html_content.split('\n') if line.strip()] return "\n".join(lines) except Exception as e: logging.warning(f"Could not fetch or read URL {url}: {e}") return "" def find_relevant_links(base_url): try: headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'} response = requests.get(base_url, headers=headers, timeout=10) response.raise_for_status() soup = BeautifulSoup(response.content, 'lxml') base_netloc = urlparse(base_url).netloc relevant_links = set() for a_tag in soup.find_all('a', href=True): href = a_tag['href'] link_text = a_tag.get_text(strip=True).lower() if any(keyword in href.lower() or keyword in link_text for keyword in LINK_KEYWORDS): abs_url = urljoin(base_url, href) if urlparse(abs_url).netloc == base_netloc: relevant_links.add(abs_url) return list(relevant_links)[:10] except Exception as e: logging.warning(f"Could not scrape base URL {base_url} for links: {e}") return [] def clean_llm_response(text): """Sanitizes the LLM response to remove excessive whitespace and common artifacts.""" if not text: return "" # 1. Replace multiple spaces/newlines with single ones (within a reasonable limit) # But preserve single newlines for markdown structure text = re.sub(r'[ \t]{5,}', ' ', text) # Replace 5+ spaces/tabs with 1 space # 2. Remove non-printable characters (except common ones) text = "".join(ch for ch in text if ch.isprintable() or ch in "\n\r\t") # 3. Fix common table artifacts like empty pipes at the end of lines text = re.sub(r'\|\s*$', '|', text, flags=re.MULTILINE) return text.strip() def parse_markdown_table(markdown_text): # Sanitize input first markdown_text = clean_llm_response(markdown_text) lines = [line.strip() for line in markdown_text.strip().split('\n') if line.strip()] table_lines = [] # 1. Identify all lines that look like table rows (start and end with |) for line in lines: if line.startswith('|') and line.endswith('|'): table_lines.append(line) if not table_lines: return {"headers": [], "rows": []} # 2. Find the separator line (|---|---|...) separator_index = -1 for i, line in enumerate(table_lines): # A separator line usually has at least one dash between pipes and no alphanumeric chars if '---' in line and not re.search(r'[a-zA-Z0-9]', line.replace('|', '').replace('-', '').replace(' ', '').replace(':', '')): separator_index = i break if separator_index == -1: # If no separator found, we might just have a list of rows where the first is the header # but usually LLMs provide the separator. Let's assume the first is header. header_line = table_lines[0] data_start = 1 else: # Separator found. Header is the line before it. if separator_index == 0: return {"headers": [], "rows": []} header_line = table_lines[separator_index - 1] data_start = separator_index + 1 # 3. Extract and clean headers headers = [re.sub(r'\*+([^*]+)\*+', r'\1', h.strip()).strip() for h in header_line.split('|') if h.strip()] if not headers: return {"headers": [], "rows": []} # 4. Extract and clean rows rows = [] for line in table_lines[data_start:]: # Split by | and remove leading/trailing empty elements from the split result raw_cells = line.split('|') # Handle the leading/trailing empty strings caused by the outer pipes cells = [re.sub(r'\*+([^*]+)\*+', r'\1', c.strip()).strip() for c in raw_cells] # If the line starts and ends with |, the first and last elements are empty strings if line.startswith('|'): cells = cells[1:] if line.endswith('|'): cells = cells[:-1] # Pad or truncate row to match header length if len(cells) < len(headers): cells.extend([''] * (len(headers) - len(cells))) elif len(cells) > len(headers): cells = cells[:len(headers)] # Only add row if it's not another separator or empty if any(cells): rows.append(cells) return {"headers": headers, "rows": rows} def format_context_for_prompt(analysis_data, language): context = "" current_prompts = PROMPTS[language] step_titles = current_prompts['STEP_TITLES'] step_keys = ['offer', 'targetGroups', 'personas', 'painPoints', 'gains', 'messages', 'customerJourney'] for i, step_key in enumerate(step_keys): step_data = analysis_data.get(step_key) if step_data: title = step_titles.get(step_key, f"Step {i+1}") context += f"\n\n## {title}\n\n" summary = step_data.get('summary') if summary: context += f"**{current_prompts['SUMMARY_TITLE']}**\n" context += "\n".join([f"* {s}" for s in summary]) + "\n\n" headers, rows = step_data.get('headers', []), step_data.get('rows', []) if headers and rows: context += f"| {' | '.join(headers)} |\n| {' | '.join(['---']*len(headers))} |\n" for row in rows: padded_row = row + [''] * (len(headers) - len(row)) context += f"| {' | '.join(padded_row)} |\n" context += "\n" return context # --- CORE LOGIC --- def start_generation(url, language, regions, focus): logging.info(f"Starting Step 1 for URL: {url} in language: {language}") api_key = load_api_key() if not api_key: raise ValueError("Gemini API key is missing.") # 1. Scraping Strategy: Main Page + Relevant Sub-pages urls_to_scrape = sorted(list(set([url] + find_relevant_links(url)))) grounding_text = "" logging.info(f"Identified {len(urls_to_scrape)} pages to scrape.") for u in urls_to_scrape: logging.info(f" - Scraping: {u}") text_content = get_text_from_url(u) if text_content: # Inject SOURCE_URL marker for the LLM grounding_text += f"SOURCE_URL: {u}\nCONTENT (Simplified HTML):\n{text_content}\n\n{'='*50}\n\n" if not grounding_text.strip(): raise RuntimeError(f"Failed to scrape content from {url}") current_prompts = PROMPTS[language] system_instruction = current_prompts['SYSTEM_PROMPT'].replace('{{language}}', language) # Updated Prompt: Removed length limit and added instruction for SOURCE_URL grounded_offer_prompt = f"{system_instruction}\n\n# TASK\nAnalyze the provided website content to understand the company's offerings. Your response MUST be a Markdown table.\n\n# CONTEXT\n- Website Content: The input provided is **Simplified HTML**. Use the structure (e.g.

-

headers,