# Brancheneinstufung2/b2b_marketing_orchestrator.py
# (last modified 2026-02-18 09:12:04 +00:00; 683 lines, 36 KiB, Python)
# -*- coding: utf-8 -*-
import argparse
import json
import sys
import logging
import os
import re
from urllib.parse import urljoin, urlparse
import requests
import time
from bs4 import BeautifulSoup
from datetime import datetime
# Logging Setup
# All diagnostics go to one appendable log file per calendar day under
# Log_from_docker/ and are mirrored to stderr. stdout is deliberately kept
# clean: main() writes the JSON result there for the calling process.
log_dir = "Log_from_docker"
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
# Date-only timestamp -> one shared file per day across runs (mode='a').
timestamp = datetime.now().strftime("%Y-%m-%d")
log_file = os.path.join(log_dir, f"{timestamp}_b2b_marketing.log")
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file, mode='a', encoding='utf-8'),
        logging.StreamHandler(sys.stderr)
    ]
)
def save_detailed_log(step_name, content_type, content):
    """Persist one full LLM artifact (prompt or raw response) to its own file.

    The file lands in the module-level ``log_dir`` and is named
    HH-MM-SS_<step>_<type>.txt so every exchange can be inspected later.
    Failures are logged but never raised (logging must not break the run).
    """
    try:
        stamp = datetime.now().strftime("%H-%M-%S")
        target = os.path.join(log_dir, f"{stamp}_{step_name}_{content_type}.txt")
        with open(target, "w", encoding="utf-8") as handle:
            handle.write(content)
        logging.info(f"Detailed log saved: {target}")
    except Exception as e:
        logging.error(f"Failed to save detailed log: {e}")
# Keywords (German + English) that mark product/solution/industry sub-pages
# worth scraping; matched against both the href and the anchor text.
LINK_KEYWORDS = ['product', 'solution', 'service', 'industrie', 'branche', 'anwendung', 'produkt', 'loesung', 'dienstleistung', 'portfolio', 'angebot']
# --- PROMPTS (Single line strings with explicit newlines for maximum compatibility) ---
# German prompt set: a shared system role plus one template per pipeline step
# (index 0 = Step 1 ... index 6 = Step 7). Placeholders such as {{language}},
# {{previous_steps_data}}, {{channels}} and {{focus_industry}} are substituted
# at runtime by start_generation()/next_step() via str.replace.
SYSTEM_PROMPT_DE = """# Systemrolle
Du bist ein **B2B-Marketing-Researcher & Copywriter**. Du analysierst eine Unternehmens-URL, identifizierst Angebot, Zielgruppen, konkrete Zielrollen, deren Painpoints sowie Gains und formulierst darauf basierend eine wertschaetzende, fachkundige Marketingbotschaft. **Antworte nur mit Ergebnissen, keine Gedankengaenge.** Belege jede Aussage mit einer konkreten Seiten-URL der analysierten Domain. Kennzeichne Unsicherheiten explizit. Deine Antwort muss immer in der Zielsprache {{language}} sein.
# Arbeitsprinzipien
1. **Quellenpriorisierung:** Produktseite -> Loesungsseite -> Branchen/Industrien/Referenzen -> Unternehmens-/Ueber-uns-Seite -> Blog/News (nur zur Verifikation).
2. **Faktenpolicy:** Nur aus der Domain der bereitgestellten URL ableiten; bei Vermutungen: als *Hypothese* kennzeichnen.
3. **B2B-Ton:** sachkundig, respektvoll, nicht marktschreierisch.
4. **Klarheit & Struktur:** Jede Stufe als Markdown-Tabelle + optionales Kurzresuemee dokumentieren.
5. **Mindestens 4 spezifische Rollen** je Zielgruppe (nicht generisch).
6. **Kompakt & nuetzlich:** Praezise Formulierungen; keine Floskeln."""
# One template per step; each instructs the model to answer ONLY with the
# step's heading + Markdown table so the response can be parsed reliably.
STEP_PROMPTS_DE = [
"""# Aufgabe
Fuehre **Schritt 1 - Angebot verstehen (WAS)** fuer das folgende Unternehmen durch.
# Eingaben
* **Unternehmens-URL:** `{{company_url}}`
* **Zielsprache der Ausgabe:** `{{language}}`
* **Region(en) / Maerkte (optional):** `{{regions}}`
* **Produkt-/Loesungsfokus (optional):** `{{focus}}`
# Anweisungen fuer Schritt 1
* Extrahiere Produkt(e)/Leistung(en), Kernfunktionen, Differenzierung, relevante Werteversprechen.
* Erstelle ein kurzes Resuemee (max. 4 Bulletpoints) der wichtigsten Erkenntnisse.
* **Output:** Tabelle mit Spalten: *Produkt/Loesung | Beschreibung (1-2 Saetze) | Kernfunktionen | Differenzierung | Primaere Quelle (URL)*.
* **Format-Anforderung:** Antworte NUR mit den Ergebnissen fuer diesen einen Schritt. Deine Antwort muss mit der Ueberschrift \"## Schritt 1: Angebot (WAS)\" beginnen und das Kurzresuemee sowie die Markdown-Tabelle enthalten. Gib keine weiteren Erklaerungen ab.""",
"""# Aufgabe
Fuehre nun **Schritt 2 - Zielgruppen (WER - Unternehmen)** durch.
# Kontext: Validierte Ergebnisse aus vorherigen Schritten
{{previous_steps_data}}
# Anweisungen fuer Schritt 2
* Identifiziere B2B-Zielsegmente (Branchen/Unternehmensarten/Groessen/Regionen) basierend auf dem gegebenen Angebot.
* **Output:** Tabelle: *Zielbranche/Segment | Typische Unternehmensmerkmale | Region(en) | Relevanzbeleg (URL)*.
* **Format-Anforderung:** Antworte NUR mit den Ergebnissen fuer diesen einen Schritt. Deine Antwort muss mit der Ueberschrift \"## Schritt 2: Zielgruppen (Unternehmen)\" beginnen und die Markdown-Tabelle enthalten.""",
"""# Aufgabe
Fuehre nun **Schritt 3 - Zielpersonen/Rollen (WER - Personen)** durch.
# Kontext: Validierte Ergebnisse aus vorherigen Schritten
{{previous_steps_data}}
# Anweisungen fuer Schritt 3
* Fuer jede Zielbranche: mind. 4 **spezifische** Rollen mit Verantwortungsbereich und Kaufbeteiligung (E, I, D, U nach RACI-Logik). Erfinde **keine** Personen; leite Rollen logisch aus Problem-/Prozessbezug ab.
* **Output:** Tabelle: *Rolle (praezise) | Verantwortungsbereich | Warum relevant fuer Produkt | Kaufbeteiligung (E/I/D/U) | Quelle/Indiz (URL oder Referenz)*.
* **Format-Anforderung:** Antworte NUR mit den Ergebnissen fuer diesen einen Schritt. Deine Antwort muss mit der Ueberschrift \"## Schritt 3: Zielpersonen (Rollen)\" beginnen.""",
"""# Aufgabe
Fuehre nun **Schritt 4 - Painpoints je Rolle (WARUM)** durch.
# Kontext: Validierte Ergebnisse aus vorherigen Schritten
{{previous_steps_data}}
# Anweisungen fuer Schritt 4
* Formuliere pro Rolle 3-5 konkrete Painpoints (Beobachtungen, keine Features).
* Tagge jeden Painpoint mit einer Kategorie: **Kosten | Zeit | Risiko | Compliance | Qualitaet | Mitarbeiterbindung.**
* Fuege eine **Impact-Schaetzung (EUR, h, %)** als Hypothese hinzu.
* **Output:** Tabelle: *Rolle | Painpoint (konkret, messbar/operativ) | Kategorie | Auswirkung (Kosten, Risiko, Zeit) | Impact-Schaetzung (EUR, h, %) | Dringlichkeit (hoch/mittel/niedrig) | Quelle/Indiz (URL)*.
* **Format-Anforderung:** Antworte NUR mit den Ergebnissen fuer diesen einen Schritt. Deine Antwort muss mit der Ueberschrift \"## Schritt 4: Painpoints je Rolle\" beginnen.""",
"""# Aufgabe
Fuehre nun **Schritt 5 - Gains & Nutzen je Rolle (WARUM wechseln)** durch.
# Kontext: Validierte Ergebnisse aus vorherigen Schritten
{{previous_steps_data}}
# Anweisungen fuer Schritt 5
* Basierend auf den identifizierten Painpoints, formuliere pro Rolle 2-3 konkrete Gains (Vorteile/Nutzen).
* Quantifiziere den Nutzen als Hypothese (z.B. Einsparung in EUR, Zeitgewinn in h, Effizienzsteigerung in %).
* **Output:** Tabelle: *Rolle | Gain (konkreter Nutzen) | Quantifizierung (Hypothese in EUR, h, %) | Quelle/Indiz (URL)*.
* **Format-Anforderung:** Antworte NUR mit den Ergebnissen fuer diesen einen Schritt. Deine Antwort muss mit der Ueberschrift \"## Schritt 5: Gains & Nutzen je Rolle\" beginnen.""",
"""# Aufgabe
Fuehre nun **Schritt 6 - Marketingbotschaft (WIE sprechen)** durch.
# Kontext: Validierte Ergebnisse aus vorherigen Schritten
{{previous_steps_data}}
# Eingaben fuer diesen Schritt
* **Gewuenschte Kanaele fuer die Botschaft:** `{{channels}}`
# Anweisungen fuer Schritt 6: Chain-of-Thought-Analyse & Texterstellung
**FOKUS:** Erstelle die Botschaften **AUSSCHLIESSLICH** fuer die vorgegebene **Fokus-Branche: {{focus_industry}}**.
Ignoriere alle anderen Branchen. Dein Ziel ist es, fuer JEDE Rolle innerhalb dieser EINEN Branche eine spezifische Botschaft zu entwickeln.
Fuehre fuer jede **[Rolle]** innerhalb der **[Fokus-Branche: {{focus_industry}}]** den folgenden Denkprozess durch:
1. **Schritt 6.1 (Analyse): Produkt-Rollen-Fit.**
* Welches Produkt/welche Loesung aus der "Angebot"-Tabelle (Schritt 1) ist fuer die **[Rolle]** am relevantesten?
2. **Schritt 6.2 (Analyse): Branchen-Use-Case.**
* Was sind 1-2 typische Anwendungsfaelle fuer das ausgewaehlte Produkt in der **{{focus_industry}}**? Was macht die **[Rolle]** damit konkret?
3. **Schritt 6.3 (Analyse): Nutzen-Quantifizierung.**
* Betrachte die Painpoints (Schritt 4) und Gains (Schritt 5) fuer die **[Rolle]**.
* Leite daraus einen konkreten, fuer die **[Rolle]** relevanten KPI ab.
4. **Schritt 6.4 (Synthese): Botschaft formulieren.**
* Synthetisiere die Erkenntnisse aus 6.1-6.3 zu einer praegnanten Kernbotschaft (2-3 Saetze) nach der Struktur: **Beobachtung (Problem) -> Niedrigschwellige Loesungsidee -> Produkt-Bruecke -> Quantifizierter Nutzen.**
* Erstelle Varianten dieser Botschaft fuer die Kanaele: {{channels}}.
# Output Format
Erstelle ONLY die finale Markdown-Tabelle.
* **Table Columns:** *Fokus-Branche | Rolle | Kernbotschaft (2-3 sentences) | {{channels}}*.
* **Requirement:** Your response must start with the heading \"## Schritt 6: Botschaften\" and contain ONLY die vollstaendige Markdown-Tabelle.""",
"""# Aufgabe
Fuehre **Schritt 7 - Customer Journey & Buying Center** durch.
# Kontext: Validierte Ergebnisse aus vorherigen Schritten
{{previous_steps_data}}
# Fokus
Beziehe dich auf die **Fokus-Branche: {{focus_industry}}**.
# Anweisungen fuer Schritt 7
* Analysiere die Kaufreise ("Journey") vom ersten Trigger bis zum Vertrag.
* Identifiziere fuer jede Phase die **Dynamik im Buying Center**: Wer treibt an (Champion), wer bremst oder prueft (Gatekeeper/Evaluator), wer entscheidet (Decider)?
* Gehe besonders auf **technische und organisatorische Barrieren** ein (z.B. IT-Sicherheit, Schnittstellen wie Aufzugssteuerung, Prozessintegration).
* Definiere **konkrete Assets**, die die jeweilige Rolle in dieser Phase benoetigt, um Einwaende zu entkraeften oder interne Mitstreiter zu ueberzeugen (z.B. "API-Dokumentation fuer Aufzugsbauer", "ROI-Rechner fuer CFO", "Sicherheits-Whitepaper").
* **Output:** Erstelle eine Markdown-Tabelle mit exakt diesen Spalten: *Phase | Rolle | Funktion (Buying Center) | Zentrale Frage / Beduerfnis | Moeglicher Deal-Breaker | Benoetigtes Asset / Format*.
* **Format-Anforderung:** Antworte NUR mit den Ergebnissen fuer diesen einen Schritt. Deine Antwort muss mit der Ueberschrift \"## Schritt 7: Customer Journey\" beginnen."""
]
# English prompt set — mirrors the German templates one-to-one (same
# placeholders, same step order, same "heading + table only" output contract).
SYSTEM_PROMPT_EN = """# System Role
You are a **B2B Marketing Researcher & Copywriter**. You analyze a company URL, identify the offer, target groups, specific target roles, their pain points and gains, and based on this, you formulate an appreciative, expert marketing message. **Answer only with results, no thought processes.** Support every statement with a specific page URL from the analyzed domain. Explicitly mark uncertainties. Your response must always be in the target language {{language}}.
# Working Principles
1. **Source Prioritization:** Product Page -> Solutions Page -> Industries/References -> Company/About Us Page -> Blog/News (for verification only).
2. **Fact Policy:** Only derive from the domain of the provided URL; for assumptions, mark them as a *hypothesis*.
3. B2B Tone: Knowledgeable, respectful, not salesy.
4. Clarity & Structure: Document each stage as a Markdown table + an optional short summary.
5. At least 4 specific roles per target group (not generic).
6. Concise & Useful: Precise wording; no clichés."""
# One template per step (index 0 = Step 1 ... index 6 = Step 7).
STEP_PROMPTS_EN = [
"""# Task
Perform **Step 1 - Understand the Offer (WHAT)** for the following company.
# Inputs
* **Company URL:** `{{company_url}}`
* **Target Language of Output:** `{{language}}`
* **Region(s) / Markets (optional):** `{{regions}}`
* **Product/Solution Focus (optional):** `{{focus}}`
# Instructions for Step 1
* Extract product(s)/service(s), core features, differentiation, and relevant value propositions.
* Create a short summary (max. 4 bullet points) of the key findings.
* **Output:** Table with columns: *Product/Solution | Description (1-2 sentences) | Core Features | Differentiation | Primary Source (URL)*.
* **Format Requirement:** Respond ONLY with the results for this single step. Your response must start with the heading \"## Step 1: Offer (WHAT)\" and include the short summary and the Markdown table. Do not provide any other explanations.""",
"""# Task
Now perform **Step 2 - Target Groups (WHO - Companies)**.
# Context: Validated results from previous steps
{{previous_steps_data}}
# Instructions for Step 2
* Identify B2B target segments (industries/company types/sizes/regions) based on the given offer.
* **Output:** Table: *Target Industry/Segment | Typical Company Characteristics | Region(s) | Proof of Relevance (URL)*.
* **Format Requirement:** Respond ONLY with the results for this single step. Your response must start with the heading \"## Step 2: Target Groups (Companies)\" and include the Markdown table.""",
"""# Task
Now perform **Step 3 - Personas/Roles (WHO - People)**.
# Context: Validated results from previous steps
{{previous_steps_data}}
# Instructions for Step 3
* For each target industry: at least 4 **specific** roles with their area of responsibility and involvement in purchasing (E, I, D, U based on RACI logic). Do **not** invent people; logically derive roles from problem/process context.
* **Output:** Table: *Role (precise) | Area of Responsibility | Why relevant for the product | Buying Involvement (E/I/D/U) | Source/Indication (URL or reference)*.
* **Format Requirement:** Respond ONLY with the results for this single step. Your response must start with the heading \"## Step 3: Personas (Roles)\".""",
"""# Task
Now perform **Step 4 - Pain Points per Role (WHY)**.
# Context: Validated results from previous steps
{{previous_steps_data}}
# Instructions for Step 4
* For each role, formulate 3-5 specific pain points (observations, not features).
* Tag each pain point with a category: **Cost | Time | Risk | Compliance | Quality | Employee Retention.**
* Add an **Impact Estimate (EUR, h, %)** as a hypothesis.
* **Output:** Table: *Role | Pain Point (specific, measurable/operational) | Category | Impact (Cost, Risk, Time) | Impact Estimate (EUR, h, %) | Urgency (high/medium/low) | Source/Indication (URL)*.
* **Format Requirement:** Respond ONLY with the results for this single step. Your response must start with the heading \"## Step 4: Pain Points per Role\".""",
"""# Task
Now perform **Step 5 - Gains & Benefits per Role (WHY switch)**.
# Context: Validated results from previous steps
{{previous_steps_data}}
# Instructions for Step 5
* Based on the identified pain points, formulate 2-3 concrete gains (advantages/benefits) for each role.
* Quantify the benefit as a hypothesis (e.g., savings in EUR, time gained in h, efficiency increase in %).
* **Output:** Table: *Role | Gain (specific benefit) | Quantification (Hypothesis in EUR, h, %) | Source/Indication (URL)*.
* **Format Requirement:** Respond ONLY with the results for this single step. Your response must start with the heading \"## Step 5: Gains & Benefits per Role\".""",
"""# Task
Now perform **Step 6 - Marketing Message (HOW to speak)**.
# Context: Validated results from previous steps
{{previous_steps_data}}
# Inputs for this step
* **Desired channels for the message:** `{{channels}}`
# Instructions for Step 6: Chain-of-Thought Analysis & Copywriting
**FOCUS:** Create messages **EXCLUSIVELY** for the provided **Focus Industry: {{focus_industry}}**.
Ignore all other industries. Your goal is to create a specific message for EACH role within this ONE industry.
For each **[Role]** within the **[Focus Industry: {{focus_industry}}]**, perform the following thought process:
1. **Step 6.1 (Analysis): Product-Role Fit.**
* Which product/solution from the "Offer" table (Step 1) is most relevant for the **[Role]**?
2. **Step 6.2 (Analysis): Industry Use Case.**
* What are 1-2 typical use cases for the selected product in the **[Focus Industry]**? What does the **[Role]** actually do with it?
3. **Step 6.3 (Analysis): Benefit Quantification.**
* Look at the Pain Points (Step 4) and Gains (Step 5) for the **[Role]**.
* Derive a concrete KPI relevant to the **[Role]**.
4. **Step 6.4 (Synthesis): Formulate Message.**
* Synthesize the findings from 6.1-6.3 into a concise core message (2-3 sentences) following the structure: **Observation (Problem) -> Low-threshold Solution Idea -> Product Bridge -> Quantified Benefit.**
* Create variants of this message for the channels: {{channels}}.
# Output Format
Create ONLY the final Markdown table.
* **Table Columns:** *Focus Industry | Role | Core Message (2-3 sentences) | {{channels}}*.
* **Requirement:** Your response must start with the heading \"## Step 6: Messages\" and contain ONLY the complete Markdown table.""",
"""# Task
Perform **Step 7 - Customer Journey & Buying Center**.
# Context: Validated results from previous steps
{{previous_steps_data}}
# Focus
Refer to the **Focus Industry: {{focus_industry}}**.
# Instructions for Step 7
* Analyze the purchase journey ("Journey") from the first trigger to the contract.
* Identify the **Buying Center dynamics** for each phase: Who drives it (Champion), who slows it down or audits (Gatekeeper/Evaluator), who decides (Decider)?
* Focus specifically on **technical and organizational barriers** (e.g., IT security, interfaces like elevator control, process integration).
* Define **concrete assets** that each role needs in this phase to invalidate objections or convince internal stakeholders (e.g., "API documentation for elevator manufacturers", "ROI calculator for CFO", "Security Whitepaper").
* **Output:** Create a Markdown table with exactly these columns: *Phase | Role | Function (Buying Center) | Key Question / Need | Potential Deal-Breaker | Needed Asset / Format*.
* **Format Requirement:** Respond ONLY with the results for this single step. Your response must start with the heading \"## Step 7: Customer Journey\"."""
]
# Per-language bundle: system prompt, step templates, canonical step titles
# (used both for rendering context and for matching response headings) and
# the static summary text injected for Step 1 results.
PROMPTS = {
    'de': {
        'SYSTEM_PROMPT': SYSTEM_PROMPT_DE,
        'STEP_PROMPTS': STEP_PROMPTS_DE,
        'STEP_TITLES': {
            'offer': 'Schritt 1: Angebot (WAS)',
            'targetGroups': 'Schritt 2: Zielgruppen (WER - Unternehmen)',
            'personas': 'Schritt 3: Zielpersonen/Rollen (WER - Personen)',
            'painPoints': 'Schritt 4: Painpoints je Rolle (WARUM)',
            'gains': 'Schritt 5: Gains & Nutzen je Rolle (WARUM wechseln)',
            'messages': 'Schritt 6: Marketingbotschaften je Segment & Rolle (WIE sprechen)',
            'customerJourney': 'Schritt 7: Customer Journey & Buying Center',
        },
        'SUMMARY_TITLE': 'Kurzresuemee:',
        'SUMMARY_TEXT_FOR_STEP1': [
            "Die Angebotsanalyse wurde erfolgreich auf Basis der Website-Inhalte generiert.",
            "Dies ist der erste Schritt des Prozesses, der vom neuen Python-Backend ausgefuehrt wird."
        ]
    },
    'en': {
        'SYSTEM_PROMPT': SYSTEM_PROMPT_EN,
        'STEP_PROMPTS': STEP_PROMPTS_EN,
        'STEP_TITLES': {
            'offer': 'Step 1: Offer (WHAT)',
            'targetGroups': 'Step 2: Target Groups (WHO - Companies)',
            'personas': 'Step 3: Personas/Roles (WHO - People)',
            'painPoints': 'Step 4: Pain Points per Role (WHY)',
            'gains': 'Step 5: Gains & Benefits per Role (WHY switch)',
            'messages': 'Step 6: Marketing Messages per Segment & Role (HOW to speak)',
            'customerJourney': 'Step 7: Customer Journey & Buying Center',
        },
        'SUMMARY_TITLE': 'Summary:',
        'SUMMARY_TEXT_FOR_STEP1': [
            "The offer analysis has been successfully generated based on website content.",
            "This is the first step of the process, executed by the new Python backend."
        ]
    }
}
# --- API & SCRAPING HELPERS ---
def load_api_key():
    """Read the Gemini API key from ``gemini_api_key.txt`` in the CWD.

    Returns the whitespace-stripped key string, or None when the file is
    missing (an error is logged so the caller can fail with a clear message).
    """
    try:
        # Explicit UTF-8 avoids platform-dependent default encodings when
        # reading the key file.
        with open("gemini_api_key.txt", "r", encoding="utf-8") as f:
            return f.read().strip()
    except FileNotFoundError:
        logging.error("API key file 'gemini_api_key.txt' not found.")
        return None
def call_gemini_api(prompt, api_key, retries=3):
    """Send *prompt* to the Gemini 2.5 Flash generateContent REST endpoint.

    Retries up to *retries* attempts with linear backoff (5s, 10s, ...) on
    server-side HTTP errors (500/502/503/504) and on any connection-level
    failure; other HTTP errors are logged and re-raised immediately.
    Returns the first candidate's text, or "" when the response body has
    an unexpected shape.
    """
    url = f"https://generativelanguage.googleapis.com/v1/models/gemini-2.5-flash:generateContent?key={api_key}"
    headers = {'Content-Type': 'application/json'}
    payload = {"contents": [{"parts": [{"text": prompt}]}]}
    for attempt in range(retries):
        try:
            # Increased timeout to 600s (10 minutes) for complex Step 6 generation
            response = requests.post(url, headers=headers, json=payload, timeout=600)
            response.raise_for_status()
            result = response.json()
            if 'candidates' in result and result['candidates']:
                candidate = result['candidates'][0]
                if 'content' in candidate and 'parts' in candidate['content']:
                    return candidate['content']['parts'][0]['text']
            # HTTP call succeeded but no usable candidate text was returned.
            logging.warning(f"Unexpected API response structure: {result}")
            return ""
        except requests.exceptions.HTTPError as e:
            # Retry on server errors (500, 502, 503, 504)
            if e.response.status_code in [500, 502, 503, 504] and attempt < retries - 1:
                wait_time = (attempt + 1) * 5
                logging.warning(f"API Error {e.response.status_code}. Retrying in {wait_time}s...")
                time.sleep(wait_time)
                continue
            # Client errors (4xx) and exhausted retries propagate to the caller.
            logging.error(f"Error calling Gemini API: {e}")
            raise
        except Exception as e:
            # Retry on connection errors (timeouts, DNS failures, resets, ...)
            if attempt < retries - 1:
                wait_time = (attempt + 1) * 5
                logging.warning(f"API Connection Error: {e}. Retrying in {wait_time}s...")
                time.sleep(wait_time)
                continue
            logging.error(f"Final Error calling Gemini API: {e}")
            raise
def get_text_from_url(url):
    """Fetch *url* and return its <body> as simplified, attribute-stripped HTML.

    Structural tags (headings, lists, links) are kept so the LLM can infer
    product groupings, while scripts, navigation chrome and all attributes
    except <a href> are removed to save tokens. Returns "" on any fetch or
    parse failure (best-effort scraping).
    """
    try:
        # Browser-like UA: some sites block the default python-requests agent.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')
        # 1. Remove specific noise tags (including header/footer to avoid navigation links)
        for element in soup(['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'meta', 'link']):
            element.decompose()
        # 2. Clean attributes but keep structure and HREF
        for tag in soup.find_all(True):
            # We want to keep the tag (e.g. <h1>, <a>), but clean attributes.
            # Copy attrs first: deleting while iterating the live dict would break.
            current_attrs = dict(tag.attrs)
            for attr in current_attrs:
                # Keep 'href' for links so the LLM can extract the source URL
                if tag.name == 'a' and attr == 'href':
                    continue
                # Remove everything else (class, id, style, onclick, etc.)
                del tag[attr]
        # 3. Return the HTML structure (body only if possible)
        body = soup.find('body')
        if body:
            html_content = str(body)
        else:
            html_content = str(soup)
        # 4. Minimize whitespace to save tokens (remove empty lines)
        lines = [line.strip() for line in html_content.split('\n') if line.strip()]
        return "\n".join(lines)
    except Exception as e:
        logging.warning(f"Could not fetch or read URL {url}: {e}")
        return ""
def find_relevant_links(base_url):
    """Collect up to 10 same-domain links on *base_url* worth scraping.

    A link qualifies when its href or anchor text contains one of
    LINK_KEYWORDS. Returns absolute URLs; [] on any fetch/parse failure.

    BUGFIX: the previous implementation truncated ``list(set(...))[:10]``,
    so the 10 links kept varied between runs (set iteration order is
    arbitrary). Document order is now preserved with an order-keeping
    dedup, making the selection deterministic.
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(base_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')
        base_netloc = urlparse(base_url).netloc
        relevant_links = []
        seen = set()
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            link_text = a_tag.get_text(strip=True).lower()
            if any(keyword in href.lower() or keyword in link_text for keyword in LINK_KEYWORDS):
                abs_url = urljoin(base_url, href)
                # Stay on the same domain; skip duplicates but keep first-seen order.
                if urlparse(abs_url).netloc == base_netloc and abs_url not in seen:
                    seen.add(abs_url)
                    relevant_links.append(abs_url)
        return relevant_links[:10]
    except Exception as e:
        logging.warning(f"Could not scrape base URL {base_url} for links: {e}")
        return []
def clean_llm_response(text):
    """Normalize an LLM reply before Markdown parsing.

    Collapses long runs of spaces/tabs, strips non-printable characters
    (keeping \\n, \\r, \\t) and trims whitespace trailing a line-ending
    table pipe. Falsy input yields "".
    """
    if not text:
        return ""
    # Collapse runs of five or more spaces/tabs into a single space.
    collapsed = re.sub(r'[ \t]{5,}', ' ', text)
    # Drop control/non-printable characters, keeping ordinary whitespace.
    kept = []
    for ch in collapsed:
        if ch.isprintable() or ch in "\n\r\t":
            kept.append(ch)
    sanitized = "".join(kept)
    # Tidy trailing whitespace after a pipe at end-of-line (table artifact).
    tidied = re.sub(r'\|\s*$', '|', sanitized, flags=re.MULTILINE)
    return tidied.strip()
def parse_markdown_table(markdown_text):
    """Extract the first Markdown table from an LLM response.

    Returns {"headers": [...], "rows": [[...], ...]} with **bold** markers
    stripped from every cell and each row padded/truncated to the header
    width. Empty lists are returned when no table can be found.
    """
    bold_re = re.compile(r'\*+([^*]+)\*+')

    def _strip_cell(cell):
        # Trim whitespace and remove Markdown emphasis markers.
        return bold_re.sub(r'\1', cell.strip()).strip()

    def _is_separator(line):
        # A separator row (e.g. |---|:---:|) contains '---' and nothing
        # alphanumeric once |, -, space and : are removed.
        residue = line.replace('|', '').replace('-', '').replace(' ', '').replace(':', '')
        return '---' in line and not re.search(r'[a-zA-Z0-9]', residue)

    # Sanitize input first
    markdown_text = clean_llm_response(markdown_text)
    lines = [line.strip() for line in markdown_text.strip().split('\n') if line.strip()]
    # 1. Keep only lines that look like table rows (start and end with |).
    table_lines = [line for line in lines if line.startswith('|') and line.endswith('|')]
    if not table_lines:
        return {"headers": [], "rows": []}
    # 2. Find the separator line (|---|---|...) to locate the header row.
    separator_index = -1
    for i, line in enumerate(table_lines):
        if _is_separator(line):
            separator_index = i
            break
    if separator_index == -1:
        # No separator found; LLMs usually emit one, so assume the first
        # table line is the header.
        header_line = table_lines[0]
        data_start = 1
    else:
        # Separator found. Header is the line before it.
        if separator_index == 0:
            return {"headers": [], "rows": []}
        header_line = table_lines[separator_index - 1]
        data_start = separator_index + 1
    # 3. Extract and clean headers (empty cells from outer pipes are dropped).
    headers = [_strip_cell(h) for h in header_line.split('|') if h.strip()]
    if not headers:
        return {"headers": [], "rows": []}
    # 4. Extract and clean data rows.
    rows = []
    for line in table_lines[data_start:]:
        # BUGFIX: stray separator lines inside the data region previously
        # passed the `any(cells)` filter and were emitted as rows of '---';
        # skip them explicitly.
        if _is_separator(line):
            continue
        cells = [_strip_cell(c) for c in line.split('|')]
        # The outer pipes produce empty first/last elements from split().
        if line.startswith('|'):
            cells = cells[1:]
        if line.endswith('|'):
            cells = cells[:-1]
        # Pad or truncate the row to match the header length.
        if len(cells) < len(headers):
            cells.extend([''] * (len(headers) - len(cells)))
        elif len(cells) > len(headers):
            cells = cells[:len(headers)]
        if any(cells):
            rows.append(cells)
    return {"headers": headers, "rows": rows}
def format_context_for_prompt(analysis_data, language):
    """Render every completed step as Markdown for {{previous_steps_data}}.

    Walks the pipeline steps in canonical order and, for each one present
    in *analysis_data*, emits its localized title, optional summary bullets
    and the result table.
    """
    lang_prompts = PROMPTS[language]
    titles = lang_prompts['STEP_TITLES']
    ordered_keys = ['offer', 'targetGroups', 'personas', 'painPoints', 'gains', 'messages', 'customerJourney']
    parts = []
    for idx, key in enumerate(ordered_keys):
        data = analysis_data.get(key)
        if not data:
            continue
        parts.append(f"\n\n## {titles.get(key, f'Step {idx + 1}')}\n\n")
        summary = data.get('summary')
        if summary:
            bullets = "\n".join(f"* {item}" for item in summary)
            parts.append(f"**{lang_prompts['SUMMARY_TITLE']}**\n{bullets}\n\n")
        headers = data.get('headers', [])
        rows = data.get('rows', [])
        if headers and rows:
            parts.append(f"| {' | '.join(headers)} |\n")
            parts.append(f"| {' | '.join(['---'] * len(headers))} |\n")
            for row in rows:
                # Pad short rows so every line has the full column count.
                filled = row + [''] * (len(headers) - len(row))
                parts.append(f"| {' | '.join(filled)} |\n")
            parts.append("\n")
    return "".join(parts)
# --- CORE LOGIC ---
def start_generation(url, language, regions, focus):
    """Run pipeline Step 1: scrape the company site and derive the offer table.

    Scrapes the main URL plus up to 10 keyword-matched sub-pages, feeds the
    simplified HTML (tagged with SOURCE_URL markers) to Gemini and parses
    the returned Markdown table.

    Returns a dict with the CLI inputs under '_initial_inputs' (consumed
    later by next_step) and the parsed table under 'offer'. Raises
    ValueError when the API key is missing and RuntimeError when nothing
    could be scraped.
    """
    logging.info(f"Starting Step 1 for URL: {url} in language: {language}")
    api_key = load_api_key()
    if not api_key: raise ValueError("Gemini API key is missing.")
    # 1. Scraping Strategy: Main Page + Relevant Sub-pages
    urls_to_scrape = sorted(list(set([url] + find_relevant_links(url))))
    grounding_text = ""
    logging.info(f"Identified {len(urls_to_scrape)} pages to scrape.")
    for u in urls_to_scrape:
        logging.info(f" - Scraping: {u}")
        text_content = get_text_from_url(u)
        if text_content:
            # Inject SOURCE_URL marker for the LLM
            grounding_text += f"SOURCE_URL: {u}\nCONTENT (Simplified HTML):\n{text_content}\n\n{'='*50}\n\n"
    if not grounding_text.strip(): raise RuntimeError(f"Failed to scrape content from {url}")
    current_prompts = PROMPTS[language]
    system_instruction = current_prompts['SYSTEM_PROMPT'].replace('{{language}}', language)
    # Updated Prompt: Removed length limit and added instruction for SOURCE_URL
    grounded_offer_prompt = f"{system_instruction}\n\n# TASK\nAnalyze the provided website content to understand the company's offerings. Your response MUST be a Markdown table.\n\n# CONTEXT\n- Website Content: The input provided is **Simplified HTML**. Use the structure (e.g. <h1>-<h6> headers, <ul> lists, <div> groupings) to identify distinct products or services.\n- **Content Data (with SOURCE_URL markers):** \n```html\n{grounding_text}\n```\n- Target Language: {language}\n- Company URL: {url}\n- Focus: {focus or 'N/A'}\n- Regions: {regions or 'N/A'}\n\n# INSTRUCTIONS\n1. Identify products/services by looking for recurring HTML patterns (e.g. a Header followed by a description and a 'Learn More' link).\n2. Create Markdown table: Produkt/Loesung | Beschreibung (1-2 Saetze) | Kernfunktionen | Differenzierung | Primaere Quelle (URL)\n3. **IMPORTANT:** For the 'Primaere Quelle (URL)' column, look for the `<a href='...'>` tag NEAREST to the product description. Combine it with the `SOURCE_URL` if it's a relative link. Do not just link the homepage.\n4. Response must be ONLY the table starting with '## {current_prompts['STEP_TITLES']['offer']}'."
    # Log the full prompt (Input)
    save_detailed_log("step1_offer", "prompt", grounded_offer_prompt)
    response_text = call_gemini_api(grounded_offer_prompt, api_key)
    # Log the full response (Output)
    save_detailed_log("step1_offer", "response", response_text)
    step1_title = current_prompts['STEP_TITLES']['offer']
    # Flexible header matching: accept any "## Schritt 1"/"## Step 1" heading
    # and parse only the text after it.
    title_match = re.search(rf'^##\s*(?:Schritt|Step)\s*1.*$', response_text, re.IGNORECASE | re.MULTILINE)
    content = response_text[title_match.end():].strip() if title_match else response_text
    table_data = parse_markdown_table(content)
    return {
        "_initial_inputs": {"url": url, "language": language, "regions": regions, "focus": focus},
        "offer": {"summary": current_prompts['SUMMARY_TEXT_FOR_STEP1'], "headers": table_data['headers'], "rows": table_data['rows']}
    }
def next_step(language, context_file, generation_step, channels, focus_industry=None):
    """Run pipeline step *generation_step* (2-7) using prior results as context.

    Loads the accumulated analysis JSON from *context_file*, renders it as
    Markdown context, fills the step's prompt template and parses the
    model's reply. Returns {step_key: {"summary": [...], "headers": [...],
    "rows": [...]}} for merging into the analysis JSON by the caller.
    """
    logging.info(f"Starting Step {generation_step} in language: {language}")
    api_key = load_api_key()
    if not api_key: raise ValueError("Gemini API key is missing.")
    with open(context_file, 'r', encoding='utf-8') as f: analysis_data = json.load(f)
    current_prompts = PROMPTS[language]
    system_instruction = current_prompts['SYSTEM_PROMPT'].replace('{{language}}', language)
    # Templates are 0-indexed; generation_step is 1-indexed.
    step_prompt_template = current_prompts['STEP_PROMPTS'][generation_step - 1]
    previous_steps_markdown = format_context_for_prompt(analysis_data, language)
    prompt = step_prompt_template.replace('{{previous_steps_data}}', previous_steps_markdown)
    if '{{channels}}' in prompt: prompt = prompt.replace('{{channels}}', channels or 'LinkedIn, Kaltmail, Landingpage')
    # Inject focus industry if provided (for Step 6)
    if '{{focus_industry}}' in prompt:
        prompt = prompt.replace('{{focus_industry}}', focus_industry or 'Primary Industry')
    initial_inputs = analysis_data.get('_initial_inputs', {})
    # Helper to safely get string values even if they are None/null in the JSON
    def get_safe(key):
        val = initial_inputs.get(key)
        return str(val) if val is not None else 'N/A'
    prompt = prompt.replace('{{company_url}}', get_safe('url')).replace('{{language}}', language).replace('{{regions}}', get_safe('regions')).replace('{{focus}}', get_safe('focus'))
    full_prompt = f"{system_instruction}\n\n{prompt}"
    # Log the full prompt
    save_detailed_log(f"step{generation_step}", "prompt", full_prompt)
    response_text = call_gemini_api(full_prompt, api_key)
    # Log the full response
    save_detailed_log(f"step{generation_step}", "response", response_text)
    step_key = ['offer', 'targetGroups', 'personas', 'painPoints', 'gains', 'messages', 'customerJourney'][generation_step - 1]
    expected_title = current_prompts['STEP_TITLES'][step_key]
    # Flexible header matching: accept any "## Schritt N"/"## Step N" heading
    # and parse only the text after it.
    title_match = re.search(rf'^##\s*(?:Schritt|Step)\s*{generation_step}.*$', response_text, re.IGNORECASE | re.MULTILINE)
    content = response_text[title_match.end():].strip() if title_match else response_text
    table_data = parse_markdown_table(content)
    # Fixed Regex: Added proper grouping (?: ... ) around the stop tokens
    summary_match = re.search(r'\*\*(?:Kurzresuemee|Summary).*?:\*\*\s*([\s\S]*?)(?:\| ---|## (?:Schritt|Step))', response_text, re.IGNORECASE)
    # Strip bullet/number prefixes from each non-empty summary line.
    summary = [re.sub(r'^\*\s*|^-\s*|^\d+\.\s*', '', s.strip()) for s in summary_match[1].split('\n') if s.strip()] if summary_match else []
    return {step_key: {"summary": summary, "headers": table_data['headers'], "rows": table_data['rows']}}
def enrich_product(product_name, product_url, language):
    """Generate one offer-table row for a single product via Gemini.

    Optionally grounds the prompt with (truncated) content scraped from
    *product_url*. Returns a list of cell strings. *language* is accepted
    for CLI symmetry but is not currently used by the prompt.

    Raises ValueError when the API key is missing.
    """
    logging.info(f"Enriching product: {product_name} ({product_url})")
    api_key = load_api_key()
    if not api_key: raise ValueError("Gemini API key is missing.")
    grounding_text = ""
    if product_url:
        grounding_text = get_text_from_url(product_url)
    prompt_text = f"""
# ANWEISUNG
Du bist ein B2B-Marketing-Analyst. Deine Aufgabe ist es, die Daten für EIN Produkt zu generieren.
Basierend auf dem Produktnamen und (optional) dem Inhalt der Produkt-URL, fülle die Spalten einer Markdown-Tabelle aus.
Die Ausgabe MUSS eine einzelne, kommaseparierte Zeile sein, die in eine Tabelle passt. KEINE Header, KEIN Markdown, nur die Werte.
# PRODUKT
- Name: "{product_name}"
- URL-Inhalt: "{grounding_text[:3000]}..."
# SPALTEN
Produkt/Lösung | Beschreibung (1-2 Sätze) | Kernfunktionen | Differenzierung | Primäre Quelle (URL)
# BEISPIEL-OUTPUT
Saugroboter NR1500,Ein professioneller Saugroboter für große Büroflächen.,Autonome Navigation;Intelligente Kartierung;Lange Akkulaufzeit,Fokus auf B2B-Markt;Datenschutzkonform,https://nexaro.com/products/nr1500
# DEINE AUFGABE
Erstelle jetzt die kommaseparierte Zeile für das Produkt "{product_name}".
"""
    response_text = call_gemini_api(prompt_text, api_key)
    line = response_text.strip().strip('`').strip()
    # BUGFIX: the reply was split naively on ',' although product
    # descriptions routinely contain commas and the column spec above uses
    # pipes. If the model answered as a (Markdown) pipe-separated row,
    # split on '|' instead; otherwise fall back to the comma format.
    if '|' in line:
        return [cell.strip() for cell in line.strip('|').split('|')]
    return [cell.strip() for cell in line.split(',')]
def main():
    """CLI entry point.

    Dispatches --mode to one of the pipeline functions and writes the
    result as JSON to stdout (logging goes to file/stderr). On any failure
    an {"error": ...} object is written and the process exits with code 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', required=True)
    parser.add_argument('--url')
    parser.add_argument('--focus')
    parser.add_argument('--regions')
    parser.add_argument('--context_file')
    parser.add_argument('--generation_step', type=int)
    parser.add_argument('--channels')
    parser.add_argument('--language', required=True)
    parser.add_argument('--focus_industry')  # Focus industry for steps 6/7
    parser.add_argument('--product_name')
    parser.add_argument('--product_url')
    args = parser.parse_args()
    try:
        if args.mode == 'start_generation':
            result = start_generation(args.url, args.language, args.regions, args.focus)
        elif args.mode == 'next_step':
            result = next_step(args.language, args.context_file, args.generation_step, args.channels, args.focus_industry)
        elif args.mode == 'enrich_product':
            result = enrich_product(args.product_name, args.product_url, args.language)
        else:
            # BUGFIX: an unknown mode previously left `result` unbound and
            # surfaced as a confusing NameError; fail explicitly instead.
            raise ValueError(f"Unknown mode: {args.mode}")
        sys.stdout.write(json.dumps(result, ensure_ascii=False))
    except Exception as e:
        logging.error(f"Error: {e}", exc_info=True)
        sys.stdout.write(json.dumps({"error": str(e)}, ensure_ascii=False))
        sys.exit(1)
if __name__ == '__main__':
    main()