feat(gtm): Implement Meta-Framework for strategic analysis

Refactors the GTM orchestrator prompts (phases 2-9) to use a question-based strategic framework derived from the internal marketing blueprint. This new 'Meta-Framework' approach ensures strategic depth and prevents content pollution from irrelevant examples when analyzing new product categories.

- Updates the orchestrator prompts.
- Adds documentation explaining how to modify the new strategy logic.
- Includes minor fixes to the Node.js code and dependency updates.
This commit is contained in:
2026-01-14 15:34:15 +00:00
parent dd85931561
commit 63243cd344
5 changed files with 4427 additions and 57 deletions

View File

@@ -433,33 +433,78 @@ def find_relevant_links(base_url):
logging.warning(f"Could not scrape base URL {base_url} for links: {e}")
return []
def clean_llm_response(text):
    """Sanitize an LLM response by removing whitespace runs and common artifacts.

    Args:
        text: Raw response text. Any falsy value (``None``, ``""``) yields ``""``.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    if not text:
        return ""
    # 1. Collapse long runs (5+) of spaces/tabs into a single space. Shorter
    #    runs and all newlines are left alone so markdown structure survives.
    text = re.sub(r'[ \t]{5,}', ' ', text)
    # 2. Drop non-printable characters, keeping newline, carriage return and tab.
    text = "".join(ch for ch in text if ch.isprintable() or ch in "\n\r\t")
    # 3. Trim trailing whitespace after a row-closing pipe. Only horizontal
    #    whitespace (and a stray '\r') is matched: a generic '\s' would also
    #    consume the following blank lines and glue the next paragraph onto
    #    the table.
    text = re.sub(r'\|[ \t\r]+$', '|', text, flags=re.MULTILINE)
    return text.strip()
def _is_table_separator(line):
    """Return True when *line* looks like a markdown delimiter row (``|---|:--:|``)."""
    if '---' not in line:
        return False
    # A delimiter row has nothing alphanumeric left once pipes, dashes,
    # spaces and alignment colons are removed.
    residue = line.replace('|', '').replace('-', '').replace(' ', '').replace(':', '')
    return not re.search(r'[a-zA-Z0-9]', residue)


def _split_table_cells(line):
    """Split a pipe-delimited row into cleaned cells, keeping empty cells in place."""
    # Strip surrounding whitespace and markdown emphasis markers (*text*, **text**).
    cells = [re.sub(r'\*+([^*]+)\*+', r'\1', c.strip()).strip() for c in line.split('|')]
    # The outer pipes yield an empty first and last element; drop them.
    if line.startswith('|'):
        cells = cells[1:]
    if line.endswith('|'):
        cells = cells[:-1]
    return cells


def parse_markdown_table(markdown_text):
    """Parse the markdown table contained in *markdown_text*.

    The text is first passed through ``clean_llm_response``. Every line that
    starts and ends with ``|`` is treated as a table line. The header is the
    line directly above the first delimiter row; when no delimiter row exists
    the first table line is used as the header.

    Args:
        markdown_text: Text (typically an LLM response) containing a table.

    Returns:
        A dict ``{"headers": [...], "rows": [[...], ...]}``. Each row has
        exactly ``len(headers)`` cells: short rows are padded with ``''`` and
        long rows are truncated. Both lists are empty when no usable table is
        found.
    """
    # Sanitize input first
    markdown_text = clean_llm_response(markdown_text)

    # 1. Identify all lines that look like table rows (start and end with |).
    #    Blank lines fail the test on their own, so no extra filter is needed.
    stripped_lines = (line.strip() for line in markdown_text.split('\n'))
    table_lines = [ln for ln in stripped_lines if ln.startswith('|') and ln.endswith('|')]
    if not table_lines:
        return {"headers": [], "rows": []}

    # 2. Find the separator line (|---|---|...)
    separator_index = next(
        (i for i, ln in enumerate(table_lines) if _is_table_separator(ln)),
        -1,
    )
    if separator_index == 0:
        # A delimiter row with no header above it is not a usable table.
        return {"headers": [], "rows": []}
    if separator_index == -1:
        # No separator found: assume the first table line is the header.
        header_line = table_lines[0]
        data_start = 1
    else:
        # Separator found. Header is the line before it.
        header_line = table_lines[separator_index - 1]
        data_start = separator_index + 1

    # 3. Extract and clean headers. Cells are split positionally (the same
    #    way as data rows) so an empty header cell cannot shift the columns.
    headers = _split_table_cells(header_line)
    if not any(headers):
        return {"headers": [], "rows": []}

    # 4. Extract and clean rows
    width = len(headers)
    rows = []
    for line in table_lines[data_start:]:
        # Skip any further delimiter rows; their '---' cells are truthy and
        # would otherwise be kept as data.
        if _is_table_separator(line):
            continue
        cells = _split_table_cells(line)
        # Pad or truncate the row to match the header length.
        cells = cells[:width] + [''] * (width - len(cells))
        # Only add rows that carry at least one non-empty cell.
        if any(cells):
            rows.append(cells)

    return {"headers": headers, "rows": rows}
def format_context_for_prompt(analysis_data, language):
@@ -521,7 +566,8 @@ def start_generation(url, language, regions, focus):
save_detailed_log("step1_offer", "response", response_text)
step1_title = current_prompts['STEP_TITLES']['offer']
title_match = re.search(rf'## {re.escape(step1_title)}\s*', response_text, re.IGNORECASE)
# Flexible header matching
title_match = re.search(rf'^##\s*(?:Schritt|Step)\s*1.*$', response_text, re.IGNORECASE | re.MULTILINE)
content = response_text[title_match.end():].strip() if title_match else response_text
table_data = parse_markdown_table(content)
@@ -566,7 +612,8 @@ def next_step(language, context_file, generation_step, channels, focus_industry=
step_key = ['offer', 'targetGroups', 'personas', 'painPoints', 'gains', 'messages', 'customerJourney'][generation_step - 1]
expected_title = current_prompts['STEP_TITLES'][step_key]
title_match = re.search(rf'## {re.escape(expected_title)}\s*', response_text, re.IGNORECASE)
# Flexible header matching
title_match = re.search(rf'^##\s*(?:Schritt|Step)\s*{generation_step}.*$', response_text, re.IGNORECASE | re.MULTILINE)
content = response_text[title_match.end():].strip() if title_match else response_text
table_data = parse_markdown_table(content)