feat(gtm): Implement Meta-Framework for strategic analysis

Refactors the GTM orchestrator prompts (phases 2-9) to use a question-based strategic framework derived from the internal marketing blueprint. This new 'Meta-Framework' approach ensures strategic depth and prevents content pollution from irrelevant examples when analyzing new product categories.

- Updates the orchestrator prompts.
- Adds documentation explaining how to modify the new strategy logic.
- Includes minor Node.js fixes and dependency updates.
2026-01-14 15:34:15 +00:00
parent dd85931561
commit 63243cd344
5 changed files with 4427 additions and 57 deletions
--- a/b2b_marketing_orchestrator.py
+++ b/b2b_marketing_orchestrator.py
@@ -433,33 +433,78 @@ def find_relevant_links(base_url):
        logging.warning(f"Could not scrape base URL {base_url} for links: {e}")
        return []
        
+def clean_llm_response(text):
+    """Sanitizes the LLM response to remove excessive whitespace and common artifacts."""
+    if not text: return ""
+    # 1. Replace multiple spaces/newlines with single ones (within a reasonable limit)
+    # But preserve single newlines for markdown structure
+    text = re.sub(r'[ \t]{5,}', ' ', text) # Replace 5+ spaces/tabs with 1 space
+    # 2. Remove non-printable characters (except common ones)
+    text = "".join(ch for ch in text if ch.isprintable() or ch in "\n\r\t")
+    # 3. Fix common table artifacts like empty pipes at the end of lines
+    text = re.sub(r'\|\s*$', '|', text, flags=re.MULTILINE)
+    return text.strip()
+
 def parse_markdown_table(markdown_text):
-    lines = [line.strip() for line in markdown_text.strip().split('\n')]
+    # Sanitize input first
+    markdown_text = clean_llm_response(markdown_text)
+    
+    lines = [line.strip() for line in markdown_text.strip().split('\n') if line.strip()]
    table_lines = []
-    in_table_section = False
+    
+    # 1. Identify all lines that look like table rows (start and end with |)
    for line in lines:
-        if re.match(r'^\|.*\|$', line) and '---' not in line:
-            in_table_section = True
+        if line.startswith('|') and line.endswith('|'):
            table_lines.append(line)
-        elif in_table_section and '---' in line and re.match(r'^\|(:?-+:?)\|', line.replace(' ', '')):
-            table_lines.append(line)
-        elif in_table_section:
-            break
-    if not table_lines: return {"headers": [], "rows": []}
+    
+    if not table_lines: 
+        return {"headers": [], "rows": []}
+    
+    # 2. Find the separator line (|---|---|...)
    separator_index = -1
    for i, line in enumerate(table_lines):
-        if '---' in line and re.match(r'^\|(:?-+:?)\|', line.replace(' ', '')):
+        # A separator line usually has at least one dash between pipes and no alphanumeric chars
+        if '---' in line and not re.search(r'[a-zA-Z0-9]', line.replace('|', '').replace('-', '').replace(' ', '').replace(':', '')):
            separator_index = i
            break
-    if separator_index == -1 or separator_index == 0: return {"headers": [], "rows": []}
-    header_line = table_lines[0]
+            
+    if separator_index == -1:
+        # If no separator found, we might just have a list of rows where the first is the header
+        # but usually LLMs provide the separator. Let's assume the first is header.
+        header_line = table_lines[0]
+        data_start = 1
+    else:
+        # Separator found. Header is the line before it.
+        if separator_index == 0: return {"headers": [], "rows": []}
+        header_line = table_lines[separator_index - 1]
+        data_start = separator_index + 1
+        
+    # 3. Extract and clean headers
    headers = [re.sub(r'\*+([^*]+)\*+', r'\1', h.strip()).strip() for h in header_line.split('|') if h.strip()]
+    if not headers: return {"headers": [], "rows": []}
+    
+    # 4. Extract and clean rows
    rows = []
-    for line in table_lines[separator_index + 1:]:
+    for line in table_lines[data_start:]:
+        # Split by | and remove leading/trailing empty elements from the split result
        raw_cells = line.split('|')
-        cells = [re.sub(r'\*+([^*]+)\*+', r'\1', c.strip()).strip() for c in raw_cells if c.strip()]
-        if len(cells) == len(headers):
+        # Handle the leading/trailing empty strings caused by the outer pipes
+        cells = [re.sub(r'\*+([^*]+)\*+', r'\1', c.strip()).strip() for c in raw_cells]
+        
+        # If the line starts and ends with |, the first and last elements are empty strings
+        if line.startswith('|'): cells = cells[1:]
+        if line.endswith('|'): cells = cells[:-1]
+        
+        # Pad or truncate row to match header length
+        if len(cells) < len(headers):
+            cells.extend([''] * (len(headers) - len(cells)))
+        elif len(cells) > len(headers):
+            cells = cells[:len(headers)]
+            
+        # Only add row if it's not another separator or empty
+        if any(cells):
            rows.append(cells)
+            
    return {"headers": headers, "rows": rows}

 def format_context_for_prompt(analysis_data, language):
@@ -521,7 +566,8 @@ def start_generation(url, language, regions, focus):
    save_detailed_log("step1_offer", "response", response_text)
    
    step1_title = current_prompts['STEP_TITLES']['offer']
-    title_match = re.search(rf'## {re.escape(step1_title)}\s*', response_text, re.IGNORECASE)
+    # Flexible header matching
+    title_match = re.search(rf'^##\s*(?:Schritt|Step)\s*1.*$', response_text, re.IGNORECASE | re.MULTILINE)
    content = response_text[title_match.end():].strip() if title_match else response_text
    table_data = parse_markdown_table(content)
    
@@ -566,7 +612,8 @@ def next_step(language, context_file, generation_step, channels, focus_industry=
    
    step_key = ['offer', 'targetGroups', 'personas', 'painPoints', 'gains', 'messages', 'customerJourney'][generation_step - 1]
    expected_title = current_prompts['STEP_TITLES'][step_key]
-    title_match = re.search(rf'## {re.escape(expected_title)}\s*', response_text, re.IGNORECASE)
+    # Flexible header matching
+    title_match = re.search(rf'^##\s*(?:Schritt|Step)\s*{generation_step}.*$', response_text, re.IGNORECASE | re.MULTILINE)
    content = response_text[title_match.end():].strip() if title_match else response_text
    table_data = parse_markdown_table(content)