# Brancheneinstufung2/ARCHIVE_legacy_scripts/test_parser.py


import re
import json

# A pipe only delimits cells when it is not escaped with a backslash.
_UNESCAPED_PIPE = re.compile(r'(?<!\\)\|')
# Emphasis wrappers (*text*, **text**, ...) of which only the inner text is kept.
_EMPHASIS = re.compile(r'\*+([^\*]+)\*+')
# A separator line must not contain any ASCII letter or digit.
_ALNUM = re.compile(r'[a-zA-Z0-9]')


def _split_table_row(line):
    """Split one stripped ``|...|`` table line into stripped raw cell strings."""
    # The line starts with '|', so the first fragment is the empty text before it.
    parts = _UNESCAPED_PIPE.split(line)[1:]
    # Same for the text after the closing pipe (absent if that pipe is escaped).
    if parts and not parts[-1].strip():
        parts.pop()
    return [part.replace('\\|', '|').strip() for part in parts]


def _strip_emphasis(text):
    """Remove ``*``/``**`` emphasis markers around text and trim whitespace."""
    return _EMPHASIS.sub(r'\1', text).strip()


def parse_markdown_table(markdown_text: str) -> dict:
    """Extract a pipe-delimited Markdown table from *markdown_text*.

    Every line that (after trimming) starts and ends with ``|`` is treated as
    a table line; all other lines are ignored, so table lines do not have to
    be adjacent. The header is the line directly above the first separator
    line (``| --- | --- |``). If there is no separator line, the first table
    line is the header.

    Cells are split on unescaped pipes only; an escaped pipe (``\\|``) is kept
    as a literal ``|`` inside its cell. Emphasis markers are removed from
    header and data cells.

    Columns whose header cell is empty are dropped from both the headers and
    the rows, so the remaining values stay aligned with their header. Rows are
    padded with ``''`` when they are shorter than the header, surplus cells
    are discarded, and rows without any non-empty cell are skipped.

    Args:
        markdown_text: Text that may contain a Markdown table.

    Returns:
        ``{"headers": [...], "rows": [[...], ...]}``. Both lists are empty
        when no table line exists, when the first table line is already the
        separator, or when the header line has no non-empty cell.
    """
    table_lines = [
        line
        for line in (raw.strip() for raw in markdown_text.strip().split('\n'))
        if line.startswith('|') and line.endswith('|')
    ]
    if not table_lines:
        return {"headers": [], "rows": []}

    # First line made only of pipes, dashes, colons and similar punctuation.
    separator_index = next(
        (
            index
            for index, line in enumerate(table_lines)
            if '---' in line and not _ALNUM.search(line)
        ),
        -1,
    )

    if separator_index == 0:
        # A separator without a header line above it: nothing usable.
        return {"headers": [], "rows": []}
    if separator_index == -1:
        header_line = table_lines[0]
        data_start = 1
    else:
        header_line = table_lines[separator_index - 1]
        data_start = separator_index + 1

    header_cells = _split_table_row(header_line)
    # Remember the *positions* of the named columns so that an empty header
    # cell does not shift the data of the following columns.
    kept_columns = [index for index, cell in enumerate(header_cells) if cell]
    if not kept_columns:
        return {"headers": [], "rows": []}
    headers = [_strip_emphasis(header_cells[index]) for index in kept_columns]

    rows = []
    for line in table_lines[data_start:]:
        cells = [_strip_emphasis(cell) for cell in _split_table_row(line)]
        # Missing trailing cells become ''; cells beyond the header are ignored.
        row = [cells[index] if index < len(cells) else '' for index in kept_columns]
        if any(row):
            rows.append(row)

    return {"headers": headers, "rows": rows}

# Sample content taken from the log (simplified: the huge gap was removed for
# testing). It holds a heading followed by a one-row table.
content = """
## Schritt 1: Angebot (WAS)

| Produkt/Lösung | Beschreibung (1-2 Sätze) | Kernfunktionen | Differenzierung | Primäre Quelle (URL) |
| --- | --- | --- | --- | --- |
| **AgreeDo (Meeting Management Software)** | AgreeDo ist eine webbasierte Anwendung... | **Kernfunktionen:**... | **Differenzierung:**... | `https://agreedo.com/` |
"""

# Parse the sample and print the result as indented JSON for visual inspection.
# ensure_ascii=False keeps the umlauts readable instead of \uXXXX escapes.
result = parse_markdown_table(content)
print(json.dumps(result, indent=2, ensure_ascii=False))