import json
import re

# Emphasis runs such as *text* or **text**; group 1 is the text between the markers.
_EMPHASIS = re.compile(r'\*+([^\*]+)\*+')

# A delimiter row may consist only of pipes, dashes, alignment colons and whitespace.
_DELIMITER_ROW = re.compile(r'[\s|:-]+')


def _split_cells(line):
    """Split one table line into raw cell strings on unescaped pipes.

    A backslash-escaped pipe (``\\|``) is kept inside the cell as a literal
    ``|``; every other backslash sequence is passed through untouched. The
    empty fragments produced by the opening and closing border pipes are
    dropped, so ``| a | b |`` yields ``[' a ', ' b ']``.
    """
    cells = []
    current = []
    i = 0
    length = len(line)
    while i < length:
        ch = line[i]
        if ch == '\\' and i + 1 < length:
            nxt = line[i + 1]
            # Only "\|" is unescaped; other escapes stay verbatim so cell
            # text is otherwise unchanged.
            current.append('|' if nxt == '|' else ch + nxt)
            i += 2
            continue
        if ch == '|':
            cells.append(''.join(current))
            current = []
        else:
            current.append(ch)
        i += 1
    cells.append(''.join(current))

    # Border pipes leave an empty fragment at each end. The last fragment is
    # non-empty when the final pipe was escaped, in which case it is content.
    if cells and cells[0] == '':
        cells = cells[1:]
    if cells and cells[-1] == '':
        cells = cells[:-1]
    return cells


def _clean_cell(text):
    """Trim a cell and remove surrounding ``*``/``**`` emphasis markers."""
    return _EMPHASIS.sub(r'\1', text.strip()).strip()


def _parse_cells(line):
    """Return the cleaned cell values of one table line."""
    return [_clean_cell(cell) for cell in _split_cells(line)]


def _is_delimiter_row(line):
    """Tell whether ``line`` is a header/body delimiter such as ``| --- | :-: |``.

    The line must contain at least one ``---`` run and nothing except pipes,
    dashes, colons and whitespace. Checking for *any* foreign character
    (not just ASCII letters/digits) keeps rows like ``| --- | ä |`` as data.
    """
    return '---' in line and _DELIMITER_ROW.fullmatch(line) is not None


def parse_markdown_table(markdown_text: str) -> dict:
    """Extract a pipe table from Markdown text.

    Every line that starts and ends with ``|`` (after trimming) is treated as
    part of a single table; other lines are ignored. The header is the line
    directly before the first delimiter row, or the first table line when no
    delimiter row exists. Lines before the header are discarded.

    Args:
        markdown_text: Markdown source that may contain a pipe table.

    Returns:
        A dict ``{"headers": [...], "rows": [[...], ...]}``. Header and cell
        texts are stripped and have ``*``/``**`` emphasis markers removed.
        Empty header cells are kept (as ``''``) so that row values stay
        aligned with their columns. Each row is padded with ``''`` or
        truncated to the header width; rows whose cells are all empty are
        skipped. ``{"headers": [], "rows": []}`` is returned when there is no
        table, the table starts with a delimiter row, or the header has no
        non-empty cell.
    """
    stripped_lines = (line.strip() for line in markdown_text.strip().split('\n'))
    table_lines = [
        line for line in stripped_lines
        if line.startswith('|') and line.endswith('|')
    ]
    if not table_lines:
        return {"headers": [], "rows": []}

    separator_index = next(
        (i for i, line in enumerate(table_lines) if _is_delimiter_row(line)),
        -1,
    )
    if separator_index == 0:
        # A delimiter row with no header line above it: nothing to name columns.
        return {"headers": [], "rows": []}
    if separator_index == -1:
        header_line, data_start = table_lines[0], 1
    else:
        header_line = table_lines[separator_index - 1]
        data_start = separator_index + 1

    headers = _parse_cells(header_line)
    if not any(headers):
        return {"headers": [], "rows": []}

    width = len(headers)
    rows = []
    for line in table_lines[data_start:]:
        cells = _parse_cells(line)
        # Pad short rows and truncate long ones to the header width.
        cells = (cells + [''] * width)[:width]
        if any(cells):
            rows.append(cells)

    return {"headers": headers, "rows": rows}


# Content from the log (simplified/cleaned of the huge gap for testing)
content = """
## Schritt 1: Angebot (WAS)
| Produkt/Lösung | Beschreibung (1-2 Sätze) | Kernfunktionen | Differenzierung | Primäre Quelle (URL) |
| --- | --- | --- | --- | --- |
| **AgreeDo (Meeting Management Software)** | AgreeDo ist eine webbasierte Anwendung... | **Kernfunktionen:**... | **Differenzierung:**... | `https://agreedo.com/` |
"""

result = parse_markdown_table(content)
print(json.dumps(result, indent=2))