From da00d461e1230236e5de3d35abfb24dad426e6e9 Mon Sep 17 00:00:00 2001 From: Floke Date: Sat, 24 Jan 2026 20:43:33 +0000 Subject: [PATCH] feat(transcription): v0.4.0 with structured json, inline editing and deletion - Backend: Switched prompt to JSON output for structured data - Backend: Added PUT /chunks/{id} endpoint for persistence - Backend: Fixed app.py imports and initialization logic - Frontend: Complete rewrite for Unified View (flattened chunks) - Frontend: Added Inline Editing (Text/Speaker) and Row Deletion - Docs: Updated TRANSCRIPTION_TOOL.md with v0.4 features --- TRANSCRIPTION_TOOL.md | 55 ++-- transcription-tool/backend/app.py | 67 +++- .../backend/services/orchestrator.py | 33 +- .../backend/services/transcription_service.py | 27 +- transcription-tool/frontend/src/App.tsx | 306 ++++++++++++++---- 5 files changed, 389 insertions(+), 99 deletions(-) diff --git a/TRANSCRIPTION_TOOL.md b/TRANSCRIPTION_TOOL.md index 8359dad1..60bbcd10 100644 --- a/TRANSCRIPTION_TOOL.md +++ b/TRANSCRIPTION_TOOL.md @@ -1,7 +1,7 @@ # Meeting Assistant (Transcription Tool) -**Version:** 0.1.0 -**Status:** Beta (Core Functionality) +**Version:** 0.4.0 +**Status:** Beta (Functional with Editing) Der **Meeting Assistant** ist ein lokaler Micro-Service zur Transkription und Analyse von Audio-Dateien (Meetings, Calls, Interviews). Er kombiniert die Datensicherheit einer lokalen Datenhaltung mit der Leistungsfähigkeit von Googles **Gemini 2.0 Flash** Modell für kostengünstige, hochqualitative Speech-to-Text Umwandlung. @@ -15,15 +15,15 @@ Der Service folgt dem "Sidecar"-Pattern im Docker-Stack und ist vollständig in * **Backend:** FastAPI (Python) unter `/tr/api/`. * **Processing:** * **FFmpeg:** Zerlegt große Audio-Dateien (> 2 Stunden) in verarbeitbare 30-Minuten-Chunks. - * **Gemini 2.0 Flash:** Führt die eigentliche Transkription durch (via `google-genai` SDK). - * **SQLite:** Speichert Metadaten, Status und Ergebnisse. -* **Storage:** Lokales Docker-Volume für Audio-Uploads (werden nach Verarbeitung *nicht* gelöscht, um Re-Analysen zu ermöglichen). + * **Gemini 2.0 Flash:** Führt die Transkription durch und liefert strukturiertes JSON (Sprecher, Zeitstempel, Text). + * **SQLite:** Speichert Metadaten, Status und die bearbeitbaren JSON-Segmente. +* **Storage:** Lokales Docker-Volume für Audio-Uploads. ### Datenfluss 1. **Upload:** User lädt MP3 hoch -> Speicherung in `/app/uploads_audio`. -2. **Chunking:** Backend startet Background-Task -> FFmpeg erstellt Segmente in `/app/uploads_audio/chunks/{id}/`. -3. **Transkription:** Loop über alle Chunks -> Upload zu Gemini File API -> Generierung -> Löschung aus Gemini Cloud -> Speicherung in DB. -4. **Assemblierung:** (Geplant für v0.2) Zusammenfügen der Text-Fragmente. +2. **Chunking:** Backend startet Background-Task -> FFmpeg erstellt Segmente. +3. **Transkription:** Loop über Chunks -> Upload zu Gemini -> JSON-Extraktion -> Offset-Berechnung -> DB-Speicherung. +4. **Assemblierung:** Das Frontend lädt alle Chunks eines Meetings und stellt sie als eine durchgehende Liste dar. --- @@ -34,8 +34,10 @@ Basis-URL: `/tr/api` | Methode | Pfad | Beschreibung | | :--- | :--- | :--- | | `GET` | `/meetings` | Liste aller Meetings inkl. Status. | -| `POST` | `/upload` | Upload einer Audio-Datei (`multipart/form-data`). Startet Prozess. | -| `GET` | `/health` | Status-Check. | +| `POST` | `/upload` | Upload einer Audio-Datei (`multipart/form-data`). | +| `GET` | `/meetings/{id}` | Lädt Meeting-Details inklusive aller Text-Chunks (JSON). | +| `DELETE` | `/meetings/{id}` | Löscht ein Meeting inkl. Dateien komplett. | +| `PUT` | `/chunks/{id}` | Aktualisiert den Inhalt (Text/Sprecher) eines spezifischen 30-Min-Chunks. | --- @@ -45,38 +47,39 @@ Datei: `transcripts.db` ### `meetings` * `id`: PK -* `title`: Dateiname (z.B. "Jours_Fixe_Januar.mp3") -* `status`: `UPLOADED` -> `SPLITTING` -> `TRANSCRIBING` -> `COMPLETED` -* `duration_seconds`: Gesamtlänge -* `file_path`: Lokaler Pfad +* `title`, `status`, `duration_seconds`, `file_path`. ### `transcript_chunks` * `id`: PK * `meeting_id`: FK * `chunk_index`: 0, 1, 2... -* `raw_text`: Das rohe Transkript von Gemini. +* `raw_text`: Backup des rohen Gemini-Outputs. +* `json_content`: **JSON** (Editierbar). Struktur: `[{ "time": "MM:SS", "absolute_seconds": 120, "speaker": "A", "text": "..." }]` --- -## 4. Konfiguration & Limits +## 4. Features & Bedienung -* **Max Upload Size:** 500 MB (konfiguriert in Nginx). -* **Chunk Size:** 30 Minuten (1800 Sekunden). -* **Modell:** `gemini-2.0-flash` (Temperatur 0.1 für Faktentreue). -* **Kosten:** Gemini 2.0 Flash ist extrem günstig (Audio-Input), aber beachten Sie die API-Limits bei sehr vielen parallelen Uploads. +### Transkription +* Upload von MP3/WAV Dateien (bis 500MB). +* Automatische Erkennung von Sprechern (Speaker A, Speaker B). + +### Editor-Modus (v0.4) +* **Inline Editing:** Klicken Sie auf einen Sprechernamen oder Text, um ihn direkt zu bearbeiten. Änderungen werden sofort gespeichert. +* **Zeilen Löschen:** Fahren Sie mit der Maus über eine Zeile und klicken Sie auf das rote "X", um irrelevante Teile (z.B. Smalltalk) zu entfernen. +* **Sprecher-Aliasing (Ansicht):** Klicken Sie auf den blauen Sprechernamen ("Speaker A"), um ihn für die *aktuelle Sitzung* umzubenennen (z.B. in "Thomas"). *Hinweis: Dies ändert aktuell nur die Ansicht, nicht die Datenbank für alle Zeilen.* --- ## 5. Roadmap / Next Steps -* **v0.2:** Frontend-Detailansicht mit Volltext-Anzeige. -* **v0.3:** Sprecher-Identifikation (Mapping "Speaker A" -> "Thomas"). -* **v0.4:** "Meeting Notes" Generator (Zusammenfassung via LLM). +* **v0.5: Global Rename:** Button "Alle 'Speaker A' dauerhaft in DB umbenennen". +* **v0.6: AI Analysis:** "Erstelle Meeting Notes" Button basierend auf dem korrigierten Transkript. +* **v0.7:** Export als Word/PDF. --- ## 6. Troubleshooting -* **Upload bricht ab:** Prüfen Sie, ob die Datei > 500MB ist oder der Nginx-Timeout (1800s) greift. -* **Status bleibt bei "TRANSCRIBING":** Prüfen Sie die Docker-Logs (`docker logs transcription-app`). Evtl. ist der API-Key ungültig oder das Quota erschöpft. -* **FFmpeg Fehler:** Stellen Sie sicher, dass das Eingangsformat valide ist (MP3, WAV, M4A werden meist unterstützt). +* **Legacy Format:** Bei Dateien, die vor v0.3 hochgeladen wurden, erscheint ein Warnhinweis. Bitte neu hochladen, um die Editier-Funktionen zu nutzen. +* **Upload bricht ab:** Prüfen Sie die Dateigröße (< 500MB). \ No newline at end of file diff --git a/transcription-tool/backend/app.py b/transcription-tool/backend/app.py index 88f8c006..7b40c0a5 100644 --- a/transcription-tool/backend/app.py +++ b/transcription-tool/backend/app.py @@ -1,6 +1,8 @@ -from fastapi import FastAPI, Depends, HTTPException, UploadFile, File, BackgroundTasks +from fastapi import FastAPI, Depends, HTTPException, UploadFile, File, BackgroundTasks, Body +from fastapi.staticfiles import StaticFiles from fastapi.middleware.cors import CORSMiddleware -from sqlalchemy.orm import Session +from sqlalchemy.orm import Session, joinedload +from typing import List, Dict, Any import os import shutil import uuid @@ -10,6 +12,7 @@ from .config import settings from .database import init_db, get_db, Meeting, TranscriptChunk, AnalysisResult, SessionLocal from .services.orchestrator import process_meeting_task +# Initialize FastAPI App app = FastAPI( title=settings.APP_NAME, version=settings.VERSION, @@ -36,6 +39,33 @@ def health(): def list_meetings(db: Session = Depends(get_db)): return db.query(Meeting).order_by(Meeting.created_at.desc()).all() +@app.get("/api/meetings/{meeting_id}") +def get_meeting(meeting_id: int, db: Session = Depends(get_db)): + meeting = db.query(Meeting).options( + joinedload(Meeting.chunks) + ).filter(Meeting.id == meeting_id).first() + + if not meeting: + raise HTTPException(404, detail="Meeting not found") + + # Sort chunks by index + meeting.chunks.sort(key=lambda x: x.chunk_index) + + return meeting + +@app.put("/api/chunks/{chunk_id}") +def update_chunk(chunk_id: int, payload: Dict[str, Any] = Body(...), db: Session = Depends(get_db)): + chunk = db.query(TranscriptChunk).filter(TranscriptChunk.id == chunk_id).first() + if not chunk: + raise HTTPException(404, detail="Chunk not found") + + # Update JSON content (e.g. after editing/deleting lines) + if "json_content" in payload: + chunk.json_content = payload["json_content"] + db.commit() + + return {"status": "updated"} + @app.post("/api/upload") async def upload_audio( background_tasks: BackgroundTasks, @@ -67,6 +97,39 @@ async def upload_audio( return meeting +@app.delete("/api/meetings/{meeting_id}") +def delete_meeting(meeting_id: int, db: Session = Depends(get_db)): + meeting = db.query(Meeting).filter(Meeting.id == meeting_id).first() + if not meeting: + raise HTTPException(404, detail="Meeting not found") + + # 1. Delete Files + try: + if os.path.exists(meeting.file_path): + os.remove(meeting.file_path) + + # Delete chunks dir + chunk_dir = os.path.join(settings.UPLOAD_DIR, "chunks", str(meeting_id)) + if os.path.exists(chunk_dir): + shutil.rmtree(chunk_dir) + except Exception as e: + print(f"Error deleting files: {e}") + + # 2. Delete DB Entry (Cascade deletes chunks/analyses) + db.delete(meeting) + db.commit() + return {"status": "deleted"} + +# Serve Frontend +# This must be the last route definition to avoid catching API routes +static_path = "/frontend_static" +if not os.path.exists(static_path): + # Fallback for local development if not in Docker + static_path = os.path.join(os.path.dirname(__file__), "../frontend/dist") + +if os.path.exists(static_path): + app.mount("/", StaticFiles(directory=static_path, html=True), name="static") + if __name__ == "__main__": import uvicorn uvicorn.run("backend.app:app", host="0.0.0.0", port=8001, reload=True) diff --git a/transcription-tool/backend/services/orchestrator.py b/transcription-tool/backend/services/orchestrator.py index bca23b72..4806febe 100644 --- a/transcription-tool/backend/services/orchestrator.py +++ b/transcription-tool/backend/services/orchestrator.py @@ -1,4 +1,5 @@ import logging +import json from sqlalchemy.orm import Session from .ffmpeg_service import FFmpegService from .transcription_service import TranscriptionService @@ -7,6 +8,17 @@ from ..config import settings logger = logging.getLogger(__name__) +def parse_time_to_seconds(time_str): + try: + parts = time_str.split(':') + if len(parts) == 2: # MM:SS + return int(parts[0]) * 60 + int(parts[1]) + elif len(parts) == 3: # HH:MM:SS + return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2]) + except: + return 0 + return 0 + def process_meeting_task(meeting_id: int, db_session_factory): db = db_session_factory() meeting = db.query(Meeting).filter(Meeting.id == meeting_id).first() @@ -35,11 +47,30 @@ def process_meeting_task(meeting_id: int, db_session_factory): result = transcriber.transcribe_chunk(chunk_path, offset) + # Parse JSON and Adjust Timestamps + json_data = [] + try: + raw_json = json.loads(result["raw_text"]) + if isinstance(raw_json, list): + for entry in raw_json: + seconds = parse_time_to_seconds(entry.get("time", "00:00")) + absolute_seconds = seconds + offset + entry["absolute_seconds"] = absolute_seconds + + h = int(absolute_seconds // 3600) + m = int((absolute_seconds % 3600) // 60) + s = int(absolute_seconds % 60) + entry["display_time"] = f"{h:02}:{m:02}:{s:02}" + json_data.append(entry) + except Exception as e: + logger.error(f"JSON Parsing failed for chunk {i}: {e}") + # Save chunk result db_chunk = TranscriptChunk( meeting_id=meeting.id, chunk_index=i, - raw_text=result["raw_text"] + raw_text=result["raw_text"], + json_content=json_data ) db.add(db_chunk) all_text.append(result["raw_text"]) diff --git a/transcription-tool/backend/services/transcription_service.py b/transcription-tool/backend/services/transcription_service.py index 4f747e5c..f8c6e375 100644 --- a/transcription-tool/backend/services/transcription_service.py +++ b/transcription-tool/backend/services/transcription_service.py @@ -19,8 +19,8 @@ class TranscriptionService: """ logger.info(f"Uploading chunk {file_path} to Gemini...") - # 1. Upload file - media_file = self.client.files.upload(path=file_path) + # 1. Upload file (positional argument) + media_file = self.client.files.upload(file=file_path) # 2. Wait for processing (usually fast for audio) while media_file.state == "PROCESSING": @@ -32,12 +32,18 @@ class TranscriptionService: # 3. Transcribe with Diarization and Timestamps prompt = """ - Transkribiere dieses Audio wortgetreu. - Identifiziere die Sprecher (Sprecher A, Sprecher B, etc.). - Gib das Ergebnis als strukturierte Liste mit Timestamps aus. - Wichtig: Das Audio ist ein Teil eines größeren Gesprächs. - Antworte NUR mit dem Transkript im Format: - [MM:SS] Sprecher X: Text + Transkribiere dieses Audio wortgetreu. + Identifiziere die Sprecher (Speaker A, Speaker B, etc.). + + Gib das Ergebnis als JSON-Liste zurück. + Format: + [ + { + "time": "MM:SS", + "speaker": "Speaker A", + "text": "..." + } + ] """ logger.info(f"Generating transcription for {file_path}...") @@ -45,7 +51,8 @@ class TranscriptionService: model="gemini-2.0-flash", contents=[media_file, prompt], config=types.GenerateContentConfig( - temperature=0.1, # Low temp for accuracy + temperature=0.1, + response_mime_type="application/json" ) ) @@ -53,6 +60,6 @@ class TranscriptionService: self.client.files.delete(name=media_file.name) return { - "raw_text": response.text, + "raw_text": response.text, # This is now a JSON string "offset": offset_seconds } diff --git a/transcription-tool/frontend/src/App.tsx b/transcription-tool/frontend/src/App.tsx index 3f412fa9..c3657413 100644 --- a/transcription-tool/frontend/src/App.tsx +++ b/transcription-tool/frontend/src/App.tsx @@ -1,10 +1,25 @@ -import { useState, useEffect } from 'react' +import React, { useState, useEffect } from 'react' import axios from 'axios' -import { Upload, Mic, FileText, Clock, CheckCircle2, Loader2, AlertCircle, ChevronRight } from 'lucide-react' +import { Upload, FileText, Clock, CheckCircle2, Loader2, AlertCircle, Trash2, ArrowLeft, Copy, User, X } from 'lucide-react' import clsx from 'clsx' const API_BASE = '/tr/api' +interface TranscriptMessage { + time: string + display_time: string + absolute_seconds: number + speaker: string + text: string +} + +interface Chunk { + id: number + chunk_index: number + raw_text: string + json_content: TranscriptMessage[] | null +} + interface Meeting { id: number title: string @@ -12,13 +27,24 @@ interface Meeting { date_recorded: string duration_seconds?: number created_at: string + chunks?: Chunk[] } export default function App() { + const [view, setView] = useState<'list' | 'detail'>('list') + const [selectedId, setSelectedId] = useState(null) + const [meetings, setMeetings] = useState([]) const [uploading, setUploading] = useState(false) const [error, setError] = useState(null) + const [detailMeeting, setDetailMeeting] = useState(null) + const [loadingDetail, setLoadingDetail] = useState(false) + + // Editing State + const [editingRow, setEditingRow] = useState<{chunkId: number, idx: number, field: 'speaker' | 'text'} | null>(null) + const [editValue, setEditValue] = useState("") + const fetchMeetings = async () => { try { const res = await axios.get(`${API_BASE}/meetings`) @@ -28,94 +54,254 @@ export default function App() { } } + const fetchDetail = async (id: number) => { + setLoadingDetail(true) + try { + const res = await axios.get(`${API_BASE}/meetings/${id}`) + setDetailMeeting(res.data) + } catch (e) { + setError("Could not load details") + } finally { + setLoadingDetail(false) + } + } + useEffect(() => { - fetchMeetings() - const interval = setInterval(fetchMeetings, 5000) // Poll every 5s - return () => clearInterval(interval) - }, []) + if (view === 'list') { + fetchMeetings() + const interval = setInterval(fetchMeetings, 5000) + return () => clearInterval(interval) + } else if (view === 'detail' && selectedId) { + fetchDetail(selectedId) + } + }, [view, selectedId]) const handleUpload = async (e: React.ChangeEvent) => { const file = e.target.files?.[0] if (!file) return - setUploading(true) setError(null) const formData = new FormData() formData.append('file', file) - try { await axios.post(`${API_BASE}/upload`, formData) + fetchMeetings() + } catch (e) { setError("Upload failed.") } finally { setUploading(false) } + } + + const handleDeleteMeeting = async (e: React.MouseEvent, id: number) => { + e.stopPropagation() + if (!confirm("Delete meeting permanently?")) return + try { + await axios.delete(`${API_BASE}/meetings/${id}`) + if (selectedId === id) { setView('list'); setSelectedId(null); } fetchMeetings() + } catch (e) { alert("Delete failed") } + } + + // --- EDITING LOGIC --- + + const saveChunkUpdate = async (chunkId: number, newJson: TranscriptMessage[]) => { + try { + await axios.put(`${API_BASE}/chunks/${chunkId}`, { json_content: newJson }) } catch (e) { - setError("Upload failed. Make sure the file is not too large.") - } finally { - setUploading(false) + console.error("Failed to save chunk", e) + alert("Failed to save changes") } } + const handleUpdateRow = async (chunkId: number, idx: number, field: 'speaker' | 'text', value: string) => { + if (!detailMeeting) return + + const newChunks = detailMeeting.chunks!.map(c => { + if (c.id === chunkId && c.json_content) { + const newContent = [...c.json_content] + newContent[idx] = { ...newContent[idx], [field]: value } + return { ...c, json_content: newContent } + } + return c + }) + + setDetailMeeting({ ...detailMeeting, chunks: newChunks }) + + const updatedChunk = newChunks.find(c => c.id === chunkId) + if (updatedChunk?.json_content) { + await saveChunkUpdate(chunkId, updatedChunk.json_content) + } + setEditingRow(null) + } + + const handleDeleteRow = async (chunkId: number, idx: number) => { + if (!confirm("Remove this line?")) return + if (!detailMeeting) return + + const newChunks = detailMeeting.chunks!.map(c => { + if (c.id === chunkId && c.json_content) { + const newContent = c.json_content.filter((_, i) => i !== idx) + return { ...c, json_content: newContent } + } + return c + }) + + setDetailMeeting({ ...detailMeeting, chunks: newChunks }) + const updatedChunk = newChunks.find(c => c.id === chunkId) + if (updatedChunk?.json_content) { + await saveChunkUpdate(chunkId, updatedChunk.json_content) + } + } + + // --- RENDER --- + + if (view === 'detail') { + // Flatten for rendering but keep ref to chunkId + const flatMessages = detailMeeting?.chunks?.flatMap(c => + (c.json_content || []).map((msg, idx) => ({ ...msg, _chunkId: c.id, _idx: idx })) + ).sort((a,b) => a.absolute_seconds - b.absolute_seconds) || [] + + return ( +
+
+ + + {loadingDetail || !detailMeeting ? ( +
+ ) : ( + <> +
+
+

{detailMeeting.title}

+
+ {new Date(detailMeeting.created_at).toLocaleString()} + {detailMeeting.status} +
+
+
+ + +
+
+ +
+ {flatMessages.length > 0 ? ( +
+ {flatMessages.map((msg, uniqueIdx) => { + const isEditingSpeaker = editingRow?.chunkId === msg._chunkId && editingRow?.idx === msg._idx && editingRow?.field === 'speaker'; + const isEditingText = editingRow?.chunkId === msg._chunkId && editingRow?.idx === msg._idx && editingRow?.field === 'text'; + + return ( +
+ {/* Time & Delete */} +
+ {msg.display_time || "00:00"} + +
+ +
+ {/* Speaker */} +
+ {isEditingSpeaker ? ( + setEditValue(e.target.value)} + onBlur={() => handleUpdateRow(msg._chunkId, msg._idx, 'speaker', editValue)} + onKeyDown={e => e.key === 'Enter' && handleUpdateRow(msg._chunkId, msg._idx, 'speaker', editValue)} + /> + ) : ( +
{ setEditingRow({chunkId: msg._chunkId, idx: msg._idx, field: 'speaker'}); setEditValue(msg.speaker); }} + title="Click to rename THIS speaker occurrence" + > + + {msg.speaker} +
+ )} +
+ + {/* Text */} +
+ {isEditingText ? ( +