feat(transcription): add meeting assistant micro-service v0.1.0

- Added FastAPI backend with FFmpeg and Gemini 2.0 integration
- Added React frontend with upload and meeting list
- Integrated into main docker-compose stack and dashboard
2026-01-24 16:34:01 +00:00
parent b16babb032
commit 0858df6f25
25 changed files with 721 additions and 2 deletions
--- a/transcription-tool/Dockerfile
+++ b/transcription-tool/Dockerfile
@@ -0,0 +1,37 @@
+# --- STAGE 1: Build Frontend ---
+FROM node:20-slim AS frontend-builder
+WORKDIR /build
+# Copy the manifests first so the dependency layer is only rebuilt when they change.
+COPY frontend/package*.json ./
+RUN npm install
+COPY frontend/ ./
+RUN npm run build
+
+# --- STAGE 2: Backend & Runtime ---
+FROM python:3.11-slim
+WORKDIR /app
+
+# System dependencies (FFmpeg is essential: the backend shells out to ffmpeg/ffprobe).
+# --no-install-recommends keeps optional recommended packages out of the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ffmpeg \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy Requirements & Install
+COPY backend/requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy Built Frontend from Stage 1
+COPY --from=frontend-builder /build/dist /frontend_static
+
+# Copy Backend Source
+COPY backend ./backend
+
+# Environment Variables
+# PYTHONPATH=/app makes the "backend" package importable for uvicorn below.
+ENV PYTHONPATH=/app
+ENV PYTHONUNBUFFERED=1
+
+# Expose Port
+EXPOSE 8001
+
+# Start FastAPI
+# NOTE(review): --reload enables uvicorn's file watcher and is meant for development;
+# confirm whether the compose stack relies on it before shipping this image as-is.
+CMD ["uvicorn", "backend.app:app", "--host", "0.0.0.0", "--port", "8001", "--reload"]
--- a/transcription-tool/backend/init.py
+++ b/transcription-tool/backend/init.py
--- a/transcription-tool/backend/app.py
+++ b/transcription-tool/backend/app.py
@@ -0,0 +1,72 @@
+from fastapi import FastAPI, Depends, HTTPException, UploadFile, File, BackgroundTasks
+from fastapi.middleware.cors import CORSMiddleware
+from sqlalchemy.orm import Session
+import os
+import shutil
+import uuid
+from datetime import datetime
+
+from .config import settings
+from .database import init_db, get_db, Meeting, TranscriptChunk, AnalysisResult, SessionLocal
+from .services.orchestrator import process_meeting_task
+
+app = FastAPI(
+    title=settings.APP_NAME,
+    version=settings.VERSION,
+    root_path="/tr"
+)
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+@app.on_event("startup")
+def startup_event():
+    """Create any missing database tables when the application starts."""
+    init_db()
+
+@app.get("/api/health")
+def health():
+    """Liveness probe: report a static "ok" status together with the app version."""
+    return {"status": "ok", "version": settings.VERSION}
+
+@app.get("/api/meetings")
+def list_meetings(db: Session = Depends(get_db)):
+    """Return all meetings, newest first (ordered by ``created_at`` descending)."""
+    return db.query(Meeting).order_by(Meeting.created_at.desc()).all()
+
+@app.post("/api/upload")
+async def upload_audio(
+    background_tasks: BackgroundTasks,
+    file: UploadFile = File(...),
+    db: Session = Depends(get_db)
+):
+    # 1. Save File
+    file_id = str(uuid.uuid4())
+    ext = os.path.splitext(file.filename)[1]
+    filename = f"{file_id}{ext}"
+    file_path = os.path.join(settings.UPLOAD_DIR, filename)
+    
+    with open(file_path, "wb") as buffer:
+        shutil.copyfileobj(file.file, buffer)
+    
+    # 2. Create DB Entry
+    meeting = Meeting(
+        title=file.filename,
+        filename=filename,
+        file_path=file_path,
+        status="UPLOADED"
+    )
+    db.add(meeting)
+    db.commit()
+    db.refresh(meeting)
+    
+    # 3. Trigger Processing in Background
+    background_tasks.add_task(process_meeting_task, meeting.id, SessionLocal)
+    
+    return meeting
+
+if __name__ == "__main__":
+    # Allows starting the service directly; runs uvicorn on port 8001 with auto-reload.
+    import uvicorn
+    uvicorn.run("backend.app:app", host="0.0.0.0", port=8001, reload=True)
--- a/transcription-tool/backend/config.py
+++ b/transcription-tool/backend/config.py
@@ -0,0 +1,27 @@
+import os
+from pydantic_settings import BaseSettings
+from typing import Optional
+
+class Settings(BaseSettings):
+    """Central configuration of the transcription service.
+
+    The literals below are defaults; as a pydantic-settings ``BaseSettings``
+    subclass the values can be supplied through the environment or the ``.env``
+    file named in ``Config``.
+    """
+    APP_NAME: str = "Transcription Engine"
+    VERSION: str = "0.1.0"
+    DATABASE_URL: str = "sqlite:////app/transcripts.db"
+    UPLOAD_DIR: str = "/app/uploads_audio"
+    GEMINI_API_KEY: Optional[str] = None
+    CHUNK_DURATION_SEC: int = 1800  # 30 Minutes
+
+    class Config:
+        env_file = ".env"
+
+# Module-level singleton imported by the rest of the backend.
+settings = Settings()
+
+# Auto-load API Key
+# Fallback: when no key was configured, read it from a key file if that file exists.
+if not settings.GEMINI_API_KEY:
+    key_path = "/app/gemini_api_key.txt"
+    if os.path.exists(key_path):
+        with open(key_path, "r") as f:
+            settings.GEMINI_API_KEY = f.read().strip()
+
+# Ensure Upload Dir exists
+# Runs at import time, so the upload and chunk directories exist before first use.
+os.makedirs(settings.UPLOAD_DIR, exist_ok=True)
+os.makedirs(os.path.join(settings.UPLOAD_DIR, "chunks"), exist_ok=True)
--- a/transcription-tool/backend/database.py
+++ b/transcription-tool/backend/database.py
@@ -0,0 +1,63 @@
+from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, ForeignKey, Float, JSON
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import sessionmaker, relationship
+from datetime import datetime
+from .config import settings
+
+engine = create_engine(settings.DATABASE_URL, connect_args={"check_same_thread": False})
+SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+Base = declarative_base()
+
+class Meeting(Base):
+    """ORM model for one uploaded recording (table ``meetings``).
+
+    Owns its transcript chunks and analysis results: both relationships cascade
+    deletes, so removing a meeting removes its dependent rows as well.
+    """
+    __tablename__ = "meetings"
+
+    id = Column(Integer, primary_key=True, index=True)
+    title = Column(String, index=True)
+    filename = Column(String)
+    file_path = Column(String)
+    date_recorded = Column(DateTime, default=datetime.utcnow)
+    
+    duration_seconds = Column(Float, nullable=True)
+    status = Column(String, default="UPLOADED") # UPLOADED, SPLITTING, TRANSCRIBING, ANALYZING, COMPLETED, ERROR
+    
+    participants = Column(JSON, nullable=True) # List of names
+    summary = Column(Text, nullable=True)
+    
+    # Timestamps are naive UTC values (datetime.utcnow); updated_at is refreshed on every UPDATE.
+    created_at = Column(DateTime, default=datetime.utcnow)
+    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
+
+    chunks = relationship("TranscriptChunk", back_populates="meeting", cascade="all, delete-orphan")
+    analysis_results = relationship("AnalysisResult", back_populates="meeting", cascade="all, delete-orphan")
+
+class TranscriptChunk(Base):
+    """ORM model for the transcript of one audio chunk (table ``transcript_chunks``)."""
+    __tablename__ = "transcript_chunks"
+
+    id = Column(Integer, primary_key=True, index=True)
+    meeting_id = Column(Integer, ForeignKey("meetings.id"))
+    chunk_index = Column(Integer)  # zero-based position of the chunk within the meeting
+    
+    raw_text = Column(Text)
+    json_content = Column(JSON, nullable=True) # Structured timestamps/speakers
+    
+    meeting = relationship("Meeting", back_populates="chunks")
+
+class AnalysisResult(Base):
+    """ORM model for one analysis output of a meeting (table ``analysis_results``)."""
+    __tablename__ = "analysis_results"
+
+    id = Column(Integer, primary_key=True, index=True)
+    meeting_id = Column(Integer, ForeignKey("meetings.id"))
+    prompt_key = Column(String) # summary, tasks, notes
+    result_text = Column(Text)
+    
+    created_at = Column(DateTime, default=datetime.utcnow)
+    meeting = relationship("Meeting", back_populates="analysis_results")
+
+def init_db():
+    """Create all tables declared on ``Base`` that do not exist yet."""
+    Base.metadata.create_all(bind=engine)
+
+def get_db():
+    """Yield a new database session and close it once the consumer is finished.
+
+    Written as a generator so it can be used as a FastAPI dependency; the
+    ``finally`` clause guarantees the session is closed even if the consumer raises.
+    """
+    db = SessionLocal()
+    try:
+        yield db
+    finally:
+        db.close()
--- a/transcription-tool/backend/lib/init.py
+++ b/transcription-tool/backend/lib/init.py
--- a/transcription-tool/backend/requirements.txt
+++ b/transcription-tool/backend/requirements.txt
@@ -0,0 +1,10 @@
+fastapi
+uvicorn
+sqlalchemy
+pydantic
+pydantic-settings
+python-multipart
+requests
+google-genai
+python-dotenv
+aiofiles
--- a/transcription-tool/backend/services/init.py
+++ b/transcription-tool/backend/services/init.py
--- a/transcription-tool/backend/services/ffmpeg_service.py
+++ b/transcription-tool/backend/services/ffmpeg_service.py
@@ -0,0 +1,49 @@
+import subprocess
+import os
+import logging
+from ..config import settings
+
+logger = logging.getLogger(__name__)
+
+class FFmpegService:
+    def split_audio(self, input_path: str, meeting_id: int) -> list:
+        """
+        Splits audio into 30min chunks using ffmpeg segment muxer.
+        Returns a list of paths to the created chunks.
+        """
+        output_dir = os.path.join(settings.UPLOAD_DIR, "chunks", str(meeting_id))
+        os.makedirs(output_dir, exist_ok=True)
+        
+        output_pattern = os.path.join(output_dir, "chunk_%03d.mp3")
+        
+        # ffmpeg -i input.mp3 -f segment -segment_time 1800 -c copy chunk_%03d.mp3
+        cmd = [
+            "ffmpeg", "-i", input_path,
+            "-f", "segment",
+            "-segment_time", str(settings.CHUNK_DURATION_SEC),
+            "-c", "copy",
+            output_pattern
+        ]
+        
+        logger.info(f"Splitting {input_path} into segments...")
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        
+        if result.returncode != 0:
+            logger.error(f"FFmpeg Error: {result.stderr}")
+            raise Exception("Failed to split audio file.")
+            
+        chunks = sorted([os.path.join(output_dir, f) for f in os.listdir(output_dir) if f.endswith(".mp3")])
+        logger.info(f"Created {len(chunks)} chunks.")
+        return chunks
+
+    def get_duration(self, input_path: str) -> float:
+        """Gets duration of audio file in seconds."""
+        cmd = [
+            "ffprobe", "-v", "error", "-show_entries", "format=duration",
+            "-of", "default=noprint_wrappers=1:nokey=1", input_path
+        ]
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        try:
+            return float(result.stdout.strip())
+        except:
+            return 0.0
--- a/transcription-tool/backend/services/orchestrator.py
+++ b/transcription-tool/backend/services/orchestrator.py
@@ -0,0 +1,60 @@
+import logging
+from sqlalchemy.orm import Session
+from .ffmpeg_service import FFmpegService
+from .transcription_service import TranscriptionService
+from ..database import Meeting, TranscriptChunk
+from ..config import settings
+
+logger = logging.getLogger(__name__)
+
+def process_meeting_task(meeting_id: int, db_session_factory):
+    db = db_session_factory()
+    meeting = db.query(Meeting).filter(Meeting.id == meeting_id).first()
+    if not meeting:
+        return
+
+    try:
+        ffmpeg = FFmpegService()
+        transcriber = TranscriptionService()
+
+        # Phase 1: Split
+        meeting.status = "SPLITTING"
+        db.commit()
+        
+        meeting.duration_seconds = ffmpeg.get_duration(meeting.file_path)
+        chunks = ffmpeg.split_audio(meeting.file_path, meeting.id)
+        
+        # Phase 2: Transcribe
+        meeting.status = "TRANSCRIBING"
+        db.commit()
+        
+        all_text = []
+        for i, chunk_path in enumerate(chunks):
+            offset = i * settings.CHUNK_DURATION_SEC
+            logger.info(f"Processing chunk {i+1}/{len(chunks)} with offset {offset}s")
+            
+            result = transcriber.transcribe_chunk(chunk_path, offset)
+            
+            # Save chunk result
+            db_chunk = TranscriptChunk(
+                meeting_id=meeting.id,
+                chunk_index=i,
+                raw_text=result["raw_text"]
+            )
+            db.add(db_chunk)
+            all_text.append(result["raw_text"])
+            db.commit()
+            
+        # Phase 3: Finalize
+        meeting.status = "COMPLETED"
+        # Combine summary (first attempt - can be refined later with separate LLM call)
+        # meeting.summary = ... 
+        db.commit()
+        logger.info(f"Meeting {meeting.id} processing completed.")
+
+    except Exception as e:
+        logger.error(f"Error processing meeting {meeting_id}: {e}", exc_info=True)
+        meeting.status = "ERROR"
+        db.commit()
+    finally:
+        db.close()
--- a/transcription-tool/backend/services/transcription_service.py
+++ b/transcription-tool/backend/services/transcription_service.py
@@ -0,0 +1,58 @@
+import os
+import time
+import logging
+from google import genai
+from google.genai import types
+from ..config import settings
+
+logger = logging.getLogger(__name__)
+
+class TranscriptionService:
+    def __init__(self):
+        if not settings.GEMINI_API_KEY:
+            raise Exception("Gemini API Key missing.")
+        self.client = genai.Client(api_key=settings.GEMINI_API_KEY)
+
+    def transcribe_chunk(self, file_path: str, offset_seconds: int = 0) -> dict:
+        """
+        Uploads a chunk to Gemini and returns the transcription with timestamps.
+        """
+        logger.info(f"Uploading chunk {file_path} to Gemini...")
+        
+        # 1. Upload file
+        media_file = self.client.files.upload(path=file_path)
+        
+        # 2. Wait for processing (usually fast for audio)
+        while media_file.state == "PROCESSING":
+            time.sleep(2)
+            media_file = self.client.files.get(name=media_file.name)
+            
+        if media_file.state == "FAILED":
+            raise Exception("File processing failed at Gemini.")
+
+        # 3. Transcribe with Diarization and Timestamps
+        prompt = """
+        Transkribiere dieses Audio wortgetreu. 
+        Identifiziere die Sprecher (Sprecher A, Sprecher B, etc.).
+        Gib das Ergebnis als strukturierte Liste mit Timestamps aus.
+        Wichtig: Das Audio ist ein Teil eines größeren Gesprächs. 
+        Antworte NUR mit dem Transkript im Format:
+        [MM:SS] Sprecher X: Text
+        """
+        
+        logger.info(f"Generating transcription for {file_path}...")
+        response = self.client.models.generate_content(
+            model="gemini-2.0-flash",
+            contents=[media_file, prompt],
+            config=types.GenerateContentConfig(
+                temperature=0.1, # Low temp for accuracy
+            )
+        )
+
+        # Cleanup: Delete file from Gemini storage
+        self.client.files.delete(name=media_file.name)
+
+        return {
+            "raw_text": response.text,
+            "offset": offset_seconds
+        }
--- a/transcription-tool/frontend/index.html
+++ b/transcription-tool/frontend/index.html
@@ -0,0 +1,12 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>Meeting Assistant</title>
+  </head>
+  <body class="bg-slate-50 dark:bg-slate-950">
+    <!-- Mount point: src/main.tsx renders the React application into this node. -->
+    <div id="root"></div>
+    <script type="module" src="/src/main.tsx"></script>
+  </body>
+</html>
--- a/transcription-tool/frontend/package.json
+++ b/transcription-tool/frontend/package.json
@@ -0,0 +1,27 @@
+{
+  "name": "transcription-frontend",
+  "private": true,
+  "version": "0.1.0",
+  "type": "module",
+  "scripts": {
+    "dev": "vite",
+    "build": "tsc && vite build",
+    "preview": "vite preview"
+  },
+  "dependencies": {
+    "axios": "^1.6.2",
+    "clsx": "^2.0.0",
+    "lucide-react": "^0.294.0",
+    "react": "^18.2.0",
+    "react-dom": "^18.2.0",
+    "tailwind-merge": "^2.1.0",
+    "@types/react": "^18.2.37",
+    "@types/react-dom": "^18.2.15",
+    "@vitejs/plugin-react": "^4.2.0",
+    "autoprefixer": "^10.4.16",
+    "postcss": "^8.4.31",
+    "tailwindcss": "^3.3.5",
+    "typescript": "^5.2.2",
+    "vite": "^5.0.0"
+  }
+}
--- a/transcription-tool/frontend/postcss.config.js
+++ b/transcription-tool/frontend/postcss.config.js
@@ -0,0 +1,6 @@
+// PostCSS plugins applied to the stylesheets: Tailwind CSS, then Autoprefixer.
+export default {
+  plugins: {
+    tailwindcss: {},
+    autoprefixer: {},
+  },
+}
--- a/transcription-tool/frontend/src/App.tsx
+++ b/transcription-tool/frontend/src/App.tsx
@@ -0,0 +1,121 @@
+import { useState, useEffect } from 'react'
+import axios from 'axios'
+import { Upload, Mic, FileText, Clock, CheckCircle2, Loader2, AlertCircle, ChevronRight } from 'lucide-react'
+import clsx from 'clsx'
+
+// Prefix of every backend request issued by this component.
+const API_BASE = '/tr/api'
+
+/** A meeting as returned by `GET /meetings`. */
+interface Meeting {
+  id: number
+  title: string
+  status: string
+  date_recorded: string
+  // The backend column is nullable, so the JSON value is `null` until processing has set it.
+  duration_seconds?: number | null
+  created_at: string
+}
+
+/**
+ * Root component: shows the list of meetings (refreshed every 5 seconds) and an
+ * upload button that sends a new audio file to the backend.
+ */
+export default function App() {
+  const [meetings, setMeetings] = useState<Meeting[]>([])
+  const [uploading, setUploading] = useState(false)
+  const [error, setError] = useState<string | null>(null)
+
+  // Loads the meeting list; failures are only logged so polling keeps running.
+  const fetchMeetings = async () => {
+    try {
+      const res = await axios.get(`${API_BASE}/meetings`)
+      setMeetings(res.data)
+    } catch (e) {
+      console.error("Failed to fetch meetings", e)
+    }
+  }
+
+  useEffect(() => {
+    fetchMeetings()
+    const interval = setInterval(fetchMeetings, 5000) // Poll every 5s
+    return () => clearInterval(interval)
+  }, [])
+
+  // Uploads the selected file as multipart form data and refreshes the list.
+  const handleUpload = async (e: React.ChangeEvent<HTMLInputElement>) => {
+    const input = e.target
+    const file = input.files?.[0]
+    if (!file) return
+
+    setUploading(true)
+    setError(null)
+    const formData = new FormData()
+    formData.append('file', file)
+
+    try {
+      await axios.post(`${API_BASE}/upload`, formData)
+      await fetchMeetings()
+    } catch (e) {
+      console.error("Upload failed", e)
+      setError("Upload failed. Make sure the file is not too large.")
+    } finally {
+      // Reset the input, otherwise selecting the same file again fires no change event.
+      input.value = ''
+      setUploading(false)
+    }
+  }
+
+  return (
+    <div className="min-h-screen bg-slate-50 dark:bg-slate-950 text-slate-900 dark:text-slate-200">
+      <div className="max-w-5xl mx-auto px-4 py-12">
+        <header className="flex items-center justify-between mb-12">
+          <div>
+            <h1 className="text-3xl font-bold tracking-tight">Meeting Assistant</h1>
+            <p className="text-slate-500 mt-2">Transcribe and analyze your meetings with Gemini 2.0</p>
+          </div>
+          <label className={clsx(
+            "flex items-center gap-2 px-6 py-3 bg-blue-600 hover:bg-blue-500 text-white rounded-full font-semibold transition-all cursor-pointer shadow-lg shadow-blue-500/20",
+            uploading && "opacity-50 cursor-not-allowed"
+          )}>
+            {uploading ? <Loader2 className="h-5 w-5 animate-spin" /> : <Upload className="h-5 w-5" />}
+            {uploading ? "Uploading..." : "New Meeting"}
+            <input type="file" className="hidden" accept="audio/*" onChange={handleUpload} disabled={uploading} />
+          </label>
+        </header>
+
+        {error && (
+          <div className="mb-8 p-4 bg-red-50 dark:bg-red-900/20 border border-red-200 dark:border-red-800 rounded-xl text-red-600 dark:text-red-400 flex items-center gap-3">
+            <AlertCircle className="h-5 w-5" />
+            {error}
+          </div>
+        )}
+
+        <div className="grid gap-4">
+          {meetings.length === 0 ? (
+            <div className="text-center py-20 bg-white dark:bg-slate-900 rounded-3xl border-2 border-dashed border-slate-200 dark:border-slate-800">
+              <Mic className="h-12 w-12 mx-auto mb-4 text-slate-300" />
+              <p className="text-slate-500 font-medium">No meetings yet. Upload your first audio file.</p>
+            </div>
+          ) : (
+            meetings.map(m => (
+              <div key={m.id} className="group bg-white dark:bg-slate-900 p-6 rounded-2xl border border-slate-200 dark:border-slate-800 hover:shadow-xl transition-all flex items-center justify-between cursor-pointer">
+                <div className="flex items-center gap-4">
+                  <div className={clsx(
+                    "p-3 rounded-xl",
+                    m.status === 'COMPLETED' ? "bg-green-100 dark:bg-green-900/30 text-green-600" :
+                    m.status === 'ERROR' ? "bg-red-100 dark:bg-red-900/30 text-red-600" :
+                    "bg-blue-100 dark:bg-blue-900/30 text-blue-600 animate-pulse"
+                  )}>
+                    {m.status === 'COMPLETED' ? <CheckCircle2 className="h-6 w-6" /> : <FileText className="h-6 w-6" />}
+                  </div>
+                  <div>
+                    <h3 className="font-bold text-lg leading-tight">{m.title}</h3>
+                    <div className="flex items-center gap-4 mt-1 text-sm text-slate-500">
+                      <span className="flex items-center gap-1"><Clock className="h-3.5 w-3.5" /> {new Date(m.created_at).toLocaleDateString()}</span>
+                      {/* Ternary instead of `&&`: a duration of 0 would otherwise be rendered as the text "0". */}
+                      {m.duration_seconds ? (
+                        <span>{Math.round(m.duration_seconds / 60)} min</span>
+                      ) : null}
+                      <span className={clsx(
+                        "font-semibold uppercase tracking-wider text-[10px] px-2 py-0.5 rounded",
+                        m.status === 'COMPLETED' ? "bg-green-100 text-green-700" : "bg-slate-100 text-slate-600"
+                      )}>{m.status}</span>
+                    </div>
+                  </div>
+                </div>
+                <ChevronRight className="h-6 w-6 text-slate-300 group-hover:text-blue-500 transition-colors" />
+              </div>
+            ))
+          )}
+        </div>
+      </div>
+    </div>
+  )
+}
--- a/transcription-tool/frontend/src/index.css
+++ b/transcription-tool/frontend/src/index.css
@@ -0,0 +1,7 @@
+/* Tailwind layers, injected at build time by the tailwindcss PostCSS plugin. */
+@tailwind base;
+@tailwind components;
+@tailwind utilities;
+
+/* Declare support for both light and dark colour schemes. */
+:root {
+  color-scheme: light dark;
+}
--- a/transcription-tool/frontend/src/main.tsx
+++ b/transcription-tool/frontend/src/main.tsx
@@ -0,0 +1,10 @@
+import React from 'react'
+import ReactDOM from 'react-dom/client'
+import App from './App'
+import './index.css'
+
+// Entry point: mount <App /> into the #root element from index.html.
+// The non-null assertion (!) assumes that element is always present.
+ReactDOM.createRoot(document.getElementById('root')!).render(
+  <React.StrictMode>
+    <App />
+  </React.StrictMode>,
+)
--- a/transcription-tool/frontend/src/vite-env.d.ts
+++ b/transcription-tool/frontend/src/vite-env.d.ts
@@ -0,0 +1 @@
+/// <reference types="vite/client" />
--- a/transcription-tool/frontend/tailwind.config.js
+++ b/transcription-tool/frontend/tailwind.config.js
@@ -0,0 +1,11 @@
+/** @type {import('tailwindcss').Config} */
+// Tailwind configuration: `content` lists the files scanned for class names;
+// the default theme is used without extensions or plugins.
+export default {
+  content: [
+    "./index.html",
+    "./src/**/*.{js,ts,jsx,tsx}",
+  ],
+  theme: {
+    extend: {},
+  },
+  plugins: [],
+}
--- a/transcription-tool/frontend/vite.config.ts
+++ b/transcription-tool/frontend/vite.config.ts
@@ -0,0 +1,15 @@
+import { defineConfig } from 'vite'
+import react from '@vitejs/plugin-react'
+
+// Vite configuration for the frontend.
+export default defineConfig({
+  plugins: [react()],
+  // Public base path: built assets are referenced below /tr/.
+  base: '/tr/',
+  server: {
+    // Dev server only: forward API calls to a backend on localhost:8001.
+    proxy: {
+      '/tr/api': {
+        target: 'http://localhost:8001',
+        changeOrigin: true,
+      }
+    }
+  }
+})