[30988f42] feat(transcription-tool): stabilize transcription with plain text parsing and add retry feature
This commit is contained in:
70
debug_transcription_raw.py
Normal file
70
debug_transcription_raw.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import sqlite3
import json
import os

# Default location of the transcript database, relative to the working directory.
DB_PATH = "transcripts.db"


def inspect_latest_meeting(db_path=DB_PATH):
    """Print a diagnostic dump of the most recent meeting's transcript chunks.

    For each chunk of the latest meeting this prints the stored JSON length,
    the first 500 characters of the raw model output, and the result of
    re-running the orchestrator's Markdown-fence stripping + json.loads so
    parse failures can be reproduced outside the service.

    Args:
        db_path: Path to the SQLite database file (defaults to DB_PATH).
    """
    if not os.path.exists(db_path):
        print(f"Error: Database file '{db_path}' not found.")
        return

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Get latest meeting
        cursor.execute("SELECT id, title, created_at FROM meetings ORDER BY created_at DESC LIMIT 1")
        meeting = cursor.fetchone()

        if not meeting:
            print("No meetings found in DB.")
            return

        meeting_id, title, created_at = meeting
        print(f"--- Inspecting Latest Meeting: ID {meeting_id} ('{title}') created at {created_at} ---")

        # Get chunks for this meeting
        cursor.execute(
            "SELECT id, chunk_index, raw_text, json_content FROM transcript_chunks "
            "WHERE meeting_id = ? ORDER BY chunk_index",
            (meeting_id,),
        )
        chunks = cursor.fetchall()

        if not chunks:
            print("No chunks found for this meeting.")
            return  # BUG FIX: previously fell through to the loop instead of returning

        for chunk_id, idx, raw_text, json_content in chunks:
            print(f"\n[Chunk {idx} (ID: {chunk_id})]")

            # json_content is stored as a JSON string here (raw sqlite access);
            # guard against corrupt rows instead of crashing the debug tool.
            try:
                stored_len = len(json.loads(json_content)) if json_content else "None/Empty"
            except (json.JSONDecodeError, TypeError):
                stored_len = "UNPARSEABLE"
            print(f"Stored JSON Content (Length): {stored_len}")

            raw_text = raw_text or ""  # BUG FIX: raw_text may be NULL in the DB

            print("-" * 20 + " RAW TEXT START " + "-" * 20)
            print(raw_text[:500])  # Print first 500 chars
            print("..." if len(raw_text) > 500 else "")
            print("-" * 20 + " RAW TEXT END " + "-" * 20)

            # Simulate cleaning logic from orchestrator (strip Markdown fences),
            # then try to parse manually to surface the exact error.
            cleaned = raw_text.strip()
            if cleaned.startswith("```json"):
                cleaned = cleaned[7:]
            elif cleaned.startswith("```"):
                cleaned = cleaned[3:]
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3]
            cleaned = cleaned.strip()

            try:
                json.loads(cleaned)
                print("✅ Manual Parsing Successful!")
            except json.JSONDecodeError as e:
                print(f"❌ Manual Parsing Failed: {e}")
                # Show context around the error position reported by the decoder.
                if hasattr(e, 'pos'):
                    start = max(0, e.pos - 20)
                    end = min(len(cleaned), e.pos + 20)
                    print(f" Context at error: ...{cleaned[start:end]}...")
    finally:
        conn.close()


if __name__ == "__main__":
    inspect_latest_meeting()
|
||||
@@ -98,13 +98,13 @@ services:
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- ./transcription-tool/backend:/app/backend
|
||||
- ./transcription-tool/frontend/dist:/app/frontend/dist # Mount Frontend Build for Live Updates
|
||||
- ./transcripts.db:/app/transcripts.db
|
||||
- ./uploads_audio:/app/uploads_audio
|
||||
- ./gemini_api_key.txt:/app/gemini_api_key.txt
|
||||
environment:
|
||||
PYTHONUNBUFFERED: "1"
|
||||
DATABASE_URL: "sqlite:////app/transcripts.db"
|
||||
GEMINI_API_KEY: "AIzaSyCFRmr1rOrkFKiEuh9GOCJNB2zfJsYmR68" # Placeholder, actual key is in file
|
||||
ports:
|
||||
- "8001:8001"
|
||||
|
||||
|
||||
@@ -99,6 +99,31 @@ async def upload_audio(
|
||||
|
||||
return meeting
|
||||
|
||||
@app.post("/api/meetings/{meeting_id}/retry")
def retry_meeting(
    meeting_id: int,
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db)
):
    """Re-run transcription for a meeting from the audio chunks already on disk.

    Raises:
        HTTPException: 404 if the meeting does not exist, 400 if its chunk
            directory is missing or empty (a re-upload is required).
    """
    meeting = db.query(Meeting).filter(Meeting.id == meeting_id).first()
    if meeting is None:
        raise HTTPException(404, detail="Meeting not found")

    # The retry reuses the chunk files produced by the original upload;
    # without them there is nothing to transcribe.
    chunks_path = os.path.join(settings.UPLOAD_DIR, "chunks", str(meeting_id))
    has_chunks = os.path.exists(chunks_path) and bool(os.listdir(chunks_path))
    if not has_chunks:
        raise HTTPException(400, detail="Original audio chunks not found. Please re-upload.")

    # Mark as queued before handing off to the background worker.
    meeting.status = "QUEUED"
    db.commit()

    # NOTE(review): imported lazily — presumably to avoid a circular import
    # between the app module and the orchestrator; confirm before hoisting.
    from .services.orchestrator import retry_meeting_task
    background_tasks.add_task(retry_meeting_task, meeting.id, SessionLocal)

    return {"status": "started", "message": "Retrying transcription..."}
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
class InsightRequest(BaseModel):
|
||||
@@ -201,9 +226,16 @@ def delete_meeting(meeting_id: int, db: Session = Depends(get_db)):
|
||||
|
||||
# Serve Frontend
|
||||
# This must be the last route definition to avoid catching API routes
|
||||
static_path = "/frontend_static"
|
||||
|
||||
# PRIORITY 1: Mounted Volume (Development / Live Update)
|
||||
static_path = "/app/frontend/dist"
|
||||
|
||||
# PRIORITY 2: Built-in Image Path (Production)
|
||||
if not os.path.exists(static_path):
|
||||
static_path = "/frontend_static"
|
||||
|
||||
# PRIORITY 3: Local Development (running python directly)
|
||||
if not os.path.exists(static_path):
|
||||
# Fallback for local development if not in Docker
|
||||
static_path = os.path.join(os.path.dirname(__file__), "../frontend/dist")
|
||||
|
||||
if os.path.exists(static_path):
|
||||
|
||||
@@ -19,6 +19,16 @@ def parse_time_to_seconds(time_str):
|
||||
return 0
|
||||
return 0
|
||||
|
||||
def clean_json_string(text):
    """Strip Markdown code-fence wrappers (```json ... ``` or ``` ... ```) from *text*.

    LLMs often wrap JSON output in fences; this returns the trimmed inner
    payload, or the trimmed input unchanged when no fence is present.
    """
    stripped = text.strip()
    # Remove at most one opening fence; check the longer marker first so
    # "```json" is not left half-stripped by the bare "```" branch.
    for fence in ("```json", "```"):
        if stripped.startswith(fence):
            stripped = stripped[len(fence):]
            break
    if stripped.endswith("```"):
        stripped = stripped[:-3]
    return stripped.strip()
||||
|
||||
def process_meeting_task(meeting_id: int, db_session_factory):
|
||||
db = db_session_factory()
|
||||
meeting = db.query(Meeting).filter(Meeting.id == meeting_id).first()
|
||||
@@ -50,7 +60,13 @@ def process_meeting_task(meeting_id: int, db_session_factory):
|
||||
# Parse JSON and Adjust Timestamps
|
||||
json_data = []
|
||||
try:
|
||||
raw_json = json.loads(result["raw_text"])
|
||||
cleaned_text = clean_json_string(result["raw_text"])
|
||||
raw_json = json.loads(cleaned_text)
|
||||
|
||||
# Check for wrapped structure (e.g. {"items": [...]}) if schema enforced it
|
||||
if isinstance(raw_json, dict) and "items" in raw_json:
|
||||
raw_json = raw_json["items"] # Extract inner list
|
||||
|
||||
if isinstance(raw_json, list):
|
||||
for entry in raw_json:
|
||||
seconds = parse_time_to_seconds(entry.get("time", "00:00"))
|
||||
@@ -63,7 +79,7 @@ def process_meeting_task(meeting_id: int, db_session_factory):
|
||||
entry["display_time"] = f"{h:02}:{m:02}:{s:02}"
|
||||
json_data.append(entry)
|
||||
except Exception as e:
|
||||
logger.error(f"JSON Parsing failed for chunk {i}: {e}")
|
||||
logger.error(f"JSON Parsing failed for chunk {i}: {e}. Raw text start: {result['raw_text'][:100]}")
|
||||
|
||||
# Save chunk result
|
||||
db_chunk = TranscriptChunk(
|
||||
@@ -89,3 +105,94 @@ def process_meeting_task(meeting_id: int, db_session_factory):
|
||||
db.commit()
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
def retry_meeting_task(meeting_id: int, db_session_factory):
    """
    Retries transcription using existing chunks on disk.
    Avoids re-splitting the original file.

    Args:
        meeting_id: Primary key of the Meeting row to re-process.
        db_session_factory: Zero-arg callable returning a new DB session
            (e.g. SessionLocal); the session is always closed on exit.
    """
    import os  # local import kept from the original; module top may not import os

    db = db_session_factory()
    meeting = None  # bound before try so the except handler can test it safely
    try:
        # BUG FIX: the meeting lookup previously happened before the
        # try/finally, so an early return leaked the session.
        meeting = db.query(Meeting).filter(Meeting.id == meeting_id).first()
        if not meeting:
            return

        transcriber = TranscriptionService()

        # 0. Validate the chunk directory before touching any state.
        chunk_dir = os.path.join(settings.UPLOAD_DIR, "chunks", str(meeting_id))
        if not os.path.exists(chunk_dir):
            logger.error(f"Chunk directory not found for meeting {meeting_id}")
            meeting.status = "ERROR"
            db.commit()
            return

        chunks = sorted(
            os.path.join(chunk_dir, f)
            for f in os.listdir(chunk_dir)
            if f.endswith(".mp3")
        )
        if not chunks:
            logger.error(f"No chunks found for meeting {meeting_id}")
            meeting.status = "ERROR"
            db.commit()
            return

        # Phase 1: clear old chunk rows so the retry fully replaces them.
        meeting.status = "RETRYING"
        db.query(TranscriptChunk).filter(TranscriptChunk.meeting_id == meeting_id).delete()
        db.commit()

        # Phase 2: transcribe each chunk, shifting timestamps by its offset.
        for i, chunk_path in enumerate(chunks):
            offset = i * settings.CHUNK_DURATION_SEC
            logger.info(f"Retrying chunk {i+1}/{len(chunks)} with offset {offset}s")

            result = transcriber.transcribe_chunk(chunk_path, offset)

            # Parse JSON and adjust timestamps (same logic as process_meeting_task).
            json_data = []
            try:
                # With response_schema, raw_text SHOULD already be valid JSON,
                # but keep the fence-stripping in case a model deviates.
                cleaned_text = clean_json_string(result["raw_text"])
                raw_json = json.loads(cleaned_text)

                # Unwrap {"items": [...]} if a schema wrapped the list.
                if isinstance(raw_json, dict) and "items" in raw_json:
                    raw_json = raw_json["items"]

                if isinstance(raw_json, list):
                    for entry in raw_json:
                        seconds = parse_time_to_seconds(entry.get("time", "00:00"))
                        absolute_seconds = seconds + offset
                        entry["absolute_seconds"] = absolute_seconds

                        h = int(absolute_seconds // 3600)
                        m = int((absolute_seconds % 3600) // 60)
                        s = int(absolute_seconds % 60)
                        entry["display_time"] = f"{h:02}:{m:02}:{s:02}"
                        json_data.append(entry)
            except Exception as e:
                logger.error(f"JSON Parsing failed for chunk {i}: {e}. Raw: {result['raw_text'][:100]}")

            # Save chunk result (json_content stays [] when parsing failed).
            db_chunk = TranscriptChunk(
                meeting_id=meeting.id,
                chunk_index=i,
                raw_text=result["raw_text"],
                json_content=json_data
            )
            db.add(db_chunk)
            db.commit()

        # Phase 3: finalize.
        meeting.status = "COMPLETED"
        db.commit()
        logger.info(f"Meeting {meeting.id} retry completed.")

    except Exception as e:
        logger.error(f"Error retrying meeting {meeting_id}: {e}", exc_info=True)
        # BUG FIX: roll back any half-applied work so the status update below
        # can commit even when the failure came from the DB session itself.
        db.rollback()
        if meeting is not None:
            meeting.status = "ERROR"
            db.commit()
    finally:
        db.close()
|
||||
|
||||
@@ -30,20 +30,17 @@ class TranscriptionService:
|
||||
if media_file.state == "FAILED":
|
||||
raise Exception("File processing failed at Gemini.")
|
||||
|
||||
# 3. Transcribe with Diarization and Timestamps
|
||||
# 3. Transcribe with Diarization and Timestamps (Plain Text Mode for Stability)
|
||||
prompt = """
|
||||
Transkribiere dieses Audio wortgetreu.
|
||||
Identifiziere die Sprecher (Speaker A, Speaker B, etc.).
|
||||
|
||||
Gib das Ergebnis als JSON-Liste zurück.
|
||||
Format:
|
||||
[
|
||||
{
|
||||
"time": "MM:SS",
|
||||
"speaker": "Speaker A",
|
||||
"text": "..."
|
||||
}
|
||||
]
|
||||
Gib das Ergebnis EXAKT in diesem Format zurück (pro Zeile ein Sprecherwechsel):
|
||||
[MM:SS] Speaker Name: Gesprochener Text...
|
||||
|
||||
Beispiel:
|
||||
[00:00] Speaker A: Hallo zusammen.
|
||||
[00:05] Speaker B: Guten Morgen.
|
||||
"""
|
||||
|
||||
logger.info(f"Generating transcription for {file_path}...")
|
||||
@@ -52,14 +49,46 @@ class TranscriptionService:
|
||||
contents=[media_file, prompt],
|
||||
config=types.GenerateContentConfig(
|
||||
temperature=0.1,
|
||||
response_mime_type="application/json"
|
||||
max_output_tokens=8192
|
||||
)
|
||||
)
|
||||
|
||||
# Cleanup: Delete file from Gemini storage
|
||||
self.client.files.delete(name=media_file.name)
|
||||
|
||||
|
||||
# Parse Plain Text to JSON
|
||||
structured_data = self.parse_transcript(response.text)
|
||||
import json
|
||||
return {
|
||||
"raw_text": response.text, # This is now a JSON string
|
||||
"raw_text": json.dumps(structured_data), # Return valid JSON string
|
||||
"offset": offset_seconds
|
||||
}
|
||||
|
||||
def parse_transcript(self, text: str) -> list:
    """
    Parses lines like '[00:12] Speaker A: Hello world' into structured JSON.

    Returns a list of {"time", "speaker", "text"} dicts; unmatched lines that
    do not open a new timestamp are appended to the previous entry's text.
    """
    import re
    # Accepts MM:SS or H:MM:SS timestamps in square brackets.
    line_re = re.compile(r"^\[(\d{1,2}:\d{2}(?::\d{2})?)\]\s*([^:]+):\s*(.+)$")

    entries = []
    for raw_line in text.strip().split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            continue

        m = line_re.match(stripped)
        if m is None:
            # Fallback: treat as a continuation of the previous utterance
            # unless it looks like a (malformed) timestamped line.
            if entries and not stripped.startswith("["):
                entries[-1]["text"] += " " + stripped
            continue

        timestamp, who, said = m.groups()
        entries.append({
            "time": timestamp,
            "speaker": who.strip(),
            "text": said.strip()
        })

    return entries
|
||||
|
||||
@@ -394,6 +394,20 @@ export default function App() {
|
||||
>
|
||||
<Share2 className="h-5 w-5" />
|
||||
</button>
|
||||
<button
|
||||
onClick={async () => {
|
||||
if(!confirm("Retry transcription using existing audio chunks? This will overwrite the current transcript.")) return;
|
||||
try {
|
||||
await axios.post(`${API_BASE}/meetings/${detailMeeting.id}/retry`);
|
||||
alert("Retry started. Please wait for completion.");
|
||||
fetchDetail(detailMeeting.id);
|
||||
} catch(e) { alert("Retry failed."); }
|
||||
}}
|
||||
className="text-orange-500 hover:bg-orange-50 dark:hover:bg-orange-900/20 p-2 rounded"
|
||||
title="Retry Transcription (Fix Format Issues)"
|
||||
>
|
||||
<Wand2 className="h-5 w-5" />
|
||||
</button>
|
||||
<button onClick={(e) => handleDeleteMeeting(e, detailMeeting.id)} className="text-red-500 hover:bg-red-50 dark:hover:bg-red-900/20 p-2 rounded"><Trash2 className="h-5 w-5" /></button>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
Reference in New Issue
Block a user