[30988f42] feat(transcription-tool): stabilize transcription with plain text parsing and add retry feature
This commit is contained in:
70
debug_transcription_raw.py
Normal file
70
debug_transcription_raw.py
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
import sqlite3
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
DB_PATH = "transcripts.db"
|
||||||
|
|
||||||
|
def inspect_latest_meeting():
|
||||||
|
if not os.path.exists(DB_PATH):
|
||||||
|
print(f"Error: Database file '{DB_PATH}' not found.")
|
||||||
|
return
|
||||||
|
|
||||||
|
conn = sqlite3.connect(DB_PATH)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
|
||||||
|
# Get latest meeting
|
||||||
|
cursor.execute("SELECT id, title, created_at FROM meetings ORDER BY created_at DESC LIMIT 1")
|
||||||
|
meeting = cursor.fetchone()
|
||||||
|
|
||||||
|
if not meeting:
|
||||||
|
print("No meetings found in DB.")
|
||||||
|
conn.close()
|
||||||
|
return
|
||||||
|
|
||||||
|
meeting_id, title, created_at = meeting
|
||||||
|
print(f"--- Inspecting Latest Meeting: ID {meeting_id} ('{title}') created at {created_at} ---")
|
||||||
|
|
||||||
|
# Get chunks for this meeting
|
||||||
|
cursor.execute("SELECT id, chunk_index, raw_text, json_content FROM transcript_chunks WHERE meeting_id = ? ORDER BY chunk_index", (meeting_id,))
|
||||||
|
chunks = cursor.fetchall()
|
||||||
|
|
||||||
|
if not chunks:
|
||||||
|
print("No chunks found for this meeting.")
|
||||||
|
|
||||||
|
for chunk in chunks:
|
||||||
|
chunk_id, idx, raw_text, json_content = chunk
|
||||||
|
print(f"\n[Chunk {idx} (ID: {chunk_id})]")
|
||||||
|
|
||||||
|
print(f"Stored JSON Content (Length): {len(json.loads(json_content)) if json_content else 'None/Empty'}")
|
||||||
|
|
||||||
|
print("-" * 20 + " RAW TEXT START " + "-" * 20)
|
||||||
|
print(raw_text[:500]) # Print first 500 chars
|
||||||
|
print("..." if len(raw_text) > 500 else "")
|
||||||
|
print("-" * 20 + " RAW TEXT END " + "-" * 20)
|
||||||
|
|
||||||
|
# Try to parse manually to see error
|
||||||
|
try:
|
||||||
|
# Simulate cleaning logic from orchestrator
|
||||||
|
cleaned = raw_text.strip()
|
||||||
|
if cleaned.startswith("```json"):
|
||||||
|
cleaned = cleaned[7:]
|
||||||
|
elif cleaned.startswith("```"):
|
||||||
|
cleaned = cleaned[3:]
|
||||||
|
if cleaned.endswith("```"):
|
||||||
|
cleaned = cleaned[:-3]
|
||||||
|
cleaned = cleaned.strip()
|
||||||
|
|
||||||
|
parsed = json.loads(cleaned)
|
||||||
|
print("✅ Manual Parsing Successful!")
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
print(f"❌ Manual Parsing Failed: {e}")
|
||||||
|
# Show context around error
|
||||||
|
if hasattr(e, 'pos'):
|
||||||
|
start = max(0, e.pos - 20)
|
||||||
|
end = min(len(cleaned), e.pos + 20)
|
||||||
|
print(f" Context at error: ...{cleaned[start:end]}...")
|
||||||
|
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
inspect_latest_meeting()
|
||||||
@@ -98,13 +98,13 @@ services:
|
|||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
volumes:
|
volumes:
|
||||||
- ./transcription-tool/backend:/app/backend
|
- ./transcription-tool/backend:/app/backend
|
||||||
|
- ./transcription-tool/frontend/dist:/app/frontend/dist # Mount Frontend Build for Live Updates
|
||||||
- ./transcripts.db:/app/transcripts.db
|
- ./transcripts.db:/app/transcripts.db
|
||||||
- ./uploads_audio:/app/uploads_audio
|
- ./uploads_audio:/app/uploads_audio
|
||||||
- ./gemini_api_key.txt:/app/gemini_api_key.txt
|
- ./gemini_api_key.txt:/app/gemini_api_key.txt
|
||||||
environment:
|
environment:
|
||||||
PYTHONUNBUFFERED: "1"
|
PYTHONUNBUFFERED: "1"
|
||||||
DATABASE_URL: "sqlite:////app/transcripts.db"
|
DATABASE_URL: "sqlite:////app/transcripts.db"
|
||||||
GEMINI_API_KEY: "REDACTED_ROTATE_THIS_KEY" # SECURITY: a real-looking API key was committed here — rotate it immediately and load the key from gemini_api_key.txt or an environment variable instead of hardcoding it
|
|
||||||
ports:
|
ports:
|
||||||
- "8001:8001"
|
- "8001:8001"
|
||||||
|
|
||||||
|
|||||||
@@ -99,6 +99,31 @@ async def upload_audio(
|
|||||||
|
|
||||||
return meeting
|
return meeting
|
||||||
|
|
||||||
|
@app.post("/api/meetings/{meeting_id}/retry")
def retry_meeting(
    meeting_id: int,
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db)
):
    """Re-run transcription for an existing meeting from its on-disk chunks.

    Responds 404 when the meeting does not exist and 400 when the original
    chunk files are missing (a fresh upload is required in that case).
    The actual work runs in a background task.
    """
    meeting = db.query(Meeting).filter(Meeting.id == meeting_id).first()
    if meeting is None:
        raise HTTPException(404, detail="Meeting not found")

    # The retry path reuses the chunk files produced by the original upload;
    # without them there is nothing left to transcribe.
    chunk_dir = os.path.join(settings.UPLOAD_DIR, "chunks", str(meeting_id))
    if not os.path.exists(chunk_dir) or not os.listdir(chunk_dir):
        raise HTTPException(400, detail="Original audio chunks not found. Please re-upload.")

    # Flag the meeting as queued before handing off to the worker.
    meeting.status = "QUEUED"
    db.commit()

    # Imported locally — presumably to avoid a circular import at module
    # load time (orchestrator imports from this module); confirm.
    from .services.orchestrator import retry_meeting_task
    background_tasks.add_task(retry_meeting_task, meeting.id, SessionLocal)

    return {"status": "started", "message": "Retrying transcription..."}
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
class InsightRequest(BaseModel):
|
class InsightRequest(BaseModel):
|
||||||
@@ -201,9 +226,16 @@ def delete_meeting(meeting_id: int, db: Session = Depends(get_db)):
|
|||||||
|
|
||||||
# Serve Frontend
|
# Serve Frontend
|
||||||
# This must be the last route definition to avoid catching API routes
|
# This must be the last route definition to avoid catching API routes
|
||||||
static_path = "/frontend_static"
|
|
||||||
|
# PRIORITY 1: Mounted Volume (Development / Live Update)
|
||||||
|
static_path = "/app/frontend/dist"
|
||||||
|
|
||||||
|
# PRIORITY 2: Built-in Image Path (Production)
|
||||||
|
if not os.path.exists(static_path):
|
||||||
|
static_path = "/frontend_static"
|
||||||
|
|
||||||
|
# PRIORITY 3: Local Development (running python directly)
|
||||||
if not os.path.exists(static_path):
|
if not os.path.exists(static_path):
|
||||||
# Fallback for local development if not in Docker
|
|
||||||
static_path = os.path.join(os.path.dirname(__file__), "../frontend/dist")
|
static_path = os.path.join(os.path.dirname(__file__), "../frontend/dist")
|
||||||
|
|
||||||
if os.path.exists(static_path):
|
if os.path.exists(static_path):
|
||||||
|
|||||||
@@ -19,6 +19,16 @@ def parse_time_to_seconds(time_str):
|
|||||||
return 0
|
return 0
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
def clean_json_string(text):
    """Return *text* with a surrounding Markdown code fence removed.

    Strips a leading ```json (or bare ```) opener and, independently, a
    trailing ``` closer, trimming whitespace on both sides. Text without
    fences passes through unchanged apart from the trim.
    """
    stripped = text.strip()
    # Check the longer opener first so "```json" is not mistaken for "```".
    for opener in ("```json", "```"):
        if stripped.startswith(opener):
            stripped = stripped[len(opener):]
            break
    if stripped.endswith("```"):
        stripped = stripped[:-3]
    return stripped.strip()
|
||||||
def process_meeting_task(meeting_id: int, db_session_factory):
|
def process_meeting_task(meeting_id: int, db_session_factory):
|
||||||
db = db_session_factory()
|
db = db_session_factory()
|
||||||
meeting = db.query(Meeting).filter(Meeting.id == meeting_id).first()
|
meeting = db.query(Meeting).filter(Meeting.id == meeting_id).first()
|
||||||
@@ -50,7 +60,13 @@ def process_meeting_task(meeting_id: int, db_session_factory):
|
|||||||
# Parse JSON and Adjust Timestamps
|
# Parse JSON and Adjust Timestamps
|
||||||
json_data = []
|
json_data = []
|
||||||
try:
|
try:
|
||||||
raw_json = json.loads(result["raw_text"])
|
cleaned_text = clean_json_string(result["raw_text"])
|
||||||
|
raw_json = json.loads(cleaned_text)
|
||||||
|
|
||||||
|
# Check for wrapped structure (e.g. {"items": [...]}) if schema enforced it
|
||||||
|
if isinstance(raw_json, dict) and "items" in raw_json:
|
||||||
|
raw_json = raw_json["items"] # Extract inner list
|
||||||
|
|
||||||
if isinstance(raw_json, list):
|
if isinstance(raw_json, list):
|
||||||
for entry in raw_json:
|
for entry in raw_json:
|
||||||
seconds = parse_time_to_seconds(entry.get("time", "00:00"))
|
seconds = parse_time_to_seconds(entry.get("time", "00:00"))
|
||||||
@@ -63,7 +79,7 @@ def process_meeting_task(meeting_id: int, db_session_factory):
|
|||||||
entry["display_time"] = f"{h:02}:{m:02}:{s:02}"
|
entry["display_time"] = f"{h:02}:{m:02}:{s:02}"
|
||||||
json_data.append(entry)
|
json_data.append(entry)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"JSON Parsing failed for chunk {i}: {e}")
|
logger.error(f"JSON Parsing failed for chunk {i}: {e}. Raw text start: {result['raw_text'][:100]}")
|
||||||
|
|
||||||
# Save chunk result
|
# Save chunk result
|
||||||
db_chunk = TranscriptChunk(
|
db_chunk = TranscriptChunk(
|
||||||
@@ -89,3 +105,94 @@ def process_meeting_task(meeting_id: int, db_session_factory):
|
|||||||
db.commit()
|
db.commit()
|
||||||
finally:
|
finally:
|
||||||
db.close()
|
db.close()
|
||||||
|
|
||||||
|
def retry_meeting_task(meeting_id: int, db_session_factory):
    """
    Retries transcription using existing chunks on disk.
    Avoids re-splitting the original file.
    """
    db = db_session_factory()
    meeting = db.query(Meeting).filter(Meeting.id == meeting_id).first()
    if not meeting:
        return

    try:
        import os
        transcriber = TranscriptionService()

        # 0. Validate Chunk Directory
        chunk_dir = os.path.join(settings.UPLOAD_DIR, "chunks", str(meeting_id))
        if not os.path.exists(chunk_dir):
            logger.error(f"Chunk directory not found for meeting {meeting_id}")
            meeting.status = "ERROR"
            db.commit()
            return

        chunk_paths = sorted(
            os.path.join(chunk_dir, name)
            for name in os.listdir(chunk_dir)
            if name.endswith(".mp3")
        )
        if not chunk_paths:
            logger.error(f"No chunks found for meeting {meeting_id}")
            meeting.status = "ERROR"
            db.commit()
            return

        # Phase 1: drop any previously stored transcript chunks so the retry
        # replaces the old (possibly broken) transcript.
        meeting.status = "RETRYING"
        db.query(TranscriptChunk).filter(TranscriptChunk.meeting_id == meeting_id).delete()
        db.commit()

        # Phase 2: transcribe every chunk again.
        all_text = []
        for i, chunk_path in enumerate(chunk_paths):
            # Chunks are fixed-length, so the absolute offset is index-based.
            offset = i * settings.CHUNK_DURATION_SEC
            logger.info(f"Retrying chunk {i+1}/{len(chunk_paths)} with offset {offset}s")

            result = transcriber.transcribe_chunk(chunk_path, offset)

            # Parse JSON and adjust timestamps (same logic as process_meeting_task).
            json_data = []
            try:
                # With response_schema, raw_text SHOULD be valid JSON directly,
                # but clean_json_string is kept in case specific models deviate.
                cleaned_text = clean_json_string(result["raw_text"])
                raw_json = json.loads(cleaned_text)

                # Unwrap {"items": [...]} if the schema enforced a wrapper object.
                if isinstance(raw_json, dict) and "items" in raw_json:
                    raw_json = raw_json["items"]

                if isinstance(raw_json, list):
                    for entry in raw_json:
                        seconds = parse_time_to_seconds(entry.get("time", "00:00"))
                        absolute_seconds = seconds + offset
                        entry["absolute_seconds"] = absolute_seconds

                        # Human-readable HH:MM:SS for the frontend.
                        h = int(absolute_seconds // 3600)
                        m = int((absolute_seconds % 3600) // 60)
                        s = int(absolute_seconds % 60)
                        entry["display_time"] = f"{h:02}:{m:02}:{s:02}"
                        json_data.append(entry)
            except Exception as e:
                logger.error(f"JSON Parsing failed for chunk {i}: {e}. Raw: {result['raw_text'][:100]}")

            # Persist the chunk result (json_data stays empty on parse failure,
            # but the raw text is kept for later inspection).
            db.add(TranscriptChunk(
                meeting_id=meeting.id,
                chunk_index=i,
                raw_text=result["raw_text"],
                json_content=json_data,
            ))
            all_text.append(result["raw_text"])
            db.commit()

        # Phase 3: Finalize
        meeting.status = "COMPLETED"
        db.commit()
        logger.info(f"Meeting {meeting.id} retry completed.")

    except Exception as e:
        logger.error(f"Error retrying meeting {meeting_id}: {e}", exc_info=True)
        meeting.status = "ERROR"
        db.commit()
    finally:
        db.close()
|
|||||||
@@ -30,20 +30,17 @@ class TranscriptionService:
|
|||||||
if media_file.state == "FAILED":
|
if media_file.state == "FAILED":
|
||||||
raise Exception("File processing failed at Gemini.")
|
raise Exception("File processing failed at Gemini.")
|
||||||
|
|
||||||
# 3. Transcribe with Diarization and Timestamps
|
# 3. Transcribe with Diarization and Timestamps (Plain Text Mode for Stability)
|
||||||
prompt = """
|
prompt = """
|
||||||
Transkribiere dieses Audio wortgetreu.
|
Transkribiere dieses Audio wortgetreu.
|
||||||
Identifiziere die Sprecher (Speaker A, Speaker B, etc.).
|
Identifiziere die Sprecher (Speaker A, Speaker B, etc.).
|
||||||
|
|
||||||
Gib das Ergebnis als JSON-Liste zurück.
|
Gib das Ergebnis EXAKT in diesem Format zurück (pro Zeile ein Sprecherwechsel):
|
||||||
Format:
|
[MM:SS] Speaker Name: Gesprochener Text...
|
||||||
[
|
|
||||||
{
|
Beispiel:
|
||||||
"time": "MM:SS",
|
[00:00] Speaker A: Hallo zusammen.
|
||||||
"speaker": "Speaker A",
|
[00:05] Speaker B: Guten Morgen.
|
||||||
"text": "..."
|
|
||||||
}
|
|
||||||
]
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
logger.info(f"Generating transcription for {file_path}...")
|
logger.info(f"Generating transcription for {file_path}...")
|
||||||
@@ -52,14 +49,46 @@ class TranscriptionService:
|
|||||||
contents=[media_file, prompt],
|
contents=[media_file, prompt],
|
||||||
config=types.GenerateContentConfig(
|
config=types.GenerateContentConfig(
|
||||||
temperature=0.1,
|
temperature=0.1,
|
||||||
response_mime_type="application/json"
|
max_output_tokens=8192
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Cleanup: Delete file from Gemini storage
|
# Cleanup: Delete file from Gemini storage
|
||||||
self.client.files.delete(name=media_file.name)
|
self.client.files.delete(name=media_file.name)
|
||||||
|
|
||||||
|
# Parse Plain Text to JSON
|
||||||
|
structured_data = self.parse_transcript(response.text)
|
||||||
|
import json
|
||||||
return {
|
return {
|
||||||
"raw_text": response.text, # This is now a JSON string
|
"raw_text": json.dumps(structured_data), # Return valid JSON string
|
||||||
"offset": offset_seconds
|
"offset": offset_seconds
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def parse_transcript(self, text: str) -> list:
    """
    Parses lines like '[00:12] Speaker A: Hello world' into structured JSON.

    Returns a list of {"time", "speaker", "text"} dicts. Lines without a
    leading [timestamp] are treated as continuations of the previous entry;
    unmatched lines that start with '[' are dropped.
    """
    import re

    # Matches "[MM:SS] Speaker: Text" and also "[H:MM:SS]" timestamps.
    line_re = re.compile(r"^\[(\d{1,2}:\d{2}(?::\d{2})?)\]\s*([^:]+):\s*(.+)$")

    entries = []
    for raw_line in text.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        m = line_re.match(line)
        if m is not None:
            time_str, speaker, content = m.groups()
            entries.append({
                "time": time_str,
                "speaker": speaker.strip(),
                "text": content.strip(),
            })
        elif entries and not line.startswith("["):
            # No timestamp: assume the model wrapped the previous speaker's
            # sentence onto a new line and glue it back on.
            entries[-1]["text"] += " " + line

    return entries
|
||||||
|
|||||||
@@ -394,6 +394,20 @@ export default function App() {
|
|||||||
>
|
>
|
||||||
<Share2 className="h-5 w-5" />
|
<Share2 className="h-5 w-5" />
|
||||||
</button>
|
</button>
|
||||||
|
<button
|
||||||
|
onClick={async () => {
|
||||||
|
if(!confirm("Retry transcription using existing audio chunks? This will overwrite the current transcript.")) return;
|
||||||
|
try {
|
||||||
|
await axios.post(`${API_BASE}/meetings/${detailMeeting.id}/retry`);
|
||||||
|
alert("Retry started. Please wait for completion.");
|
||||||
|
fetchDetail(detailMeeting.id);
|
||||||
|
} catch(e) { alert("Retry failed."); }
|
||||||
|
}}
|
||||||
|
className="text-orange-500 hover:bg-orange-50 dark:hover:bg-orange-900/20 p-2 rounded"
|
||||||
|
title="Retry Transcription (Fix Format Issues)"
|
||||||
|
>
|
||||||
|
<Wand2 className="h-5 w-5" />
|
||||||
|
</button>
|
||||||
<button onClick={(e) => handleDeleteMeeting(e, detailMeeting.id)} className="text-red-500 hover:bg-red-50 dark:hover:bg-red-900/20 p-2 rounded"><Trash2 className="h-5 w-5" /></button>
|
<button onClick={(e) => handleDeleteMeeting(e, detailMeeting.id)} className="text-red-500 hover:bg-red-50 dark:hover:bg-red-900/20 p-2 rounded"><Trash2 className="h-5 w-5" /></button>
|
||||||
</div>
|
</div>
|
||||||
</header>
|
</header>
|
||||||
|
|||||||
Binary file not shown.
BIN
uploads_audio/chunks/6/chunk_001.mp3
Normal file
BIN
uploads_audio/chunks/6/chunk_001.mp3
Normal file
Binary file not shown.
Reference in New Issue
Block a user