feat(transcription): v0.4.0 with structured json, inline editing and deletion

- Backend: Switched prompt to JSON output for structured data
- Backend: Added PUT /chunks/{id} endpoint for persistence
- Backend: Fixed app.py imports and initialization logic
- Frontend: Complete rewrite for Unified View (flattened chunks)
- Frontend: Added Inline Editing (Text/Speaker) and Row Deletion
- Docs: Updated TRANSCRIPTION_TOOL.md with v0.4 features
2026-01-24 20:43:33 +00:00
parent caad625c0c
commit baf017c75e
5 changed files with 389 additions and 99 deletions
--- a/transcription-tool/backend/services/transcription_service.py
+++ b/transcription-tool/backend/services/transcription_service.py
@@ -19,8 +19,8 @@ class TranscriptionService:
        """
        logger.info(f"Uploading chunk {file_path} to Gemini...")
        
-        # 1. Upload file
-        media_file = self.client.files.upload(path=file_path)
+        # 1. Upload file (passed via the `file` keyword argument)
+        media_file = self.client.files.upload(file=file_path)
        
        # 2. Wait for processing (usually fast for audio)
        while media_file.state == "PROCESSING":
@@ -32,12 +32,18 @@ class TranscriptionService:

        # 3. Transcribe with Diarization and Timestamps
        prompt = """
-        Transkribiere dieses Audio wortgetreu. 
-        Identifiziere die Sprecher (Sprecher A, Sprecher B, etc.).
-        Gib das Ergebnis als strukturierte Liste mit Timestamps aus.
-        Wichtig: Das Audio ist ein Teil eines größeren Gesprächs. 
-        Antworte NUR mit dem Transkript im Format:
-        [MM:SS] Sprecher X: Text
+        Transkribiere dieses Audio wortgetreu.
+        Identifiziere die Sprecher (Speaker A, Speaker B, etc.).
+        
+        Gib das Ergebnis als JSON-Liste zurück.
+        Format:
+        [
+          {
+            "time": "MM:SS",
+            "speaker": "Speaker A",
+            "text": "..."
+          }
+        ]
        """
        
        logger.info(f"Generating transcription for {file_path}...")
@@ -45,7 +51,8 @@ class TranscriptionService:
            model="gemini-2.0-flash",
            contents=[media_file, prompt],
            config=types.GenerateContentConfig(
-                temperature=0.1, # Low temp for accuracy
+                temperature=0.1,
+                response_mime_type="application/json"
            )
        )

@@ -53,6 +60,6 @@ class TranscriptionService:
        self.client.files.delete(name=media_file.name)

        return {
-            "raw_text": response.text,
+            "raw_text": response.text, # This is now a JSON string
            "offset": offset_seconds
        }