feat: Documentation and Tool Config Update

2026-02-18 09:12:04 +00:00
parent 32d40c77f4
commit 46994e4ce0
38 changed files with 859 additions and 1763 deletions
--- a/transcription-tool/backend/services/transcription_service.py
+++ b/transcription-tool/backend/services/transcription_service.py
@@ -30,20 +30,17 @@ class TranscriptionService:
        if media_file.state == "FAILED":
            raise Exception("File processing failed at Gemini.")

-        # 3. Transcribe with Diarization and Timestamps
+        # 3. Transcribe with Diarization and Timestamps (Plain Text Mode for Stability)
        prompt = """
        Transkribiere dieses Audio wortgetreu.
        Identifiziere die Sprecher (Speaker A, Speaker B, etc.).
        
-        Gib das Ergebnis als JSON-Liste zurück.
-        Format:
-        [
-          {
-            "time": "MM:SS",
-            "speaker": "Speaker A",
-            "text": "..."
-          }
-        ]
+        Gib das Ergebnis EXAKT in diesem Format zurück (pro Zeile ein Sprecherwechsel):
+        [MM:SS] Speaker Name: Gesprochener Text...
+        
+        Beispiel:
+        [00:00] Speaker A: Hallo zusammen.
+        [00:05] Speaker B: Guten Morgen.
        """
        
        logger.info(f"Generating transcription for {file_path}...")
@@ -52,14 +49,46 @@ class TranscriptionService:
            contents=[media_file, prompt],
            config=types.GenerateContentConfig(
                temperature=0.1,
-                response_mime_type="application/json"
+                max_output_tokens=8192
            )
        )

        # Cleanup: Delete file from Gemini storage
        self.client.files.delete(name=media_file.name)
-
+        
+        # Parse Plain Text to JSON
+        structured_data = self.parse_transcript(response.text)
+        import json
        return {
-            "raw_text": response.text, # This is now a JSON string
+            "raw_text": json.dumps(structured_data), # Return valid JSON string
            "offset": offset_seconds
        }
+
+    def parse_transcript(self, text: str) -> list:
+        """
+        Parse '[MM:SS] Speaker: text' lines into a list of segment dicts.
+
+        Each dict has the keys "time", "speaker" and "text". A non-matching
+        line that does not start with '[' is appended to the previous
+        segment. Returns [] when text is None or empty.
+        """
+        import re
+        if not text:
+            return []
+        # Accepts [MM:SS] or [H:MM:SS]. The text part may be empty so that a
+        # bare "[00:05] Speaker B:" header still opens a segment; otherwise
+        # the lines after it would be credited to the previous speaker.
+        pattern = re.compile(r"^\[(\d{1,2}:\d{2}(?::\d{2})?)\]\s*([^:]+):\s*(.*)$")
+        results = []
+        for line in text.splitlines():
+            line = line.strip()
+            if not line:
+                continue
+            match = pattern.match(line)
+            if match:
+                time_str, speaker, content = match.groups()
+                results.append({"time": time_str, "speaker": speaker.strip(), "text": content.strip()})
+            elif results and not line.startswith("["):
+                # Continuation line; strip() avoids a leading space after an empty header.
+                results[-1]["text"] = f"{results[-1]['text']} {line}".strip()
+        return results