# Brancheneinstufung2/transcription-tool/backend/services/transcription_service.py

import json
import logging
import os
import re
import time

from google import genai
from google.genai import types

from ..config import settings

logger = logging.getLogger(__name__)

class TranscriptionService:
    """Transcribe audio chunks with Gemini and parse the reply into segments.

    The service uploads a chunk through the Gemini Files API, asks the model
    for a plain-text transcript in ``[MM:SS] Speaker: text`` form and converts
    that text into a list of ``{"time", "speaker", "text"}`` dicts.
    """

    # One transcript line: "[MM:SS] Speaker: text". The optional third time
    # component also accepts H:MM:SS / HH:MM:SS stamps.
    _LINE_PATTERN = re.compile(r"^\[(\d{1,2}:\d{2}(?::\d{2})?)\]\s*([^:]+):\s*(.+)$")

    # Seconds to wait between two state polls of an uploaded file.
    _POLL_INTERVAL_SECONDS = 2
    # Upper bound for the polling loop so a file stuck in PROCESSING cannot
    # block the caller forever.
    _PROCESSING_TIMEOUT_SECONDS = 600

    def __init__(self):
        """Create the Gemini client.

        Raises:
            Exception: If ``settings.GEMINI_API_KEY`` is empty or unset.
        """
        if not settings.GEMINI_API_KEY:
            raise Exception("Gemini API Key missing.")
        self.client = genai.Client(api_key=settings.GEMINI_API_KEY)

    def transcribe_chunk(self, file_path: str, offset_seconds: int = 0) -> dict:
        """Upload one audio chunk to Gemini and return its parsed transcript.

        Args:
            file_path: Path of the chunk to upload.
            offset_seconds: Passed through unchanged under the ``"offset"``
                key; it is not applied to the timestamps here.

        Returns:
            A dict with ``"raw_text"`` (JSON string of the segment list built
            by :meth:`parse_transcript`) and ``"offset"``.

        Raises:
            TimeoutError: If the file is still ``PROCESSING`` after
                ``_PROCESSING_TIMEOUT_SECONDS``.
            Exception: If Gemini reports the file state ``FAILED``; errors
                raised by the Gemini client propagate as well.
        """
        # Plain-text output format (instead of JSON mode) for stability; the
        # prompt text is sent to the model verbatim.
        prompt = """
        Transkribiere dieses Audio wortgetreu.
        Identifiziere die Sprecher (Speaker A, Speaker B, etc.).

        Gib das Ergebnis EXAKT in diesem Format zurück (pro Zeile ein Sprecherwechsel):
        [MM:SS] Speaker Name: Gesprochener Text...

        Beispiel:
        [00:00] Speaker A: Hallo zusammen.
        [00:05] Speaker B: Guten Morgen.
        """

        logger.info("Uploading chunk %s to Gemini...", file_path)

        # 1. Upload the file (keyword argument ``file``).
        media_file = self.client.files.upload(file=file_path)

        try:
            # 2. Wait until Gemini has finished processing the upload.
            deadline = time.monotonic() + self._PROCESSING_TIMEOUT_SECONDS
            while media_file.state == "PROCESSING":
                if time.monotonic() >= deadline:
                    raise TimeoutError(
                        f"Gemini did not finish processing {file_path} within "
                        f"{self._PROCESSING_TIMEOUT_SECONDS} seconds."
                    )
                time.sleep(self._POLL_INTERVAL_SECONDS)
                media_file = self.client.files.get(name=media_file.name)

            if media_file.state == "FAILED":
                raise Exception("File processing failed at Gemini.")

            # 3. Transcribe with speaker labels and timestamps.
            logger.info("Generating transcription for %s...", file_path)
            response = self.client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[media_file, prompt],
                config=types.GenerateContentConfig(
                    temperature=0.1,
                    max_output_tokens=8192,
                ),
            )
        finally:
            # Always remove the upload from Gemini storage, also when
            # processing or generation failed. Cleanup is best-effort: a
            # failed delete must not discard a successful transcription.
            try:
                self.client.files.delete(name=media_file.name)
            except Exception:
                logger.warning(
                    "Could not delete %s from Gemini storage.",
                    media_file.name,
                    exc_info=True,
                )

        transcript_text = response.text
        if not transcript_text:
            # The reply may carry no text at all; return an empty transcript
            # instead of crashing in the parser.
            logger.warning("Gemini returned no transcript text for %s.", file_path)

        # Parse the plain-text reply and hand it back as a valid JSON string.
        structured_data = self.parse_transcript(transcript_text)
        return {
            "raw_text": json.dumps(structured_data),
            "offset": offset_seconds,
        }

    def parse_transcript(self, text: str) -> list:
        """Parse lines like ``[00:12] Speaker A: Hello world`` into dicts.

        Args:
            text: Plain-text transcript; ``None`` or an empty string yields
                an empty list.

        Returns:
            A list of ``{"time", "speaker", "text"}`` dicts in input order.
            A non-matching line that does not start with ``[`` is treated as
            a continuation and appended to the previous entry's text; other
            non-matching lines, and continuation lines before the first
            entry, are dropped.
        """
        if not text:
            return []

        results = []
        for line in text.strip().split("\n"):
            line = line.strip()
            if not line:
                continue

            match = self._LINE_PATTERN.match(line)
            if match:
                time_str, speaker, content = match.groups()
                results.append({
                    "time": time_str,
                    "speaker": speaker.strip(),
                    "text": content.strip(),
                })
            elif results and not line.startswith("["):
                # Wrapped text without its own timestamp belongs to the
                # previous speaker turn.
                results[-1]["text"] += " " + line

        return results