import json
import logging
import os
import re
import time
from typing import Optional

from google import genai
from google.genai import types

from ..config import settings

logger = logging.getLogger(__name__)

# Matches "[MM:SS] Speaker: text" as well as "[H:MM:SS] Speaker: text".
_TRANSCRIPT_LINE = re.compile(
    r"^\[(\d{1,2}:\d{2}(?::\d{2})?)\]\s*([^:]+):\s*(.+)$"
)


class TranscriptionService:
    """Transcribe audio chunks with Gemini and parse the reply into entries."""

    # Model used for every transcription request.
    MODEL_NAME = "gemini-2.0-flash"
    # Seconds to sleep between two polls of the uploaded file's state.
    POLL_INTERVAL_SECONDS = 2
    # Upper bound for the polling loop so a stuck upload cannot hang forever.
    PROCESSING_TIMEOUT_SECONDS = 600

    def __init__(self):
        """Create the Gemini client from ``settings.GEMINI_API_KEY``.

        Raises:
            RuntimeError: If no API key is configured.
        """
        if not settings.GEMINI_API_KEY:
            raise RuntimeError("Gemini API Key missing.")
        self.client = genai.Client(api_key=settings.GEMINI_API_KEY)

    def transcribe_chunk(self, file_path: str, offset_seconds: int = 0) -> dict:
        """Upload one audio chunk to Gemini and return its parsed transcript.

        Args:
            file_path: Path of the chunk to upload.
            offset_seconds: Passed through unchanged in the result under
                ``"offset"``; it is not applied to the timestamps here.

        Returns:
            A dict with ``"raw_text"`` (a JSON string holding the list built
            by :meth:`parse_transcript`) and ``"offset"``.

        Raises:
            RuntimeError: If the uploaded file ends in the ``FAILED`` state.
            TimeoutError: If the file is still ``PROCESSING`` after
                ``PROCESSING_TIMEOUT_SECONDS``.
        """
        logger.info("Uploading chunk %s to Gemini...", file_path)

        # 1. Upload the file (the SDK takes the path via the ``file`` keyword).
        media_file = self.client.files.upload(file=file_path)

        try:
            # 2. Wait for processing (usually fast for audio).
            media_file = self._wait_until_processed(media_file)

            # 3. Transcribe with diarization and timestamps. Plain text is
            #    requested and parsed locally into structured entries.
            prompt = """
        Transkribiere dieses Audio wortgetreu.
        Identifiziere die Sprecher (Speaker A, Speaker B, etc.).
        Gib das Ergebnis EXAKT in diesem Format zurück (pro Zeile ein Sprecherwechsel):

        [MM:SS] Speaker Name: Gesprochener Text...

        Beispiel:
        [00:00] Speaker A: Hallo zusammen.
        [00:05] Speaker B: Guten Morgen.
        """

            logger.info("Generating transcription for %s...", file_path)
            response = self.client.models.generate_content(
                model=self.MODEL_NAME,
                contents=[media_file, prompt],
                config=types.GenerateContentConfig(
                    temperature=0.1,
                    max_output_tokens=8192,
                ),
            )
        finally:
            # Always remove the upload from Gemini storage, including when
            # processing or generation failed.
            self._delete_remote_file(media_file.name)

        # ``response.text`` may be None when the reply carries no text part;
        # parse_transcript turns that into an empty list.
        structured_data = self.parse_transcript(response.text)

        return {
            "raw_text": json.dumps(structured_data),  # valid JSON string
            "offset": offset_seconds,
        }

    def _wait_until_processed(self, media_file):
        """Poll the uploaded file until it leaves the PROCESSING state."""
        deadline = time.monotonic() + self.PROCESSING_TIMEOUT_SECONDS
        while media_file.state == "PROCESSING":
            if time.monotonic() >= deadline:
                raise TimeoutError(
                    "Timed out waiting for Gemini to process the file."
                )
            time.sleep(self.POLL_INTERVAL_SECONDS)
            media_file = self.client.files.get(name=media_file.name)

        if media_file.state == "FAILED":
            raise RuntimeError("File processing failed at Gemini.")
        return media_file

    def _delete_remote_file(self, name: str) -> None:
        """Delete an uploaded file from Gemini storage, logging any failure."""
        try:
            self.client.files.delete(name=name)
        except Exception:
            # Best-effort cleanup: a failed delete must neither hide the real
            # error nor discard a transcript that was generated successfully.
            logger.warning(
                "Could not delete %s from Gemini storage.", name, exc_info=True
            )

    def parse_transcript(self, text: Optional[str]) -> list:
        """Parse '[00:12] Speaker A: Hello world' lines into a list of dicts.

        Each matching line becomes ``{"time", "speaker", "text"}``. A
        non-matching line that does not start with ``[`` is treated as a
        continuation and appended to the previous entry's text. Any other
        non-matching line is skipped and counted in a warning log.

        Args:
            text: Plain-text transcript; ``None`` or empty yields ``[]``.

        Returns:
            The parsed entries in their original order.
        """
        if not text:
            return []

        entries = []
        skipped = 0
        for raw_line in text.strip().split("\n"):
            line = raw_line.strip()
            if not line:
                continue

            match = _TRANSCRIPT_LINE.match(line)
            if match:
                time_str, speaker, content = match.groups()
                entries.append({
                    "time": time_str,
                    "speaker": speaker.strip(),
                    "text": content.strip(),
                })
            elif entries and not line.startswith("["):
                # Looks like wrapped text belonging to the previous speaker.
                entries[-1]["text"] += " " + line
            else:
                skipped += 1

        if skipped:
            logger.warning(
                "Skipped %d unparseable transcript line(s).", skipped
            )
        return entries