import time
import logging

from google import genai
from google.genai import types

from ..config import settings

logger = logging.getLogger(__name__)

# Upper bound (seconds) on waiting for Gemini to finish processing an upload,
# so the poll loop below cannot spin forever on a stuck file.
_PROCESSING_TIMEOUT_S = 300
# Delay between state polls; uploads are usually processed within seconds.
_POLL_INTERVAL_S = 2


class TranscriptionService:
    """Thin wrapper around the Gemini API for transcribing audio chunks."""

    def __init__(self):
        # Fail fast at construction time so a missing key surfaces immediately
        # instead of on the first transcription attempt.
        if not settings.GEMINI_API_KEY:
            raise Exception("Gemini API Key missing.")
        self.client = genai.Client(api_key=settings.GEMINI_API_KEY)

    def transcribe_chunk(self, file_path: str, offset_seconds: int = 0) -> dict:
        """
        Upload an audio chunk to Gemini and return its transcription.

        Args:
            file_path: Local path of the audio chunk to upload.
            offset_seconds: Start offset of this chunk within the full
                recording. Passed through unchanged so the caller can shift
                the per-chunk timestamps into recording time.

        Returns:
            dict with:
                "raw_text": the model's response text — a JSON string, since
                    the request asks for ``application/json`` output.
                "offset": the ``offset_seconds`` value passed in.

        Raises:
            Exception: if Gemini reports processing failure, or the upload
                does not finish processing within the timeout.
        """
        logger.info("Uploading chunk %s to Gemini...", file_path)

        # 1. Upload the file to Gemini's temporary file storage.
        media_file = self.client.files.upload(file=file_path)

        try:
            # 2. Wait for server-side processing (usually fast for audio).
            #    Bounded by a deadline so a file stuck in PROCESSING cannot
            #    hang the caller indefinitely.
            deadline = time.monotonic() + _PROCESSING_TIMEOUT_S
            while media_file.state == "PROCESSING":
                if time.monotonic() >= deadline:
                    raise Exception("File processing timed out at Gemini.")
                time.sleep(_POLL_INTERVAL_S)
                media_file = self.client.files.get(name=media_file.name)

            if media_file.state == "FAILED":
                raise Exception("File processing failed at Gemini.")

            # 3. Transcribe with diarization and timestamps.
            #    NOTE: the prompt is runtime behavior (sent to the model) and
            #    is deliberately kept in German, verbatim.
            prompt = """
            Transkribiere dieses Audio wortgetreu.
            Identifiziere die Sprecher (Speaker A, Speaker B, etc.).
            Gib das Ergebnis als JSON-Liste zurück.
            Format: [ { "time": "MM:SS", "speaker": "Speaker A", "text": "..." } ]
            """

            logger.info("Generating transcription for %s...", file_path)
            response = self.client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[media_file, prompt],
                config=types.GenerateContentConfig(
                    temperature=0.1,
                    response_mime_type="application/json",
                ),
            )
        finally:
            # Cleanup: delete the upload from Gemini storage even when
            # transcription failed, so orphaned files do not accumulate.
            # Best-effort — a cleanup failure must never mask the real error.
            try:
                self.client.files.delete(name=media_file.name)
            except Exception:
                logger.warning(
                    "Failed to delete Gemini file %s", media_file.name,
                    exc_info=True,
                )

        return {
            "raw_text": response.text,  # JSON string (see response_mime_type)
            "offset": offset_seconds,
        }