95 lines
3.3 KiB
Python
95 lines
3.3 KiB
Python
import json
import logging
import os
import re
import time

from google import genai
from google.genai import types

from ..config import settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class TranscriptionService:
    """Transcribe audio chunks with the Gemini API.

    A chunk is uploaded, the model is asked for a speaker-labelled transcript
    in a fixed ``[MM:SS] Speaker: text`` line format, and that plain text is
    parsed into a list of structured entries.
    """

    # Seconds to sleep between two status polls of an uploaded file.
    _POLL_INTERVAL_SECONDS = 2
    # Upper bound on polling so a stuck upload cannot block the caller forever.
    _PROCESSING_TIMEOUT_SECONDS = 600

    # Matches "[MM:SS] Speaker: Text"; the timestamp may also be H:MM:SS.
    # Compiled once here instead of on every parse_transcript() call.
    _LINE_PATTERN = re.compile(
        r"^\[(\d{1,2}:\d{2}(?::\d{2})?)\]\s*([^:]+):\s*(.+)$"
    )

    def __init__(self):
        """Create the Gemini client from ``settings.GEMINI_API_KEY``.

        Raises:
            RuntimeError: If no API key is configured.
        """
        if not settings.GEMINI_API_KEY:
            raise RuntimeError("Gemini API Key missing.")
        self.client = genai.Client(api_key=settings.GEMINI_API_KEY)

    def transcribe_chunk(self, file_path: str, offset_seconds: int = 0) -> dict:
        """Upload a chunk to Gemini and return its transcription.

        Args:
            file_path: Path of the audio chunk to upload.
            offset_seconds: Passed through unchanged in the result; the
                timestamps inside the transcript are not shifted here.

        Returns:
            A dict with ``"raw_text"`` (the parsed transcript serialized as a
            JSON string) and ``"offset"`` (``offset_seconds``).

        Raises:
            RuntimeError: If the uploaded file ends up in the "FAILED" state.
            TimeoutError: If the file is still "PROCESSING" after
                ``_PROCESSING_TIMEOUT_SECONDS``.
        """
        logger.info("Uploading chunk %s to Gemini...", file_path)

        # 1. Upload file (passed via the ``file`` keyword argument)
        media_file = self.client.files.upload(file=file_path)
        remote_name = media_file.name

        try:
            # 2. Wait for processing (usually fast for audio)
            media_file = self._wait_until_processed(media_file)

            # 3. Transcribe with diarization and timestamps (plain-text mode
            #    for stability). The prompt is runtime text and stays German.
            prompt = """
        Transkribiere dieses Audio wortgetreu.
        Identifiziere die Sprecher (Speaker A, Speaker B, etc.).

        Gib das Ergebnis EXAKT in diesem Format zurück (pro Zeile ein Sprecherwechsel):
        [MM:SS] Speaker Name: Gesprochener Text...

        Beispiel:
        [00:00] Speaker A: Hallo zusammen.
        [00:05] Speaker B: Guten Morgen.
        """

            logger.info("Generating transcription for %s...", file_path)
            response = self.client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[media_file, prompt],
                config=types.GenerateContentConfig(
                    temperature=0.1,
                    max_output_tokens=8192,
                ),
            )
        finally:
            # Cleanup runs on success AND on failure so uploads never pile up
            # in Gemini storage.
            self._delete_remote_file(remote_name)

        # Parse plain text into structured entries. parse_transcript() copes
        # with a missing/empty response text by returning an empty list.
        structured_data = self.parse_transcript(response.text)
        return {
            "raw_text": json.dumps(structured_data),  # valid JSON string
            "offset": offset_seconds,
        }

    def _wait_until_processed(self, media_file):
        """Poll the uploaded file until it leaves the "PROCESSING" state."""
        deadline = time.monotonic() + self._PROCESSING_TIMEOUT_SECONDS
        while media_file.state == "PROCESSING":
            if time.monotonic() >= deadline:
                raise TimeoutError(
                    "Timed out waiting for Gemini to process the uploaded file."
                )
            time.sleep(self._POLL_INTERVAL_SECONDS)
            media_file = self.client.files.get(name=media_file.name)

        if media_file.state == "FAILED":
            raise RuntimeError("File processing failed at Gemini.")
        return media_file

    def _delete_remote_file(self, name):
        """Best-effort removal of an uploaded file from Gemini storage."""
        try:
            self.client.files.delete(name=name)
        except Exception:
            # A failed cleanup must neither discard a finished transcription
            # nor hide the error that is already propagating; log and go on.
            logger.warning(
                "Could not delete uploaded file %s from Gemini storage.",
                name,
                exc_info=True,
            )

    def parse_transcript(self, text: str) -> list:
        """Parse lines like '[00:12] Speaker A: Hello world' into entries.

        Args:
            text: Plain-text transcript, one speaker turn per line. ``None``
                or an empty string is accepted.

        Returns:
            A list of dicts with the keys ``"time"``, ``"speaker"`` and
            ``"text"``, in input order. A non-matching line that does not
            start with ``"["`` is appended to the previous entry's text;
            any other non-matching line is dropped.
        """
        if not text:
            return []

        results = []
        for raw_line in text.strip().split("\n"):
            line = raw_line.strip()
            if not line:
                continue

            match = self._LINE_PATTERN.match(line)
            if match:
                time_str, speaker, content = match.groups()
                results.append({
                    "time": time_str,
                    "speaker": speaker.strip(),
                    "text": content.strip(),
                })
            elif results and not line.startswith("["):
                # Fallback: treat the line as a continuation of the last turn.
                results[-1]["text"] += " " + line

        return results
|