[30988f42] feat(transcription-tool): stabilize transcription with plain text parsing and add retry feature
This commit is contained in:
70
debug_transcription_raw.py
Normal file
70
debug_transcription_raw.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import sqlite3
import json
import os

# Default location of the transcript database, relative to the working directory.
DB_PATH = "transcripts.db"


def inspect_latest_meeting(db_path=DB_PATH):
    """Print a diagnostic dump of the most recent meeting's transcript chunks.

    For each chunk of the latest meeting this prints the stored JSON length,
    the first 500 characters of the raw model output, and the result of
    re-running the orchestrator's Markdown-fence stripping + json.loads so
    parse failures can be reproduced outside the service.

    Args:
        db_path: Path to the SQLite database file (defaults to DB_PATH).
    """
    if not os.path.exists(db_path):
        print(f"Error: Database file '{db_path}' not found.")
        return

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Get latest meeting
        cursor.execute("SELECT id, title, created_at FROM meetings ORDER BY created_at DESC LIMIT 1")
        meeting = cursor.fetchone()

        if not meeting:
            print("No meetings found in DB.")
            return

        meeting_id, title, created_at = meeting
        print(f"--- Inspecting Latest Meeting: ID {meeting_id} ('{title}') created at {created_at} ---")

        # Get chunks for this meeting
        cursor.execute(
            "SELECT id, chunk_index, raw_text, json_content FROM transcript_chunks "
            "WHERE meeting_id = ? ORDER BY chunk_index",
            (meeting_id,),
        )
        chunks = cursor.fetchall()

        if not chunks:
            print("No chunks found for this meeting.")
            return  # BUG FIX: previously fell through to the loop instead of returning

        for chunk_id, idx, raw_text, json_content in chunks:
            print(f"\n[Chunk {idx} (ID: {chunk_id})]")

            # json_content is stored as a JSON string here (raw sqlite access);
            # guard against corrupt rows instead of crashing the debug tool.
            try:
                stored_len = len(json.loads(json_content)) if json_content else "None/Empty"
            except (json.JSONDecodeError, TypeError):
                stored_len = "UNPARSEABLE"
            print(f"Stored JSON Content (Length): {stored_len}")

            raw_text = raw_text or ""  # BUG FIX: raw_text may be NULL in the DB

            print("-" * 20 + " RAW TEXT START " + "-" * 20)
            print(raw_text[:500])  # Print first 500 chars
            print("..." if len(raw_text) > 500 else "")
            print("-" * 20 + " RAW TEXT END " + "-" * 20)

            # Simulate cleaning logic from orchestrator (strip Markdown fences),
            # then try to parse manually to surface the exact error.
            cleaned = raw_text.strip()
            if cleaned.startswith("```json"):
                cleaned = cleaned[7:]
            elif cleaned.startswith("```"):
                cleaned = cleaned[3:]
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3]
            cleaned = cleaned.strip()

            try:
                json.loads(cleaned)
                print("✅ Manual Parsing Successful!")
            except json.JSONDecodeError as e:
                print(f"❌ Manual Parsing Failed: {e}")
                # Show context around the error position reported by the decoder.
                if hasattr(e, 'pos'):
                    start = max(0, e.pos - 20)
                    end = min(len(cleaned), e.pos + 20)
                    print(f" Context at error: ...{cleaned[start:end]}...")
    finally:
        conn.close()


if __name__ == "__main__":
    inspect_latest_meeting()
|
||||
@@ -98,13 +98,13 @@ services:
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- ./transcription-tool/backend:/app/backend
|
||||
- ./transcription-tool/frontend/dist:/app/frontend/dist # Mount Frontend Build for Live Updates
|
||||
- ./transcripts.db:/app/transcripts.db
|
||||
- ./uploads_audio:/app/uploads_audio
|
||||
- ./gemini_api_key.txt:/app/gemini_api_key.txt
|
||||
environment:
|
||||
PYTHONUNBUFFERED: "1"
|
||||
DATABASE_URL: "sqlite:////app/transcripts.db"
|
||||
GEMINI_API_KEY: "AIzaSyCFRmr1rOrkFKiEuh9GOCJNB2zfJsYmR68" # Placeholder, actual key is in file
|
||||
ports:
|
||||
- "8001:8001"
|
||||
|
||||
|
||||
@@ -99,6 +99,31 @@ async def upload_audio(
|
||||
|
||||
return meeting
|
||||
|
||||
@app.post("/api/meetings/{meeting_id}/retry")
def retry_meeting(
    meeting_id: int,
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db)
):
    """Re-run transcription for a meeting from the audio chunks already on disk.

    Raises:
        HTTPException: 404 if the meeting does not exist, 400 if its chunk
            directory is missing or empty (a re-upload is required).
    """
    meeting = db.query(Meeting).filter(Meeting.id == meeting_id).first()
    if meeting is None:
        raise HTTPException(404, detail="Meeting not found")

    # The retry reuses the chunk files produced by the original upload;
    # without them there is nothing to transcribe.
    chunks_path = os.path.join(settings.UPLOAD_DIR, "chunks", str(meeting_id))
    has_chunks = os.path.exists(chunks_path) and bool(os.listdir(chunks_path))
    if not has_chunks:
        raise HTTPException(400, detail="Original audio chunks not found. Please re-upload.")

    # Mark as queued before handing off to the background worker.
    meeting.status = "QUEUED"
    db.commit()

    # NOTE(review): imported lazily — presumably to avoid a circular import
    # between the app module and the orchestrator; confirm before hoisting.
    from .services.orchestrator import retry_meeting_task
    background_tasks.add_task(retry_meeting_task, meeting.id, SessionLocal)

    return {"status": "started", "message": "Retrying transcription..."}
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
class InsightRequest(BaseModel):
|
||||
@@ -201,9 +226,16 @@ def delete_meeting(meeting_id: int, db: Session = Depends(get_db)):
|
||||
|
||||
# Serve Frontend
|
||||
# This must be the last route definition to avoid catching API routes
|
||||
static_path = "/frontend_static"
|
||||
|
||||
# PRIORITY 1: Mounted Volume (Development / Live Update)
|
||||
static_path = "/app/frontend/dist"
|
||||
|
||||
# PRIORITY 2: Built-in Image Path (Production)
|
||||
if not os.path.exists(static_path):
|
||||
static_path = "/frontend_static"
|
||||
|
||||
# PRIORITY 3: Local Development (running python directly)
|
||||
if not os.path.exists(static_path):
|
||||
# Fallback for local development if not in Docker
|
||||
static_path = os.path.join(os.path.dirname(__file__), "../frontend/dist")
|
||||
|
||||
if os.path.exists(static_path):
|
||||
|
||||
@@ -19,6 +19,16 @@ def parse_time_to_seconds(time_str):
|
||||
return 0
|
||||
return 0
|
||||
|
||||
def clean_json_string(text):
    """Strip Markdown code-fence wrappers (```json ... ``` or ``` ... ```) from *text*.

    LLMs often wrap JSON output in fences; this returns the trimmed inner
    payload, or the trimmed input unchanged when no fence is present.
    """
    stripped = text.strip()
    # Remove at most one opening fence; check the longer marker first so
    # "```json" is not left half-stripped by the bare "```" branch.
    for fence in ("```json", "```"):
        if stripped.startswith(fence):
            stripped = stripped[len(fence):]
            break
    if stripped.endswith("```"):
        stripped = stripped[:-3]
    return stripped.strip()
||||
|
||||
def process_meeting_task(meeting_id: int, db_session_factory):
|
||||
db = db_session_factory()
|
||||
meeting = db.query(Meeting).filter(Meeting.id == meeting_id).first()
|
||||
@@ -50,7 +60,13 @@ def process_meeting_task(meeting_id: int, db_session_factory):
|
||||
# Parse JSON and Adjust Timestamps
|
||||
json_data = []
|
||||
try:
|
||||
raw_json = json.loads(result["raw_text"])
|
||||
cleaned_text = clean_json_string(result["raw_text"])
|
||||
raw_json = json.loads(cleaned_text)
|
||||
|
||||
# Check for wrapped structure (e.g. {"items": [...]}) if schema enforced it
|
||||
if isinstance(raw_json, dict) and "items" in raw_json:
|
||||
raw_json = raw_json["items"] # Extract inner list
|
||||
|
||||
if isinstance(raw_json, list):
|
||||
for entry in raw_json:
|
||||
seconds = parse_time_to_seconds(entry.get("time", "00:00"))
|
||||
@@ -63,7 +79,7 @@ def process_meeting_task(meeting_id: int, db_session_factory):
|
||||
entry["display_time"] = f"{h:02}:{m:02}:{s:02}"
|
||||
json_data.append(entry)
|
||||
except Exception as e:
|
||||
logger.error(f"JSON Parsing failed for chunk {i}: {e}")
|
||||
logger.error(f"JSON Parsing failed for chunk {i}: {e}. Raw text start: {result['raw_text'][:100]}")
|
||||
|
||||
# Save chunk result
|
||||
db_chunk = TranscriptChunk(
|
||||
@@ -89,3 +105,94 @@ def process_meeting_task(meeting_id: int, db_session_factory):
|
||||
db.commit()
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
def retry_meeting_task(meeting_id: int, db_session_factory):
    """
    Retries transcription using existing chunks on disk.
    Avoids re-splitting the original file.

    Args:
        meeting_id: Primary key of the Meeting row to re-process.
        db_session_factory: Zero-arg callable returning a new DB session
            (e.g. SessionLocal); the session is always closed on exit.
    """
    import os  # local import kept from the original; module top may not import os

    db = db_session_factory()
    meeting = None  # bound before try so the except handler can test it safely
    try:
        # BUG FIX: the meeting lookup previously happened before the
        # try/finally, so an early return leaked the session.
        meeting = db.query(Meeting).filter(Meeting.id == meeting_id).first()
        if not meeting:
            return

        transcriber = TranscriptionService()

        # 0. Validate the chunk directory before touching any state.
        chunk_dir = os.path.join(settings.UPLOAD_DIR, "chunks", str(meeting_id))
        if not os.path.exists(chunk_dir):
            logger.error(f"Chunk directory not found for meeting {meeting_id}")
            meeting.status = "ERROR"
            db.commit()
            return

        chunks = sorted(
            os.path.join(chunk_dir, f)
            for f in os.listdir(chunk_dir)
            if f.endswith(".mp3")
        )
        if not chunks:
            logger.error(f"No chunks found for meeting {meeting_id}")
            meeting.status = "ERROR"
            db.commit()
            return

        # Phase 1: clear old chunk rows so the retry fully replaces them.
        meeting.status = "RETRYING"
        db.query(TranscriptChunk).filter(TranscriptChunk.meeting_id == meeting_id).delete()
        db.commit()

        # Phase 2: transcribe each chunk, shifting timestamps by its offset.
        for i, chunk_path in enumerate(chunks):
            offset = i * settings.CHUNK_DURATION_SEC
            logger.info(f"Retrying chunk {i+1}/{len(chunks)} with offset {offset}s")

            result = transcriber.transcribe_chunk(chunk_path, offset)

            # Parse JSON and adjust timestamps (same logic as process_meeting_task).
            json_data = []
            try:
                # With response_schema, raw_text SHOULD already be valid JSON,
                # but keep the fence-stripping in case a model deviates.
                cleaned_text = clean_json_string(result["raw_text"])
                raw_json = json.loads(cleaned_text)

                # Unwrap {"items": [...]} if a schema wrapped the list.
                if isinstance(raw_json, dict) and "items" in raw_json:
                    raw_json = raw_json["items"]

                if isinstance(raw_json, list):
                    for entry in raw_json:
                        seconds = parse_time_to_seconds(entry.get("time", "00:00"))
                        absolute_seconds = seconds + offset
                        entry["absolute_seconds"] = absolute_seconds

                        h = int(absolute_seconds // 3600)
                        m = int((absolute_seconds % 3600) // 60)
                        s = int(absolute_seconds % 60)
                        entry["display_time"] = f"{h:02}:{m:02}:{s:02}"
                        json_data.append(entry)
            except Exception as e:
                logger.error(f"JSON Parsing failed for chunk {i}: {e}. Raw: {result['raw_text'][:100]}")

            # Save chunk result (json_content stays [] when parsing failed).
            db_chunk = TranscriptChunk(
                meeting_id=meeting.id,
                chunk_index=i,
                raw_text=result["raw_text"],
                json_content=json_data
            )
            db.add(db_chunk)
            db.commit()

        # Phase 3: finalize.
        meeting.status = "COMPLETED"
        db.commit()
        logger.info(f"Meeting {meeting.id} retry completed.")

    except Exception as e:
        logger.error(f"Error retrying meeting {meeting_id}: {e}", exc_info=True)
        # BUG FIX: roll back any half-applied work so the status update below
        # can commit even when the failure came from the DB session itself.
        db.rollback()
        if meeting is not None:
            meeting.status = "ERROR"
            db.commit()
    finally:
        db.close()
|
||||
|
||||
@@ -30,20 +30,17 @@ class TranscriptionService:
|
||||
if media_file.state == "FAILED":
|
||||
raise Exception("File processing failed at Gemini.")
|
||||
|
||||
# 3. Transcribe with Diarization and Timestamps
|
||||
# 3. Transcribe with Diarization and Timestamps (Plain Text Mode for Stability)
|
||||
prompt = """
|
||||
Transkribiere dieses Audio wortgetreu.
|
||||
Identifiziere die Sprecher (Speaker A, Speaker B, etc.).
|
||||
|
||||
Gib das Ergebnis als JSON-Liste zurück.
|
||||
Format:
|
||||
[
|
||||
{
|
||||
"time": "MM:SS",
|
||||
"speaker": "Speaker A",
|
||||
"text": "..."
|
||||
}
|
||||
]
|
||||
Gib das Ergebnis EXAKT in diesem Format zurück (pro Zeile ein Sprecherwechsel):
|
||||
[MM:SS] Speaker Name: Gesprochener Text...
|
||||
|
||||
Beispiel:
|
||||
[00:00] Speaker A: Hallo zusammen.
|
||||
[00:05] Speaker B: Guten Morgen.
|
||||
"""
|
||||
|
||||
logger.info(f"Generating transcription for {file_path}...")
|
||||
@@ -52,14 +49,46 @@ class TranscriptionService:
|
||||
contents=[media_file, prompt],
|
||||
config=types.GenerateContentConfig(
|
||||
temperature=0.1,
|
||||
response_mime_type="application/json"
|
||||
max_output_tokens=8192
|
||||
)
|
||||
)
|
||||
|
||||
# Cleanup: Delete file from Gemini storage
|
||||
self.client.files.delete(name=media_file.name)
|
||||
|
||||
|
||||
# Parse Plain Text to JSON
|
||||
structured_data = self.parse_transcript(response.text)
|
||||
import json
|
||||
return {
|
||||
"raw_text": response.text, # This is now a JSON string
|
||||
"raw_text": json.dumps(structured_data), # Return valid JSON string
|
||||
"offset": offset_seconds
|
||||
}
|
||||
|
||||
def parse_transcript(self, text: str) -> list:
    """
    Parses lines like '[00:12] Speaker A: Hello world' into structured JSON.

    Returns a list of {"time", "speaker", "text"} dicts; unmatched lines that
    do not open a new timestamp are appended to the previous entry's text.
    """
    import re
    # Accepts MM:SS or H:MM:SS timestamps in square brackets.
    line_re = re.compile(r"^\[(\d{1,2}:\d{2}(?::\d{2})?)\]\s*([^:]+):\s*(.+)$")

    entries = []
    for raw_line in text.strip().split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            continue

        m = line_re.match(stripped)
        if m is None:
            # Fallback: treat as a continuation of the previous utterance
            # unless it looks like a (malformed) timestamped line.
            if entries and not stripped.startswith("["):
                entries[-1]["text"] += " " + stripped
            continue

        timestamp, who, said = m.groups()
        entries.append({
            "time": timestamp,
            "speaker": who.strip(),
            "text": said.strip()
        })

    return entries
|
||||
|
||||
@@ -394,6 +394,20 @@ export default function App() {
|
||||
>
|
||||
<Share2 className="h-5 w-5" />
|
||||
</button>
|
||||
<button
|
||||
onClick={async () => {
|
||||
if(!confirm("Retry transcription using existing audio chunks? This will overwrite the current transcript.")) return;
|
||||
try {
|
||||
await axios.post(`${API_BASE}/meetings/${detailMeeting.id}/retry`);
|
||||
alert("Retry started. Please wait for completion.");
|
||||
fetchDetail(detailMeeting.id);
|
||||
} catch(e) { alert("Retry failed."); }
|
||||
}}
|
||||
className="text-orange-500 hover:bg-orange-50 dark:hover:bg-orange-900/20 p-2 rounded"
|
||||
title="Retry Transcription (Fix Format Issues)"
|
||||
>
|
||||
<Wand2 className="h-5 w-5" />
|
||||
</button>
|
||||
<button onClick={(e) => handleDeleteMeeting(e, detailMeeting.id)} className="text-red-500 hover:bg-red-50 dark:hover:bg-red-900/20 p-2 rounded"><Trash2 className="h-5 w-5" /></button>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
Reference in New Issue
Block a user