[30988f42] feat(transcription-tool): stabilize transcription with plain text parsing and add retry feature
This commit is contained in:
70
debug_transcription_raw.py
Normal file
70
debug_transcription_raw.py
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
import sqlite3
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
DB_PATH = "transcripts.db"
|
||||||
|
|
||||||
|
def inspect_latest_meeting():
|
||||||
|
if not os.path.exists(DB_PATH):
|
||||||
|
print(f"Error: Database file '{DB_PATH}' not found.")
|
||||||
|
return
|
||||||
|
|
||||||
|
conn = sqlite3.connect(DB_PATH)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
|
||||||
|
# Get latest meeting
|
||||||
|
cursor.execute("SELECT id, title, created_at FROM meetings ORDER BY created_at DESC LIMIT 1")
|
||||||
|
meeting = cursor.fetchone()
|
||||||
|
|
||||||
|
if not meeting:
|
||||||
|
print("No meetings found in DB.")
|
||||||
|
conn.close()
|
||||||
|
return
|
||||||
|
|
||||||
|
meeting_id, title, created_at = meeting
|
||||||
|
print(f"--- Inspecting Latest Meeting: ID {meeting_id} ('{title}') created at {created_at} ---")
|
||||||
|
|
||||||
|
# Get chunks for this meeting
|
||||||
|
cursor.execute("SELECT id, chunk_index, raw_text, json_content FROM transcript_chunks WHERE meeting_id = ? ORDER BY chunk_index", (meeting_id,))
|
||||||
|
chunks = cursor.fetchall()
|
||||||
|
|
||||||
|
if not chunks:
|
||||||
|
print("No chunks found for this meeting.")
|
||||||
|
|
||||||
|
for chunk in chunks:
|
||||||
|
chunk_id, idx, raw_text, json_content = chunk
|
||||||
|
print(f"\n[Chunk {idx} (ID: {chunk_id})]")
|
||||||
|
|
||||||
|
print(f"Stored JSON Content (Length): {len(json.loads(json_content)) if json_content else 'None/Empty'}")
|
||||||
|
|
||||||
|
print("-" * 20 + " RAW TEXT START " + "-" * 20)
|
||||||
|
print(raw_text[:500]) # Print first 500 chars
|
||||||
|
print("..." if len(raw_text) > 500 else "")
|
||||||
|
print("-" * 20 + " RAW TEXT END " + "-" * 20)
|
||||||
|
|
||||||
|
# Try to parse manually to see error
|
||||||
|
try:
|
||||||
|
# Simulate cleaning logic from orchestrator
|
||||||
|
cleaned = raw_text.strip()
|
||||||
|
if cleaned.startswith("```json"):
|
||||||
|
cleaned = cleaned[7:]
|
||||||
|
elif cleaned.startswith("```"):
|
||||||
|
cleaned = cleaned[3:]
|
||||||
|
if cleaned.endswith("```"):
|
||||||
|
cleaned = cleaned[:-3]
|
||||||
|
cleaned = cleaned.strip()
|
||||||
|
|
||||||
|
parsed = json.loads(cleaned)
|
||||||
|
print("✅ Manual Parsing Successful!")
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
print(f"❌ Manual Parsing Failed: {e}")
|
||||||
|
# Show context around error
|
||||||
|
if hasattr(e, 'pos'):
|
||||||
|
start = max(0, e.pos - 20)
|
||||||
|
end = min(len(cleaned), e.pos + 20)
|
||||||
|
print(f" Context at error: ...{cleaned[start:end]}...")
|
||||||
|
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
inspect_latest_meeting()
|
||||||
@@ -98,13 +98,13 @@ services:
|
|||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
volumes:
|
volumes:
|
||||||
- ./transcription-tool/backend:/app/backend
|
- ./transcription-tool/backend:/app/backend
|
||||||
|
- ./transcription-tool/frontend/dist:/app/frontend/dist # Mount Frontend Build for Live Updates
|
||||||
- ./transcripts.db:/app/transcripts.db
|
- ./transcripts.db:/app/transcripts.db
|
||||||
- ./uploads_audio:/app/uploads_audio
|
- ./uploads_audio:/app/uploads_audio
|
||||||
- ./gemini_api_key.txt:/app/gemini_api_key.txt
|
- ./gemini_api_key.txt:/app/gemini_api_key.txt
|
||||||
environment:
|
environment:
|
||||||
PYTHONUNBUFFERED: "1"
|
PYTHONUNBUFFERED: "1"
|
||||||
DATABASE_URL: "sqlite:////app/transcripts.db"
|
DATABASE_URL: "sqlite:////app/transcripts.db"
|
||||||
GEMINI_API_KEY: "REDACTED_ROTATE_THIS_KEY" # SECURITY: a real-looking API key was committed here — rotate it immediately and load the key from gemini_api_key.txt or an environment variable instead of hardcoding it
|
|
||||||
ports:
|
ports:
|
||||||
- "8001:8001"
|
- "8001:8001"
|
||||||
|
|
||||||
|
|||||||
@@ -99,6 +99,31 @@ async def upload_audio(
|
|||||||
|
|
||||||
return meeting
|
return meeting
|
||||||
|
|
||||||
|
@app.post("/api/meetings/{meeting_id}/retry")
def retry_meeting(
    meeting_id: int,
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db)
):
    """Re-run transcription for an existing meeting from its on-disk chunks.

    Responds 404 when the meeting does not exist and 400 when the original
    chunk files are missing (a fresh upload is required in that case).
    The actual work runs in a background task.
    """
    meeting = db.query(Meeting).filter(Meeting.id == meeting_id).first()
    if meeting is None:
        raise HTTPException(404, detail="Meeting not found")

    # The retry path reuses the chunk files produced by the original upload;
    # without them there is nothing left to transcribe.
    chunk_dir = os.path.join(settings.UPLOAD_DIR, "chunks", str(meeting_id))
    if not os.path.exists(chunk_dir) or not os.listdir(chunk_dir):
        raise HTTPException(400, detail="Original audio chunks not found. Please re-upload.")

    # Flag the meeting as queued before handing off to the worker.
    meeting.status = "QUEUED"
    db.commit()

    # Imported locally — presumably to avoid a circular import at module
    # load time (orchestrator imports from this module); confirm.
    from .services.orchestrator import retry_meeting_task
    background_tasks.add_task(retry_meeting_task, meeting.id, SessionLocal)

    return {"status": "started", "message": "Retrying transcription..."}
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
class InsightRequest(BaseModel):
|
class InsightRequest(BaseModel):
|
||||||
@@ -201,9 +226,16 @@ def delete_meeting(meeting_id: int, db: Session = Depends(get_db)):
|
|||||||
|
|
||||||
# Serve Frontend
|
# Serve Frontend
|
||||||
# This must be the last route definition to avoid catching API routes
|
# This must be the last route definition to avoid catching API routes
|
||||||
static_path = "/frontend_static"
|
|
||||||
|
# PRIORITY 1: Mounted Volume (Development / Live Update)
|
||||||
|
static_path = "/app/frontend/dist"
|
||||||
|
|
||||||
|
# PRIORITY 2: Built-in Image Path (Production)
|
||||||
|
if not os.path.exists(static_path):
|
||||||
|
static_path = "/frontend_static"
|
||||||
|
|
||||||
|
# PRIORITY 3: Local Development (running python directly)
|
||||||
if not os.path.exists(static_path):
|
if not os.path.exists(static_path):
|
||||||
# Fallback for local development if not in Docker
|
|
||||||
static_path = os.path.join(os.path.dirname(__file__), "../frontend/dist")
|
static_path = os.path.join(os.path.dirname(__file__), "../frontend/dist")
|
||||||
|
|
||||||
if os.path.exists(static_path):
|
if os.path.exists(static_path):
|
||||||
|
|||||||
@@ -19,6 +19,16 @@ def parse_time_to_seconds(time_str):
|
|||||||
return 0
|
return 0
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
def clean_json_string(text):
    """Return *text* with a surrounding Markdown code fence removed.

    Strips a leading ```json (or bare ```) opener and, independently, a
    trailing ``` closer, trimming whitespace on both sides. Text without
    fences passes through unchanged apart from the trim.
    """
    stripped = text.strip()
    # Check the longer opener first so "```json" is not mistaken for "```".
    for opener in ("```json", "```"):
        if stripped.startswith(opener):
            stripped = stripped[len(opener):]
            break
    if stripped.endswith("```"):
        stripped = stripped[:-3]
    return stripped.strip()
|
||||||
def process_meeting_task(meeting_id: int, db_session_factory):
|
def process_meeting_task(meeting_id: int, db_session_factory):
|
||||||
db = db_session_factory()
|
db = db_session_factory()
|
||||||
meeting = db.query(Meeting).filter(Meeting.id == meeting_id).first()
|
meeting = db.query(Meeting).filter(Meeting.id == meeting_id).first()
|
||||||
@@ -50,7 +60,13 @@ def process_meeting_task(meeting_id: int, db_session_factory):
|
|||||||
# Parse JSON and Adjust Timestamps
|
# Parse JSON and Adjust Timestamps
|
||||||
json_data = []
|
json_data = []
|
||||||
try:
|
try:
|
||||||
raw_json = json.loads(result["raw_text"])
|
cleaned_text = clean_json_string(result["raw_text"])
|
||||||
|
raw_json = json.loads(cleaned_text)
|
||||||
|
|
||||||
|
# Check for wrapped structure (e.g. {"items": [...]}) if schema enforced it
|
||||||
|
if isinstance(raw_json, dict) and "items" in raw_json:
|
||||||
|
raw_json = raw_json["items"] # Extract inner list
|
||||||
|
|
||||||
if isinstance(raw_json, list):
|
if isinstance(raw_json, list):
|
||||||
for entry in raw_json:
|
for entry in raw_json:
|
||||||
seconds = parse_time_to_seconds(entry.get("time", "00:00"))
|
seconds = parse_time_to_seconds(entry.get("time", "00:00"))
|
||||||
@@ -63,7 +79,7 @@ def process_meeting_task(meeting_id: int, db_session_factory):
|
|||||||
entry["display_time"] = f"{h:02}:{m:02}:{s:02}"
|
entry["display_time"] = f"{h:02}:{m:02}:{s:02}"
|
||||||
json_data.append(entry)
|
json_data.append(entry)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"JSON Parsing failed for chunk {i}: {e}")
|
logger.error(f"JSON Parsing failed for chunk {i}: {e}. Raw text start: {result['raw_text'][:100]}")
|
||||||
|
|
||||||
# Save chunk result
|
# Save chunk result
|
||||||
db_chunk = TranscriptChunk(
|
db_chunk = TranscriptChunk(
|
||||||
@@ -89,3 +105,94 @@ def process_meeting_task(meeting_id: int, db_session_factory):
|
|||||||
db.commit()
|
db.commit()
|
||||||
finally:
|
finally:
|
||||||
db.close()
|
db.close()
|
||||||
|
|
||||||
|
def retry_meeting_task(meeting_id: int, db_session_factory):
    """
    Retries transcription using existing chunks on disk.
    Avoids re-splitting the original file.
    """
    db = db_session_factory()
    meeting = db.query(Meeting).filter(Meeting.id == meeting_id).first()
    if not meeting:
        return

    try:
        import os
        transcriber = TranscriptionService()

        # 0. Validate Chunk Directory
        chunk_dir = os.path.join(settings.UPLOAD_DIR, "chunks", str(meeting_id))
        if not os.path.exists(chunk_dir):
            logger.error(f"Chunk directory not found for meeting {meeting_id}")
            meeting.status = "ERROR"
            db.commit()
            return

        chunk_paths = sorted(
            os.path.join(chunk_dir, name)
            for name in os.listdir(chunk_dir)
            if name.endswith(".mp3")
        )
        if not chunk_paths:
            logger.error(f"No chunks found for meeting {meeting_id}")
            meeting.status = "ERROR"
            db.commit()
            return

        # Phase 1: drop any previously stored transcript chunks so the retry
        # replaces the old (possibly broken) transcript.
        meeting.status = "RETRYING"
        db.query(TranscriptChunk).filter(TranscriptChunk.meeting_id == meeting_id).delete()
        db.commit()

        # Phase 2: transcribe every chunk again.
        all_text = []
        for i, chunk_path in enumerate(chunk_paths):
            # Chunks are fixed-length, so the absolute offset is index-based.
            offset = i * settings.CHUNK_DURATION_SEC
            logger.info(f"Retrying chunk {i+1}/{len(chunk_paths)} with offset {offset}s")

            result = transcriber.transcribe_chunk(chunk_path, offset)

            # Parse JSON and adjust timestamps (same logic as process_meeting_task).
            json_data = []
            try:
                # With response_schema, raw_text SHOULD be valid JSON directly,
                # but clean_json_string is kept in case specific models deviate.
                cleaned_text = clean_json_string(result["raw_text"])
                raw_json = json.loads(cleaned_text)

                # Unwrap {"items": [...]} if the schema enforced a wrapper object.
                if isinstance(raw_json, dict) and "items" in raw_json:
                    raw_json = raw_json["items"]

                if isinstance(raw_json, list):
                    for entry in raw_json:
                        seconds = parse_time_to_seconds(entry.get("time", "00:00"))
                        absolute_seconds = seconds + offset
                        entry["absolute_seconds"] = absolute_seconds

                        # Human-readable HH:MM:SS for the frontend.
                        h = int(absolute_seconds // 3600)
                        m = int((absolute_seconds % 3600) // 60)
                        s = int(absolute_seconds % 60)
                        entry["display_time"] = f"{h:02}:{m:02}:{s:02}"
                        json_data.append(entry)
            except Exception as e:
                logger.error(f"JSON Parsing failed for chunk {i}: {e}. Raw: {result['raw_text'][:100]}")

            # Persist the chunk result (json_data stays empty on parse failure,
            # but the raw text is kept for later inspection).
            db.add(TranscriptChunk(
                meeting_id=meeting.id,
                chunk_index=i,
                raw_text=result["raw_text"],
                json_content=json_data,
            ))
            all_text.append(result["raw_text"])
            db.commit()

        # Phase 3: Finalize
        meeting.status = "COMPLETED"
        db.commit()
        logger.info(f"Meeting {meeting.id} retry completed.")

    except Exception as e:
        logger.error(f"Error retrying meeting {meeting_id}: {e}", exc_info=True)
        meeting.status = "ERROR"
        db.commit()
    finally:
        db.close()
|
|||||||
@@ -30,20 +30,17 @@ class TranscriptionService:
|
|||||||
if media_file.state == "FAILED":
|
if media_file.state == "FAILED":
|
||||||
raise Exception("File processing failed at Gemini.")
|
raise Exception("File processing failed at Gemini.")
|
||||||
|
|
||||||
# 3. Transcribe with Diarization and Timestamps
|
# 3. Transcribe with Diarization and Timestamps (Plain Text Mode for Stability)
|
||||||
prompt = """
|
prompt = """
|
||||||
Transkribiere dieses Audio wortgetreu.
|
Transkribiere dieses Audio wortgetreu.
|
||||||
Identifiziere die Sprecher (Speaker A, Speaker B, etc.).
|
Identifiziere die Sprecher (Speaker A, Speaker B, etc.).
|
||||||
|
|
||||||
Gib das Ergebnis als JSON-Liste zurück.
|
Gib das Ergebnis EXAKT in diesem Format zurück (pro Zeile ein Sprecherwechsel):
|
||||||
Format:
|
[MM:SS] Speaker Name: Gesprochener Text...
|
||||||
[
|
|
||||||
{
|
Beispiel:
|
||||||
"time": "MM:SS",
|
[00:00] Speaker A: Hallo zusammen.
|
||||||
"speaker": "Speaker A",
|
[00:05] Speaker B: Guten Morgen.
|
||||||
"text": "..."
|
|
||||||
}
|
|
||||||
]
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
logger.info(f"Generating transcription for {file_path}...")
|
logger.info(f"Generating transcription for {file_path}...")
|
||||||
@@ -52,14 +49,46 @@ class TranscriptionService:
|
|||||||
contents=[media_file, prompt],
|
contents=[media_file, prompt],
|
||||||
config=types.GenerateContentConfig(
|
config=types.GenerateContentConfig(
|
||||||
temperature=0.1,
|
temperature=0.1,
|
||||||
response_mime_type="application/json"
|
max_output_tokens=8192
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Cleanup: Delete file from Gemini storage
|
# Cleanup: Delete file from Gemini storage
|
||||||
self.client.files.delete(name=media_file.name)
|
self.client.files.delete(name=media_file.name)
|
||||||
|
|
||||||
|
# Parse Plain Text to JSON
|
||||||
|
structured_data = self.parse_transcript(response.text)
|
||||||
|
import json
|
||||||
return {
|
return {
|
||||||
"raw_text": response.text, # This is now a JSON string
|
"raw_text": json.dumps(structured_data), # Return valid JSON string
|
||||||
"offset": offset_seconds
|
"offset": offset_seconds
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def parse_transcript(self, text: str) -> list:
    """
    Parses lines like '[00:12] Speaker A: Hello world' into structured JSON.

    Returns a list of {"time", "speaker", "text"} dicts. Lines without a
    leading [timestamp] are treated as continuations of the previous entry;
    unmatched lines that start with '[' are dropped.
    """
    import re

    # Matches "[MM:SS] Speaker: Text" and also "[H:MM:SS]" timestamps.
    line_re = re.compile(r"^\[(\d{1,2}:\d{2}(?::\d{2})?)\]\s*([^:]+):\s*(.+)$")

    entries = []
    for raw_line in text.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        m = line_re.match(line)
        if m is not None:
            time_str, speaker, content = m.groups()
            entries.append({
                "time": time_str,
                "speaker": speaker.strip(),
                "text": content.strip(),
            })
        elif entries and not line.startswith("["):
            # No timestamp: assume the model wrapped the previous speaker's
            # sentence onto a new line and glue it back on.
            entries[-1]["text"] += " " + line

    return entries
|
||||||
|
|||||||
@@ -394,6 +394,20 @@ export default function App() {
|
|||||||
>
|
>
|
||||||
<Share2 className="h-5 w-5" />
|
<Share2 className="h-5 w-5" />
|
||||||
</button>
|
</button>
|
||||||
|
<button
|
||||||
|
onClick={async () => {
|
||||||
|
if(!confirm("Retry transcription using existing audio chunks? This will overwrite the current transcript.")) return;
|
||||||
|
try {
|
||||||
|
await axios.post(`${API_BASE}/meetings/${detailMeeting.id}/retry`);
|
||||||
|
alert("Retry started. Please wait for completion.");
|
||||||
|
fetchDetail(detailMeeting.id);
|
||||||
|
} catch(e) { alert("Retry failed."); }
|
||||||
|
}}
|
||||||
|
className="text-orange-500 hover:bg-orange-50 dark:hover:bg-orange-900/20 p-2 rounded"
|
||||||
|
title="Retry Transcription (Fix Format Issues)"
|
||||||
|
>
|
||||||
|
<Wand2 className="h-5 w-5" />
|
||||||
|
</button>
|
||||||
<button onClick={(e) => handleDeleteMeeting(e, detailMeeting.id)} className="text-red-500 hover:bg-red-50 dark:hover:bg-red-900/20 p-2 rounded"><Trash2 className="h-5 w-5" /></button>
|
<button onClick={(e) => handleDeleteMeeting(e, detailMeeting.id)} className="text-red-500 hover:bg-red-50 dark:hover:bg-red-900/20 p-2 rounded"><Trash2 className="h-5 w-5" /></button>
|
||||||
</div>
|
</div>
|
||||||
</header>
|
</header>
|
||||||
|
|||||||
Binary file not shown.
BIN
uploads_audio/chunks/6/chunk_001.mp3
Normal file
BIN
uploads_audio/chunks/6/chunk_001.mp3
Normal file
Binary file not shown.
Reference in New Issue
Block a user