mirror of
https://github.com/bellingcat/whisperbox-transcribe.git
synced 2026-06-07 19:18:35 +03:00
81 lines
1.6 KiB
Python
81 lines
1.6 KiB
Python
from datetime import datetime
|
|
from uuid import UUID
|
|
|
|
from pydantic import AnyHttpUrl, BaseModel, Field
|
|
|
|
from app.shared.db.models import ArtifactType, JobStatus, JobType
|
|
|
|
# JSON field types
|
|
|
|
|
|
class JobConfig(BaseModel):
|
|
"""Configuration for a job."""
|
|
|
|
language: str | None = Field(
|
|
description=(
|
|
"Spoken language in the media file. "
|
|
"While optional, this can improve output."
|
|
)
|
|
)
|
|
|
|
|
|
class JobMeta(BaseModel):
|
|
"""Metadata relating to a job's execution."""
|
|
|
|
error: str | None = Field(
|
|
description="Will contain a descriptive error message if processing failed."
|
|
)
|
|
|
|
task_id: UUID | None = Field(
|
|
description="Internal celery id of this job submission."
|
|
)
|
|
|
|
|
|
class RawTranscript(BaseModel):
|
|
"""A single transcript passage returned by whisper."""
|
|
|
|
id: int
|
|
seek: int
|
|
start: float
|
|
end: float
|
|
text: str
|
|
tokens: list[int]
|
|
temperature: float
|
|
avg_logprob: float
|
|
compression_ratio: float
|
|
no_speech_prob: float
|
|
|
|
|
|
class LanguageDetection(BaseModel):
|
|
"""A language detection"""
|
|
|
|
code: str
|
|
|
|
|
|
# DB objects
|
|
|
|
|
|
class WithDbFields(BaseModel):
|
|
id: UUID
|
|
created_at: datetime
|
|
updated_at: datetime | None
|
|
|
|
class Config:
|
|
orm_mode = True
|
|
|
|
|
|
class Job(WithDbFields):
|
|
"""A transcription job for one media file."""
|
|
|
|
status: JobStatus
|
|
type: JobType
|
|
url: AnyHttpUrl
|
|
meta: JobMeta | None
|
|
config: JobConfig | None
|
|
|
|
|
|
class Artifact(WithDbFields):
|
|
job_id: UUID
|
|
data: LanguageDetection | RawTranscript | None
|
|
type: ArtifactType
|