mirror of
https://github.com/bellingcat/whisperbox-transcribe.git
synced 2026-06-13 05:58:35 +03:00
refactor: remove shared schemas
This commit is contained in:
@@ -1,12 +1,14 @@
|
||||
import enum
|
||||
import uuid
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
from sqlalchemy import JSON, VARCHAR, Column, DateTime, Enum, ForeignKey, String, func
|
||||
from sqlalchemy.dialects.postgresql import UUID
|
||||
from sqlalchemy.orm import Mapped, declarative_base, declarative_mixin, declared_attr
|
||||
|
||||
# Declarative base class, created once per module via declarative_base().
Base = declarative_base()
|
||||
|
||||
|
||||
# Enums
|
||||
|
||||
|
||||
@@ -32,7 +34,55 @@ class ArtifactType(str, enum.Enum):
|
||||
language_detection = "language_detection"
|
||||
|
||||
|
||||
# SQLAlchemy models
|
||||
# JSON field types
|
||||
|
||||
|
||||
class JobConfig(BaseModel):
    """(JSON) Configuration for a job.

    Every field is optional and defaults to ``None`` when omitted.
    """

    # The default is spelled out instead of being inferred from the
    # ``| None`` annotation, so the field stays optional regardless of how
    # the installed pydantic version treats annotation-only optionals.
    language: str | None = Field(
        default=None,
        description=(
            "Spoken language in the media file. "
            "While optional, this can improve output."
        ),
    )
|
||||
|
||||
|
||||
class JobMeta(BaseModel):
    """(JSON) Metadata relating to a job's execution.

    Every field is optional and defaults to ``None`` when omitted.
    """

    # Defaults are explicit rather than inferred from the ``| None``
    # annotation, so both fields stay optional regardless of how the
    # installed pydantic version treats annotation-only optionals.
    error: str | None = Field(
        default=None,
        description="Will contain a descriptive error message if processing failed.",
    )

    task_id: uuid.UUID | None = Field(
        default=None,
        description="Internal celery id of this job submission.",
    )
|
||||
|
||||
|
||||
class RawTranscript(BaseModel):
    """(JSON) One passage of a transcript, as returned by whisper.

    All fields are required.
    """

    # NOTE(review): the field names appear to mirror whisper's per-segment
    # output — confirm against the whisper version in use before renaming
    # or reordering anything here.
    id: int
    seek: int
    start: float
    end: float
    text: str
    tokens: list[int]
    temperature: float
    avg_logprob: float
    compression_ratio: float
    no_speech_prob: float
|
||||
|
||||
|
||||
class LanguageDetection(BaseModel):
    """(JSON) Result of a language detection."""

    # presumably the code of the detected language — TODO confirm its format
    language_code: str
|
||||
|
||||
|
||||
# Sum type for all possible artifact data values
|
||||
# NOTE(review): union member order may matter when pydantic validates a value
# against this type — confirm before reordering the members.
ArtifactData = list[RawTranscript] | LanguageDetection | None
|
||||
|
||||
|
||||
@declarative_mixin
|
||||
|
||||
@@ -1,80 +0,0 @@
|
||||
from datetime import datetime
|
||||
from uuid import UUID
|
||||
|
||||
from pydantic import AnyHttpUrl, BaseModel, Field
|
||||
|
||||
from app.shared.db.models import ArtifactType, JobStatus, JobType
|
||||
|
||||
# JSON field types
|
||||
|
||||
|
||||
class JobConfig(BaseModel):
    """Configuration for a job.

    Every field is optional and defaults to ``None`` when omitted.
    """

    # The default is spelled out instead of being inferred from the
    # ``| None`` annotation, so the field stays optional regardless of how
    # the installed pydantic version treats annotation-only optionals.
    language: str | None = Field(
        default=None,
        description=(
            "Spoken language in the media file. "
            "While optional, this can improve output."
        ),
    )
|
||||
|
||||
|
||||
class JobMeta(BaseModel):
    """Metadata relating to a job's execution.

    Every field is optional and defaults to ``None`` when omitted.
    """

    # Defaults are explicit rather than inferred from the ``| None``
    # annotation, so both fields stay optional regardless of how the
    # installed pydantic version treats annotation-only optionals.
    error: str | None = Field(
        default=None,
        description="Will contain a descriptive error message if processing failed.",
    )

    task_id: UUID | None = Field(
        default=None,
        description="Internal celery id of this job submission.",
    )
|
||||
|
||||
|
||||
class RawTranscript(BaseModel):
    """One passage of a transcript, as returned by whisper.

    All fields are required.
    """

    # NOTE(review): the field names appear to mirror whisper's per-segment
    # output — confirm against the whisper version in use before renaming
    # or reordering anything here.
    id: int
    seek: int
    start: float
    end: float
    text: str
    tokens: list[int]
    temperature: float
    avg_logprob: float
    compression_ratio: float
    no_speech_prob: float
|
||||
|
||||
|
||||
class LanguageDetection(BaseModel):
    """Result of a language detection."""

    # presumably the code of the detected language — TODO confirm its format
    code: str
|
||||
|
||||
|
||||
# DB objects
|
||||
|
||||
|
||||
class WithDbFields(BaseModel):
    """Base schema holding the identifier and timestamp fields shared by DB objects."""

    id: UUID
    created_at: datetime
    # May be None; only the creation timestamp is mandatory.
    updated_at: datetime | None

    class Config:
        # pydantic setting: allow building this schema from ORM objects.
        orm_mode = True
|
||||
|
||||
|
||||
class Job(WithDbFields):
    """Schema of a transcription job; each job covers one media file."""

    # Status and type use the definitions imported from app.shared.db.models.
    status: JobStatus
    type: JobType

    # presumably the location of the media file — confirm with the API layer
    url: AnyHttpUrl

    # Optional JSON payloads; either may be None.
    meta: JobMeta | None
    config: JobConfig | None
|
||||
|
||||
|
||||
class Artifact(WithDbFields):
    """Schema of an artifact belonging to a job."""

    job_id: UUID

    # RawTranscript models a single transcript passage, so transcript data is
    # a list of passages rather than one passage. A language detection is a
    # single object, and the payload may also be absent.
    data: LanguageDetection | list[RawTranscript] | None

    type: ArtifactType
|
||||
Reference in New Issue
Block a user