docs: improve documentation

This commit is contained in:
Felix Spöttel
2023-03-01 17:55:06 +01:00
parent 7ece7944bf
commit ff8bd2547c
15 changed files with 106 additions and 58 deletions

View File

@@ -1,6 +1,6 @@
API_SECRET="a_very_secret_token"
DOMAIN="whisperbox.localhost"
DOMAIN="whisperbox-transcribe.localhost"
WHISPER_MODEL="tiny"
ENVIRONMENT="development"
DATABASE_URI="sqlite:///./whisperbox.sqlite"
DATABASE_URI="sqlite:///./whisperbox-transcribe.sqlite"
BROKER_URL="redis://redis:6379/0"

View File

@@ -1,4 +1,4 @@
API_SECRET="change_me"
WHISPER_MODEL="small"
DOMAIN="whisperbox.localhost"
DATABASE_URI="sqlite:///etc/whisperbox/data/whisperbox.sqlite"
DOMAIN="whisperbox-transcribe.localhost"
DATABASE_URI="sqlite:///etc/whisperbox-transcribe/data/whisperbox-transcribe.sqlite"

2
.gitignore vendored
View File

@@ -163,4 +163,4 @@ cython_debug/
.vscode
.DS_Store
whisperbox.sqlite*
whisperbox-transcribe.sqlite*

View File

@@ -1,4 +1,4 @@
# whisper-api
# whisperbox-transcribe
> HTTP wrapper around [openai/whisper](https://github.com/openai/whisper).
@@ -12,9 +12,11 @@ OpenAPI documentation can be accessed via `<service_url>/docs`.
## Develop
It is recommended to set up a virtual environment for python tooling. To install dependencies in your virtual env, run `pip install -e .[tooling,web,worker]`
[docker compose](https://docs.docker.com/get-started/08_using_compose/) is required for local development.
[docker-compose](https://docs.docker.com/get-started/08_using_compose/) is required for local development. Configuration such as `API_SECRET` can be adjusted in `./docker/dev/docker-compose.yml`.
It is recommended to set up a virtual environment for Python tooling. To install dependencies in your virtual env, run `pip install -e .[tooling,web,worker]`.
Copy `.env.test` to `.env` to configure the service.
### Start
@@ -26,10 +28,10 @@ Builds and starts the docker containers.
```
# Bindings
http://localhost:5555 => Celery dashboard
http://localhost:8000 => API
http://localhost:8000/docs => API docs
./whisperbox.sqlite => Database
http://localhost:5555 => Celery dashboard
http://whisperbox-transcribe.localhost => API
http://whisperbox-transcribe.localhost/docs => API docs
./whisperbox-transcribe.sqlite => Database
```
## Destroy

View File

@@ -40,9 +40,8 @@ class JobConfig(BaseModel):
# TODO: limit to locales selected by whisper.
language: Optional[str] = Field(
description=(
"Spoken language in the media file."
"While optional, this can improve output "
"by selecting a language-specific model. (applies to 'en')"
"Spoken language in the media file. "
"While optional, this can improve output."
)
)

View File

@@ -1,4 +1,4 @@
from typing import Any, Dict, Optional
from typing import Any, Dict, Optional, Union
from pydantic import AnyHttpUrl, BaseModel, Field
@@ -9,7 +9,7 @@ class DetailResponse(BaseModel):
detail: str
DEFAULT_RESPONSES: Dict[int | str, Dict[str, Any]] = {
DEFAULT_RESPONSES: Dict[Union[int, str], Dict[str, Any]] = {
401: {"model": DetailResponse, "description": "Not authenticated"}
}
@@ -21,13 +21,17 @@ class PostJobPayload(BaseModel):
)
)
type: schemas.JobType = Field(description="Type of this job.")
type: schemas.JobType = Field(
description="""Type of this job.
`transcript` uses the original language of the audio.
`translation` creates an automatic translation to English.
`language_detection` detects language from the first 30 seconds of audio."""
)
# TODO: limit to locales selected by whisper.
language: Optional[str] = Field(
description=(
"Spoken language in the media file."
"While optional, this can improve output "
"by selecting a language-specific model. (applies to 'en')"
"Spoken language in the media file. "
"While optional, this can improve output when set."
)
)

View File

@@ -13,7 +13,10 @@ from app.shared.db.base import get_session
from app.web.dtos import DEFAULT_RESPONSES, DetailResponse, PostJobPayload
from app.web.security import authenticate_api_key
app = FastAPI()
app = FastAPI(
description="whisperbox-transcribe is an async HTTP wrapper for openai/whisper.",
title="whisperbox-transcribe",
)
celery = get_celery_binding()
@@ -38,11 +41,28 @@ def api_root() -> None:
return None
@api_router.post("/jobs", response_model=schemas.Job, status_code=201)
@api_router.post(
"/jobs",
response_model=schemas.Job,
status_code=201,
summary="Enqueue a new job",
)
def create_job(
payload: PostJobPayload,
session: Session = Depends(get_session),
) -> models.Job:
"""
Enqueue a new whisper job for processing.
Notes:
* Jobs are processed one-by-one in order of creation.
* `payload.url` needs to point directly to a media file.
* The media file is downloaded to a tmp file for the duration of processing.
Enough free space needs to be available on disk.
* Media files ideally are audio files with a sampling rate of 16kHz.
Other files will be transcoded automatically via ffmpeg, which might
consume considerable resources while active.
* Once a job is created, you can query its status by its id.
"""
# create a job with status "create" and save it to the database.
job = models.Job(
url=payload.url,
@@ -64,10 +84,13 @@ def create_job(
return job
@api_router.get("/jobs", response_model=List[schemas.Job])
@api_router.get(
"/jobs", response_model=List[schemas.Job], summary="Get metadata for all jobs"
)
def get_transcripts(
type: Optional[schemas.JobType] = None, session: Session = Depends(get_session)
) -> List[models.Job]:
"""Get metadata for all jobs."""
query = session.query(models.Job)
if type:
@@ -79,21 +102,33 @@ def get_transcripts(
@api_router.get(
"/jobs/{id}",
response_model=schemas.Job,
responses={404: {"model": DetailResponse, "description": "Not authenticated"}},
responses={404: {"model": DetailResponse, "description": "Not found"}},
summary="Get metadata for one job",
)
def get_transcript(
id: UUID = Path(), session: Session = Depends(get_session)
) -> Optional[models.Job]:
"""
Use this route to check transcription status of any given job.
"""
job = session.query(models.Job).filter(models.Job.id == str(id)).one_or_none()
if not job:
raise HTTPException(status_code=404)
return job
@api_router.get("/jobs/{id}/artifacts", response_model=List[schemas.Artifact])
@api_router.get(
"/jobs/{id}/artifacts",
response_model=List[schemas.Artifact],
summary="Get all artifacts for one job",
)
def get_artifacts_for_job(
id: UUID = Path(), session: Session = Depends(get_session)
) -> List[models.Artifact]:
"""
Right now, there is only one type of artifact (`raw_transcript`).
Returns an empty array for unfinished or non-existent jobs.
"""
artifacts = (
session.query(models.Artifact).filter(models.Artifact.job_id == str(id))
).all()
@@ -101,10 +136,13 @@ def get_artifacts_for_job(
return artifacts
@api_router.delete("/jobs/{id}", status_code=204)
@api_router.delete(
"/jobs/{id}", status_code=204, summary="Delete a job with all artifacts"
)
def delete_transcript(
id: UUID = Path(), session: Session = Depends(get_session)
) -> None:
"""Remove metadata and artifacts for a single job."""
session.query(models.Job).filter(models.Job.id == str(id)).delete()
return None

View File

@@ -13,6 +13,7 @@ from app.worker.strategies.local import LocalStrategy
celery = get_celery_binding()
class TranscribeTask(Task):
abstract = True
@@ -29,9 +30,7 @@ class TranscribeTask(Task):
return self.run(*args, **kwargs)
@celery.task(
base=TranscribeTask, bind=True, soft_time_limit=2 * 60 * 60
)
@celery.task(base=TranscribeTask, bind=True, soft_time_limit=2 * 60 * 60)
def transcribe(self: Task, job_id: UUID) -> None:
try:
# runs in a separate thread => requires sqlite's WAL mode to be enabled.

View File

@@ -1,4 +1,5 @@
version: "3.8"
name: whisperbox-transcribe
services:
traefik:
@@ -9,7 +10,7 @@ services:
command:
- "--providers.docker=true"
- "--providers.docker.exposedbydefault=false"
- "--providers.docker.network=whisperbox_transcription_traefik"
- "--providers.docker.network=whisperbox-transcribe_traefik"
- "--entrypoints.web.address=:80"
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro

View File

@@ -2,28 +2,29 @@ version: "3.8"
services:
traefik:
container_name: whisperbox_traefik_dev
container_name: whisperbox-transcribe_traefik_dev
redis:
container_name: whisperbox_redis_dev
container_name: whisperbox-transcribe_redis_dev
web:
container_name: whisperbox_web_dev
container_name: whisperbox-transcribe_web_dev
env_file: .env
command: bash -c "alembic upgrade head && uvicorn app.web.main:app --reload --host ${HOST:-0.0.0.0} --port ${PORT:-8000} --log-level info"
volumes:
- ./:/etc/whisperbox/
- ./:/etc/whisperbox-transcribe/
labels:
- "traefik.http.routers.web.entrypoints=web"
worker:
container_name: whisperbox_worker_dev
container_name: whisperbox-transcribe_worker_dev
env_file: .env
command: watchmedo auto-restart -d app/worker -p *.py --recursive celery -- --app=app.worker.main.celery worker --loglevel=info --concurrency=1 --pool solo
volumes:
- ./:/etc/whisperbox/
- ./:/etc/whisperbox-transcribe/
flower:
container_name: whisperbox_flower_dev
container_name: whisperbox-transcribe_flower_dev
image: mher/flower
command: celery --broker redis://redis:6379/0 flower --port=5555
ports:

View File

@@ -1,17 +1,20 @@
version: "3.8"
services:
traefik:
container_name: whisperbox-transcribe_traefik
redis:
container_name: whisperbox_redis
container_name: whisperbox-transcribe_redis
worker:
container_name: whisperbox_worker
container_name: whisperbox-transcribe_worker
env_file: .env
# <GPU SUPPORT>
# build:
# dockerfile: worker.gpu.Dockerfile
volumes:
- whisperbox-data:/etc/whisperbox/data
- whisperbox-transcribe-data:/etc/whisperbox-transcribe/data
# <GPU SUPPORT>
# deploy:
# resources:
@@ -22,12 +25,12 @@ services:
# capabilities: [gpu]
web:
container_name: whisperbox_web
container_name: whisperbox-transcribe_web
env_file: .env
volumes:
- whisperbox-data:/etc/whisperbox/data
- whisperbox-transcribe-data:/etc/whisperbox-transcribe/data
labels:
- "traefik.http.routers.web.entrypoints=web"
volumes:
whisperbox-data:
whisperbox-transcribe-data:

View File

@@ -1,7 +1,7 @@
[project]
name = "whisperbox_transcription"
name = "whisperbox-transcribe"
description = ""
version = "0.0.1"
version = "0.1.0"
dependencies=[
"celery[redis] ==5.2.7",

View File

@@ -1,6 +1,6 @@
FROM python:3.10 as python-build
FROM python:3.10-slim as python-build
WORKDIR /etc/whisperbox
WORKDIR /etc/whisperbox-transcribe
COPY pyproject.toml .
@@ -8,14 +8,14 @@ RUN python -m venv /opt/venv && \
/opt/venv/bin/pip install -U pip wheel && \
/opt/venv/bin/pip install -U .[web]
FROM python:3.10 as python-deploy
FROM python:3.10-slim as python-deploy
WORKDIR /etc/whisperbox
WORKDIR /etc/whisperbox-transcribe
COPY --from=python-build /opt/venv /opt/venv
COPY app ./app
COPY alembic.ini ./
COPY alembic.ini .
ENV VIRTUAL_ENV /opt/venv
ENV PATH /opt/venv/bin:$PATH

View File

@@ -1,30 +1,31 @@
FROM python:3.10 AS python-build
FROM python:3.10-slim AS python-build
WORKDIR /etc/whisperbox
WORKDIR /etc/whisperbox-transcribe
# Create and build virtual env from requirements.
COPY pyproject.toml .
RUN python -m venv /opt/venv && \
/opt/venv/bin/pip install -U pip wheel && \
/opt/venv/bin/pip install -U .[worker]
FROM python:3.10 as python-deploy
FROM python:3.10-slim as python-deploy
ARG WHISPER_MODEL
WORKDIR /etc/whisperbox
WORKDIR /etc/whisperbox-transcribe
COPY --from=python-build /opt/venv /opt/venv
COPY --from=mwader/static-ffmpeg:latest /ffmpeg /usr/local/bin/
COPY --from=mwader/static-ffmpeg:latest /ffprobe /usr/local/bin/
COPY app ./app
ENV VIRTUAL_ENV /opt/venv
ENV PATH /opt/venv/bin:$PATH
COPY scripts/download_models.py .
RUN python download_models.py ${WHISPER_MODEL}
COPY app ./app
CMD celery --app=app.worker.main.celery worker --loglevel=info --concurrency=1 --pool=solo

View File

@@ -5,7 +5,7 @@ ENV PYTHON_VERSION=3.10
ARG WHISPER_MODEL
WORKDIR /etc/whisperbox
WORKDIR /etc/whisperbox-transcribe
RUN export DEBIAN_FRONTEND=noninteractive \
&& apt-get -qq update \