From ff8bd2547c187a3fa2590213d5b1a21b36a3e860 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Sp=C3=B6ttel?= <1682504+fspoettel@users.noreply.github.com> Date: Wed, 1 Mar 2023 17:55:06 +0100 Subject: [PATCH] docs: improve documentation --- .env.dev | 4 ++-- .env.example | 4 ++-- .gitignore | 2 +- README.md | 16 +++++++------ app/shared/db/schemas.py | 5 ++-- app/web/dtos.py | 16 ++++++++----- app/web/main.py | 50 +++++++++++++++++++++++++++++++++++----- app/worker/main.py | 5 ++-- docker-compose.base.yml | 3 ++- docker-compose.dev.yml | 15 ++++++------ docker-compose.prod.yml | 15 +++++++----- pyproject.toml | 4 ++-- web.Dockerfile | 10 ++++---- worker.Dockerfile | 13 ++++++----- worker.gpu.Dockerfile | 2 +- 15 files changed, 106 insertions(+), 58 deletions(-) diff --git a/.env.dev b/.env.dev index a8da679..c89e391 100644 --- a/.env.dev +++ b/.env.dev @@ -1,6 +1,6 @@ API_SECRET="a_very_secret_token" -DOMAIN="whisperbox.localhost" +DOMAIN="whisperbox-transcribe.localhost" WHISPER_MODEL="tiny" ENVIRONMENT="development" -DATABASE_URI="sqlite:///./whisperbox.sqlite" +DATABASE_URI="sqlite:///./whisperbox-transcribe.sqlite" BROKER_URL="redis://redis:6379/"0 diff --git a/.env.example b/.env.example index 76ceb1d..a054f1d 100644 --- a/.env.example +++ b/.env.example @@ -1,4 +1,4 @@ API_SECRET="change_me" WHISPER_MODEL="small" -DOMAIN="whisperbox.localhost" -DATABASE_URI="sqlite:///etc/whisperbox/data/whisperbox.sqlite" +DOMAIN="whisperbox-transcribe.localhost" +DATABASE_URI="sqlite:///etc/whisperbox-transcribe/data/whisperbox-transcribe.sqlite" diff --git a/.gitignore b/.gitignore index 041efec..3d320bf 100644 --- a/.gitignore +++ b/.gitignore @@ -163,4 +163,4 @@ cython_debug/ .vscode .DS_Store -whisperbox.sqlite* +whisperbox-transcribe.sqlite* diff --git a/README.md b/README.md index 2e9c7d3..0fe0d75 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# whisper-api +# whisperbox-transcribe > HTTP wrapper around [openai/whisper](https://github.com/openai/whisper). 
@@ -12,9 +12,11 @@ OpenAPI documentation can be accessed via `/docs`. ## Develop -It is recommended to setup a virtual environment for python tooling. To install dependencies dependencies in your virtual env, run `pip install -e .[tooling,web,worker]` +[docker compose](https://docs.docker.com/get-started/08_using_compose/) is required for local development. -[docker-compose](https://docs.docker.com/get-started/08_using_compose/) is required for local development. Configuration such as `API_SECRET` can be adjusted in `./docker/dev/docker-compose.yml`. +It is recommended to set up a virtual environment for Python tooling. To install dependencies in your virtual env, run `pip install -e .[tooling,web,worker]`. + +Copy `.env.test` to `.env` to configure the service. ### Start @@ -26,10 +28,10 @@ Builds and starts the docker containers. ``` # Bindings -http://localhost:5555 => Celery dashboard -http://localhost:8000 => API -http://localhost:8000/docs => API docs -./whisperbox.sqlite => Database +http://localhost:5555 => Celery dashboard +http://whisperbox-transcribe.localhost => API +http://whisperbox-transcribe.localhost/docs => API docs +./whisperbox-transcribe.sqlite => Database ``` ## Destroy diff --git a/app/shared/db/schemas.py b/app/shared/db/schemas.py index 87a7281..8a61116 100644 --- a/app/shared/db/schemas.py +++ b/app/shared/db/schemas.py @@ -40,9 +40,8 @@ class JobConfig(BaseModel): # TODO: limit to locales selected by whisper. language: Optional[str] = Field( description=( - "Spoken language in the media file." - "While optional, this can improve output " - "by selecting a language-specific model. (applies to 'en')" + "Spoken language in the media file. " + "While optional, this can improve output." 
) ) diff --git a/app/web/dtos.py b/app/web/dtos.py index a5df059..b3873d9 100644 --- a/app/web/dtos.py +++ b/app/web/dtos.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Union from pydantic import AnyHttpUrl, BaseModel, Field @@ -9,7 +9,7 @@ class DetailResponse(BaseModel): detail: str -DEFAULT_RESPONSES: Dict[int | str, Dict[str, Any]] = { +DEFAULT_RESPONSES: Dict[Union[int, str], Dict[str, Any]] = { 401: {"model": DetailResponse, "description": "Not authenticated"} } @@ -21,13 +21,17 @@ class PostJobPayload(BaseModel): ) ) - type: schemas.JobType = Field(description="Type of this job.") + type: schemas.JobType = Field( + description="""Type of this job. + `transcript` uses the original language of the audio. + `translation` creates an automatic translation to english. + `language_detection` detects language from the first 30 seconds of audio.""" + ) # TODO: limit to locales selected by whisper. language: Optional[str] = Field( description=( - "Spoken language in the media file." - "While optional, this can improve output " - "by selecting a language-specific model. (applies to 'en')" + "Spoken language in the media file. " + "While optional, this can improve output when set." 
) ) diff --git a/app/web/main.py b/app/web/main.py index c23f68b..bc09d69 100644 --- a/app/web/main.py +++ b/app/web/main.py @@ -13,7 +13,10 @@ from app.shared.db.base import get_session from app.web.dtos import DEFAULT_RESPONSES, DetailResponse, PostJobPayload from app.web.security import authenticate_api_key -app = FastAPI() +app = FastAPI( + description="whisperbox-transcribe is an async HTTP wrapper for openai/whisper.", + title="whisperbox-transcribe", +) celery = get_celery_binding() @@ -38,11 +41,28 @@ def api_root() -> None: return None -@api_router.post("/jobs", response_model=schemas.Job, status_code=201) +@api_router.post( + "/jobs", + response_model=schemas.Job, + status_code=201, + summary="Enqueue a new job", +) def create_job( payload: PostJobPayload, session: Session = Depends(get_session), ) -> models.Job: + """ + Enqueue a new whisper job for processing. + Notes: + * Jobs are processed one-by-one in order of creation. + * `payload.url` needs to point directly to a media file. + * The media file is downloaded to a tmp file for the duration of processing. + enough free space needs to be available on disk. + * Media files ideally are audio files with a sampling rate of 16kHz. + other files will be transcoded automatically via ffmpeg which might + consume considerable resources while active. + * Once a job is created, you can query its status by its id. + """ # create a job with status "create" and save it to the database. 
job = models.Job( url=payload.url, @@ -64,10 +84,13 @@ def create_job( return job -@api_router.get("/jobs", response_model=List[schemas.Job]) +@api_router.get( + "/jobs", response_model=List[schemas.Job], summary="Get metadata for all jobs" +) def get_transcripts( type: Optional[schemas.JobType] = None, session: Session = Depends(get_session) ) -> List[models.Job]: + """Get metadata for all jobs.""" query = session.query(models.Job) if type: @@ -79,21 +102,33 @@ def get_transcripts( @api_router.get( "/jobs/{id}", response_model=schemas.Job, - responses={404: {"model": DetailResponse, "description": "Not authenticated"}}, + responses={404: {"model": DetailResponse, "description": "Not found"}}, + summary="Get metadata for one jobs", ) def get_transcript( id: UUID = Path(), session: Session = Depends(get_session) ) -> Optional[models.Job]: + """ + Use this route to check transcription status of any given job. + """ job = session.query(models.Job).filter(models.Job.id == str(id)).one_or_none() if not job: raise HTTPException(status_code=404) return job -@api_router.get("/jobs/{id}/artifacts", response_model=List[schemas.Artifact]) +@api_router.get( + "/jobs/{id}/artifacts", + response_model=List[schemas.Artifact], + summary="Get all artifacts for one job", +) def get_artifacts_for_job( id: UUID = Path(), session: Session = Depends(get_session) ) -> List[models.Artifact]: + """ + Right now, there is only one type of artifact (`raw_transcript`). + Returns an empty array for unfinished or non-existant jobs. 
+ """ artifacts = ( session.query(models.Artifact).filter(models.Artifact.job_id == str(id)) ).all() @@ -101,10 +136,13 @@ def get_artifacts_for_job( return artifacts -@api_router.delete("/jobs/{id}", status_code=204) +@api_router.delete( + "/jobs/{id}", status_code=204, summary="Delete a job with all artifacts" +) def delete_transcript( id: UUID = Path(), session: Session = Depends(get_session) ) -> None: + """Remove metadata and artifacts for a single job.""" session.query(models.Job).filter(models.Job.id == str(id)).delete() return None diff --git a/app/worker/main.py b/app/worker/main.py index 5c7536d..c920b0c 100644 --- a/app/worker/main.py +++ b/app/worker/main.py @@ -13,6 +13,7 @@ from app.worker.strategies.local import LocalStrategy celery = get_celery_binding() + class TranscribeTask(Task): abstract = True @@ -29,9 +30,7 @@ class TranscribeTask(Task): return self.run(*args, **kwargs) -@celery.task( - base=TranscribeTask, bind=True, soft_time_limit=2 * 60 * 60 -) +@celery.task(base=TranscribeTask, bind=True, soft_time_limit=2 * 60 * 60) def transcribe(self: Task, job_id: UUID) -> None: try: # runs in a separate thread => requires sqlite's WAL mode to be enabled. 
diff --git a/docker-compose.base.yml b/docker-compose.base.yml index ee43414..42f26fd 100644 --- a/docker-compose.base.yml +++ b/docker-compose.base.yml @@ -1,4 +1,5 @@ version: "3.8" +name: whisperbox-transcribe services: traefik: @@ -9,7 +10,7 @@ services: command: - "--providers.docker=true" - "--providers.docker.exposedbydefault=false" - - "--providers.docker.network=whisperbox_transcription_traefik" + - "--providers.docker.network=whisperbox-transcribe_traefik" - "--entrypoints.web.address=:80" volumes: - /var/run/docker.sock:/var/run/docker.sock:ro diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index fc4a739..5f7a9ba 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -2,28 +2,29 @@ version: "3.8" services: traefik: - container_name: whisperbox_traefik_dev + container_name: whisperbox-transcribe_traefik_dev + redis: - container_name: whisperbox_redis_dev + container_name: whisperbox-transcribe_redis_dev web: - container_name: whisperbox_web_dev + container_name: whisperbox-transcribe_web_dev env_file: .env command: bash -c "alembic upgrade head && uvicorn app.web.main:app --reload --host ${HOST:-0.0.0.0} --port ${PORT:-8000} --log-level info" volumes: - - ./:/etc/whisperbox/ + - ./:/etc/whisperbox-transcribe/ labels: - "traefik.http.routers.web.entrypoints=web" worker: - container_name: whisperbox_worker_dev + container_name: whisperbox-transcribe_worker_dev env_file: .env command: watchmedo auto-restart -d app/worker -p *.py --recursive celery -- --app=app.worker.main.celery worker --loglevel=info --concurrency=1 --pool solo volumes: - - ./:/etc/whisperbox/ + - ./:/etc/whisperbox-transcribe/ flower: - container_name: whisperbox_flower_dev + container_name: whisperbox-transcribe_flower_dev image: mher/flower command: celery --broker redis://redis:6379/0 flower --port=5555 ports: diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 2b6e793..40b8dc2 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml 
@@ -1,17 +1,20 @@ version: "3.8" services: + traefik: + container_name: whisperbox-transcribe_traefik + redis: - container_name: whisperbox_redis + container_name: whisperbox-transcribe_redis worker: - container_name: whisperbox_worker + container_name: whisperbox-transcribe_worker env_file: .env # # build: # dockerfile: worker.gpu.Dockerfile volumes: - - whisperbox-data:/etc/whisperbox/data + - whisperbox-transcribe-data:/etc/whisperbox-transcribe/data # # deploy: # resources: @@ -22,12 +25,12 @@ services: # capabilities: [gpu] web: - container_name: whisperbox_web + container_name: whisperbox-transcribe_web env_file: .env volumes: - - whisperbox-data:/etc/whisperbox/data + - whisperbox-transcribe-data:/etc/whisperbox-transcribe/data labels: - "traefik.http.routers.web.entrypoints=web" volumes: - whisperbox-data: + whisperbox-transcribe-data: diff --git a/pyproject.toml b/pyproject.toml index bb4e498..9332f18 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] -name = "whisperbox_transcription" +name = "whisperbox-transcribe" description = "" -version = "0.0.1" +version = "0.1.0" dependencies=[ "celery[redis] ==5.2.7", diff --git a/web.Dockerfile b/web.Dockerfile index c1ebec2..7d13e52 100644 --- a/web.Dockerfile +++ b/web.Dockerfile @@ -1,6 +1,6 @@ -FROM python:3.10 as python-build +FROM python:3.10-slim as python-build -WORKDIR /etc/whisperbox +WORKDIR /etc/whisperbox-transcribe COPY pyproject.toml . @@ -8,14 +8,14 @@ RUN python -m venv /opt/venv && \ /opt/venv/bin/pip install -U pip wheel && \ /opt/venv/bin/pip install -U .[web] -FROM python:3.10 as python-deploy +FROM python:3.10-slim as python-deploy -WORKDIR /etc/whisperbox +WORKDIR /etc/whisperbox-transcribe COPY --from=python-build /opt/venv /opt/venv COPY app ./app -COPY alembic.ini ./ +COPY alembic.ini . 
ENV VIRTUAL_ENV /opt/venv ENV PATH /opt/venv/bin:$PATH diff --git a/worker.Dockerfile b/worker.Dockerfile index 9ba5d3c..02ceee1 100644 --- a/worker.Dockerfile +++ b/worker.Dockerfile @@ -1,30 +1,31 @@ -FROM python:3.10 AS python-build +FROM python:3.10-slim AS python-build -WORKDIR /etc/whisperbox +WORKDIR /etc/whisperbox-transcribe +# Create and build virtual env from requirements. COPY pyproject.toml . RUN python -m venv /opt/venv && \ /opt/venv/bin/pip install -U pip wheel && \ /opt/venv/bin/pip install -U .[worker] -FROM python:3.10 as python-deploy +FROM python:3.10-slim as python-deploy ARG WHISPER_MODEL -WORKDIR /etc/whisperbox +WORKDIR /etc/whisperbox-transcribe COPY --from=python-build /opt/venv /opt/venv COPY --from=mwader/static-ffmpeg:latest /ffmpeg /usr/local/bin/ COPY --from=mwader/static-ffmpeg:latest /ffprobe /usr/local/bin/ -COPY app ./app - ENV VIRTUAL_ENV /opt/venv ENV PATH /opt/venv/bin:$PATH COPY scripts/download_models.py . RUN python download_models.py ${WHISPER_MODEL} +COPY app ./app + CMD celery --app=app.worker.main.celery worker --loglevel=info --concurrency=1 --pool=solo diff --git a/worker.gpu.Dockerfile b/worker.gpu.Dockerfile index ddf4764..9aa7f02 100644 --- a/worker.gpu.Dockerfile +++ b/worker.gpu.Dockerfile @@ -5,7 +5,7 @@ ENV PYTHON_VERSION=3.10 ARG WHISPER_MODEL -WORKDIR /etc/whisperbox +WORKDIR /etc/whisperbox-transcribe RUN export DEBIAN_FRONTEND=noninteractive \ && apt-get -qq update \