docs: improve documentation

2026-06-08 03:28:35 +03:00 · 2023-03-01 17:55:06 +01:00
parent 7ece7944bf
commit ff8bd2547c
15 changed files with 106 additions and 58 deletions
--- a/.env.dev
+++ b/.env.dev
@@ -1,6 +1,6 @@
 API_SECRET="a_very_secret_token"
-DOMAIN="whisperbox.localhost"
+DOMAIN="whisperbox-transcribe.localhost"
 WHISPER_MODEL="tiny"
 ENVIRONMENT="development"
-DATABASE_URI="sqlite:///./whisperbox.sqlite"
+DATABASE_URI="sqlite:///./whisperbox-transcribe.sqlite"
 BROKER_URL="redis://redis:6379/"0
--- a/.env.example
+++ b/.env.example
@@ -1,4 +1,4 @@
 API_SECRET="change_me"
 WHISPER_MODEL="small"
-DOMAIN="whisperbox.localhost"
-DATABASE_URI="sqlite:///etc/whisperbox/data/whisperbox.sqlite"
+DOMAIN="whisperbox-transcribe.localhost"
+DATABASE_URI="sqlite:///etc/whisperbox-transcribe/data/whisperbox-transcribe.sqlite"
--- a/.gitignore
+++ b/.gitignore
@@ -163,4 +163,4 @@ cython_debug/
 .vscode
 .DS_Store

-whisperbox.sqlite*
+whisperbox-transcribe.sqlite*
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# whisper-api
+# whisperbox-transcribe 

 > HTTP wrapper around [openai/whisper](https://github.com/openai/whisper).

@@ -12,9 +12,11 @@ OpenAPI documentation can be accessed via `<service_url>/docs`.

 ## Develop

-It is recommended to setup a virtual environment for python tooling. To install dependencies dependencies in your virtual env, run `pip install -e .[tooling,web,worker]`
+[docker compose](https://docs.docker.com/get-started/08_using_compose/) is required for local development.

-[docker-compose](https://docs.docker.com/get-started/08_using_compose/) is required for local development. Configuration such as `API_SECRET` can be adjusted in `./docker/dev/docker-compose.yml`.
+It is recommended to set up a virtual environment for python tooling. To install dependencies in your virtual env, run `pip install -e .[tooling,web,worker]`.
+
+Copy `.env.test` to `.env` to configure the service.

 ### Start

@@ -26,10 +28,10 @@ Builds and starts the docker containers.

 ```
 # Bindings
-http://localhost:5555      => Celery dashboard
-http://localhost:8000      => API
-http://localhost:8000/docs => API docs
-./whisperbox.sqlite        => Database
+http://localhost:5555                        => Celery dashboard
+http://whisperbox-transcribe.localhost       => API
+http://whisperbox-transcribe.localhost/docs  => API docs
+./whisperbox-transcribe.sqlite               => Database
 ```

 ## Destroy
--- a/app/shared/db/schemas.py
+++ b/app/shared/db/schemas.py
@@ -40,9 +40,8 @@ class JobConfig(BaseModel):
    # TODO: limit to locales selected by whisper.
    language: Optional[str] = Field(
        description=(
-            "Spoken language in the media file."
-            "While optional, this can improve output "
-            "by selecting a language-specific model. (applies to 'en')"
+            "Spoken language in the media file. "
+            "While optional, this can improve output."
        )
    )

--- a/app/web/dtos.py
+++ b/app/web/dtos.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Union

 from pydantic import AnyHttpUrl, BaseModel, Field

@@ -9,7 +9,7 @@ class DetailResponse(BaseModel):
    detail: str


-DEFAULT_RESPONSES: Dict[int | str, Dict[str, Any]] = {
+DEFAULT_RESPONSES: Dict[Union[int, str], Dict[str, Any]] = {
    401: {"model": DetailResponse, "description": "Not authenticated"}
 }

@@ -21,13 +21,17 @@ class PostJobPayload(BaseModel):
        )
    )

-    type: schemas.JobType = Field(description="Type of this job.")
+    type: schemas.JobType = Field(
+        description="""Type of this job.
+        `transcript` uses the original language of the audio.
+        `translation` creates an automatic translation to english.
+        `language_detection` detects language from the first 30 seconds of audio."""
+    )

    # TODO: limit to locales selected by whisper.
    language: Optional[str] = Field(
        description=(
-            "Spoken language in the media file."
-            "While optional, this can improve output "
-            "by selecting a language-specific model. (applies to 'en')"
+            "Spoken language in the media file. "
+            "While optional, this can improve output when set."
        )
    )
--- a/app/web/main.py
+++ b/app/web/main.py
@@ -13,7 +13,10 @@ from app.shared.db.base import get_session
 from app.web.dtos import DEFAULT_RESPONSES, DetailResponse, PostJobPayload
 from app.web.security import authenticate_api_key

-app = FastAPI()
+app = FastAPI(
+    description="whisperbox-transcribe is an async HTTP wrapper for openai/whisper.",
+    title="whisperbox-transcribe",
+)
 celery = get_celery_binding()


@@ -38,11 +41,28 @@ def api_root() -> None:
    return None


-@api_router.post("/jobs", response_model=schemas.Job, status_code=201)
+@api_router.post(
+    "/jobs",
+    response_model=schemas.Job,
+    status_code=201,
+    summary="Enqueue a new job",
+)
 def create_job(
    payload: PostJobPayload,
    session: Session = Depends(get_session),
 ) -> models.Job:
+    """
+    Enqueue a new whisper job for processing.
+    Notes:
+     * Jobs are processed one-by-one in order of creation.
+     * `payload.url` needs to point directly to a media file.
+     * The media file is downloaded to a tmp file for the duration of processing.
+       Enough free space needs to be available on disk.
+     * Media files ideally are audio files with a sampling rate of 16kHz.
+       Other files will be transcoded automatically via ffmpeg which might
+       consume considerable resources while active.
+     * Once a job is created, you can query its status by its id.
+    """
    # create a job with status "create" and save it to the database.
    job = models.Job(
        url=payload.url,
@@ -64,10 +84,13 @@ def create_job(
    return job


-@api_router.get("/jobs", response_model=List[schemas.Job])
+@api_router.get(
+    "/jobs", response_model=List[schemas.Job], summary="Get metadata for all jobs"
+)
 def get_transcripts(
    type: Optional[schemas.JobType] = None, session: Session = Depends(get_session)
 ) -> List[models.Job]:
+    """Get metadata for all jobs."""
    query = session.query(models.Job)

    if type:
@@ -79,21 +102,33 @@ def get_transcripts(
@api_router.get(
    "/jobs/{id}",
    response_model=schemas.Job,
-    responses={404: {"model": DetailResponse, "description": "Not authenticated"}},
+    responses={404: {"model": DetailResponse, "description": "Not found"}},
+    summary="Get metadata for one jobs",
 )
 def get_transcript(
    id: UUID = Path(), session: Session = Depends(get_session)
 ) -> Optional[models.Job]:
+    """
+    Use this route to check transcription status of any given job.
+    """
    job = session.query(models.Job).filter(models.Job.id == str(id)).one_or_none()
    if not job:
        raise HTTPException(status_code=404)
    return job


-@api_router.get("/jobs/{id}/artifacts", response_model=List[schemas.Artifact])
+@api_router.get(
+    "/jobs/{id}/artifacts",
+    response_model=List[schemas.Artifact],
+    summary="Get all artifacts for one job",
+)
 def get_artifacts_for_job(
    id: UUID = Path(), session: Session = Depends(get_session)
 ) -> List[models.Artifact]:
+    """
+    Right now, there is only one type of artifact (`raw_transcript`).
+    Returns an empty array for unfinished or non-existent jobs.
+    """
    artifacts = (
        session.query(models.Artifact).filter(models.Artifact.job_id == str(id))
    ).all()
@@ -101,10 +136,13 @@ def get_artifacts_for_job(
    return artifacts


-@api_router.delete("/jobs/{id}", status_code=204)
+@api_router.delete(
+    "/jobs/{id}", status_code=204, summary="Delete a job with all artifacts"
+)
 def delete_transcript(
    id: UUID = Path(), session: Session = Depends(get_session)
 ) -> None:
+    """Remove metadata and artifacts for a single job."""
    session.query(models.Job).filter(models.Job.id == str(id)).delete()
    return None

--- a/app/worker/main.py
+++ b/app/worker/main.py
@@ -13,6 +13,7 @@ from app.worker.strategies.local import LocalStrategy

 celery = get_celery_binding()

+
 class TranscribeTask(Task):
    abstract = True

@@ -29,9 +30,7 @@ class TranscribeTask(Task):
        return self.run(*args, **kwargs)


-@celery.task(
-    base=TranscribeTask, bind=True, soft_time_limit=2 * 60 * 60
-)
+@celery.task(base=TranscribeTask, bind=True, soft_time_limit=2 * 60 * 60)
 def transcribe(self: Task, job_id: UUID) -> None:
    try:
        # runs in a separate thread => requires sqlite's WAL mode to be enabled.
--- a/docker-compose.base.yml
+++ b/docker-compose.base.yml
@@ -1,4 +1,5 @@
 version: "3.8"
+name: whisperbox-transcribe

 services:
  traefik:
@@ -9,7 +10,7 @@ services:
    command:
      - "--providers.docker=true"
      - "--providers.docker.exposedbydefault=false"
-      - "--providers.docker.network=whisperbox_transcription_traefik"
+      - "--providers.docker.network=whisperbox-transcribe_traefik"
      - "--entrypoints.web.address=:80"
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
--- a/docker-compose.dev.yml
+++ b/docker-compose.dev.yml
@@ -2,28 +2,29 @@ version: "3.8"

 services:
  traefik:
-    container_name: whisperbox_traefik_dev
+    container_name: whisperbox-transcribe_traefik_dev
+
  redis:
-    container_name: whisperbox_redis_dev
+    container_name: whisperbox-transcribe_redis_dev

  web:
-    container_name: whisperbox_web_dev
+    container_name: whisperbox-transcribe_web_dev
    env_file: .env
    command: bash -c "alembic upgrade head && uvicorn app.web.main:app --reload --host ${HOST:-0.0.0.0} --port ${PORT:-8000} --log-level info"
    volumes:
-      - ./:/etc/whisperbox/
+      - ./:/etc/whisperbox-transcribe/
    labels:
      - "traefik.http.routers.web.entrypoints=web"

  worker:
-    container_name: whisperbox_worker_dev
+    container_name: whisperbox-transcribe_worker_dev
    env_file: .env
    command: watchmedo auto-restart -d app/worker -p *.py --recursive celery -- --app=app.worker.main.celery worker --loglevel=info --concurrency=1 --pool solo
    volumes:
-      - ./:/etc/whisperbox/
+      - ./:/etc/whisperbox-transcribe/

  flower:
-    container_name: whisperbox_flower_dev
+    container_name: whisperbox-transcribe_flower_dev
    image: mher/flower
    command: celery --broker redis://redis:6379/0 flower --port=5555
    ports:
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -1,17 +1,20 @@
 version: "3.8"

 services:
+  traefik:
+    container_name: whisperbox-transcribe_traefik
+
  redis:
-    container_name: whisperbox_redis
+    container_name: whisperbox-transcribe_redis

  worker:
-    container_name: whisperbox_worker
+    container_name: whisperbox-transcribe_worker
    env_file: .env
    # <GPU SUPPORT>
    # build:
    #   dockerfile: worker.gpu.Dockerfile
    volumes:
-      - whisperbox-data:/etc/whisperbox/data
+      - whisperbox-transcribe-data:/etc/whisperbox-transcribe/data
    # <GPU SUPPORT>
    # deploy:
    #   resources:
@@ -22,12 +25,12 @@ services:
    #           capabilities: [gpu]

  web:
-    container_name: whisperbox_web
+    container_name: whisperbox-transcribe_web
    env_file: .env
    volumes:
-      - whisperbox-data:/etc/whisperbox/data
+      - whisperbox-transcribe-data:/etc/whisperbox-transcribe/data
    labels:
      - "traefik.http.routers.web.entrypoints=web"

 volumes:
-  whisperbox-data:
+  whisperbox-transcribe-data:
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
-name = "whisperbox_transcription"
+name = "whisperbox-transcribe"
 description = ""
-version = "0.0.1"
+version = "0.1.0"

 dependencies=[
  "celery[redis] ==5.2.7",
--- a/web.Dockerfile
+++ b/web.Dockerfile
@@ -1,6 +1,6 @@
-FROM python:3.10 as python-build
+FROM python:3.10-slim as python-build

-WORKDIR /etc/whisperbox
+WORKDIR /etc/whisperbox-transcribe

 COPY pyproject.toml .

@@ -8,14 +8,14 @@ RUN python -m venv /opt/venv && \
    /opt/venv/bin/pip install -U pip wheel && \
    /opt/venv/bin/pip install -U .[web]

-FROM python:3.10 as python-deploy
+FROM python:3.10-slim as python-deploy

-WORKDIR /etc/whisperbox
+WORKDIR /etc/whisperbox-transcribe

 COPY --from=python-build /opt/venv /opt/venv

 COPY app ./app
-COPY alembic.ini ./
+COPY alembic.ini .

 ENV VIRTUAL_ENV /opt/venv
 ENV PATH /opt/venv/bin:$PATH
--- a/worker.Dockerfile
+++ b/worker.Dockerfile
@@ -1,30 +1,31 @@
-FROM python:3.10 AS python-build
+FROM python:3.10-slim AS python-build

-WORKDIR /etc/whisperbox
+WORKDIR /etc/whisperbox-transcribe

+# Create and build virtual env from requirements.
 COPY pyproject.toml .

 RUN python -m venv /opt/venv && \
    /opt/venv/bin/pip install -U pip wheel && \
    /opt/venv/bin/pip install -U .[worker]

-FROM python:3.10 as python-deploy
+FROM python:3.10-slim as python-deploy

 ARG WHISPER_MODEL

-WORKDIR /etc/whisperbox
+WORKDIR /etc/whisperbox-transcribe

 COPY --from=python-build /opt/venv /opt/venv

 COPY --from=mwader/static-ffmpeg:latest /ffmpeg /usr/local/bin/
 COPY --from=mwader/static-ffmpeg:latest /ffprobe /usr/local/bin/

-COPY app ./app
-
 ENV VIRTUAL_ENV /opt/venv
 ENV PATH /opt/venv/bin:$PATH

 COPY scripts/download_models.py .
 RUN python download_models.py ${WHISPER_MODEL}

+COPY app ./app
+
 CMD celery --app=app.worker.main.celery worker --loglevel=info --concurrency=1 --pool=solo
--- a/worker.gpu.Dockerfile
+++ b/worker.gpu.Dockerfile
@@ -5,7 +5,7 @@ ENV PYTHON_VERSION=3.10

 ARG WHISPER_MODEL

-WORKDIR /etc/whisperbox
+WORKDIR /etc/whisperbox-transcribe

 RUN export DEBIAN_FRONTEND=noninteractive \
    && apt-get -qq update \