From 7e48f706df3902818b6ed5b4334d1f3b908d89f6 Mon Sep 17 00:00:00 2001 From: Michael Plunkett <5885605+michplunkett@users.noreply.github.com> Date: Wed, 26 Feb 2025 10:42:28 -0600 Subject: [PATCH] Add `pre-commit` with GiHub Action (#56) * Update pyproject.toml * add pre-commit * Create .pre-commit-config.yaml * Comment out ruff * Update .pre-commit-config.yaml * General formatting * Create format-and-fail.yml * Update ci.yml * Add pre-commit to dev dependencies * Update pyproject.toml --- .env.alembic | 2 +- .env.example | 2 +- .env.test | 2 +- .github/workflows/ci.yml | 10 +- .github/workflows/format-and-fail.yml | 16 +++ .pre-commit-config.yaml | 79 +++++++++++ LICENSE | 2 +- Makefile | 4 + README.md | 2 +- app/migrations/env.py | 4 +- ...7ed0_create_archives_store_until_column.py | 2 +- ...24ec4b1_rename_sheets_last_archived_col.py | 2 +- ...dd_new_service_account_email_column_to_.py | 2 +- ...21d2c96d8_add_sheet_id_to_archive_table.py | 2 +- ...45b_modify_archive_url_to_have_uuid_id_.py | 3 +- .../a23aaf3ae930_drop_active_column.py | 2 +- ...a012ec405b8_add_columns_to_groups_table.py | 2 +- app/shared/aa_utils.py | 5 +- app/shared/business_logic.py | 1 + app/shared/db/database.py | 14 +- app/shared/db/models.py | 15 +- app/shared/db/worker_crud.py | 6 +- app/shared/log.py | 3 +- app/shared/schemas.py | 3 +- app/shared/settings.py | 15 +- app/shared/task_messaging.py | 5 +- app/shared/user_groups.py | 11 +- app/shared/utils/misc.py | 2 +- app/tests/conftest.py | 23 ++-- app/tests/fake_service_account.json | 4 +- app/tests/orchestration.test.yaml | 2 +- app/tests/shared/db/test_models.py | 2 +- app/tests/shared/db/test_worker_crud.py | 11 +- app/tests/shared/test_business_logic.py | 7 +- app/tests/shared/utils/test_misc.py | 4 +- app/tests/user-groups.test.broken.yaml | 2 +- app/tests/user-groups.test.yaml | 2 +- app/tests/web/db/test_crud.py | 7 +- app/tests/web/db/test_user_state.py | 1 + app/tests/web/endpoints/test_default.py | 6 +- .../web/endpoints/test_interoperability.py | 2 +- app/tests/web/endpoints/test_sheet.py | 4 +- app/tests/web/test_main.py | 10 +- app/tests/web/test_security.py | 4 +- app/tests/worker/test_worker_main.py | 7 +- app/web/__init__.py | 3 +- app/web/config.py | 2 +- app/web/db/crud.py | 18 +-- app/web/db/user_state.py | 11 +- app/web/endpoints/default.py | 8 +- app/web/endpoints/interoperability.py | 14 +- app/web/endpoints/sheet.py | 12 +- app/web/endpoints/task.py | 4 +- app/web/endpoints/url.py | 20 +-- app/web/events.py | 24 +++- app/web/main.py | 27 ++-- app/web/middleware.py | 8 +- app/web/security.py | 15 +- app/web/utils/metrics.py | 3 +- app/web/utils/misc.py | 1 + app/worker/main.py | 19 +-- app/worker/worker_log.py | 8 +- docker-compose.dev.yml | 2 +- docker-compose.yml | 6 +- poetry.lock | 129 +++++++++++++++++- pyproject.toml | 2 +- user-groups.example.yaml | 1 - worker.Dockerfile | 2 +- 68 files changed, 473 insertions(+), 182 deletions(-) create mode 100644 .github/workflows/format-and-fail.yml create mode 100644 .pre-commit-config.yaml diff --git a/.env.alembic b/.env.alembic index 8691557..11bf2aa 100644 --- a/.env.alembic +++ b/.env.alembic @@ -2,4 +2,4 @@ CHROME_APP_IDS='["1234567890"]' ALLOWED_ORIGINS='["allowed"]' BLOCKED_EMAILS='[]' DATABASE_PATH="sqlite:///./database/auto-archiver.db" -API_BEARER_TOKEN=THIS_API_TOKEN_SHOULD_NEVER_BE_USED \ No newline at end of file +API_BEARER_TOKEN=THIS_API_TOKEN_SHOULD_NEVER_BE_USED diff --git a/.env.example b/.env.example index ef3935a..ea544ef 100644 --- a/.env.example +++ b/.env.example @@ -35,4 +35,4 @@ MAIL_SSL_TLS=True # celery workers config -CONCURRENCY=2 \ No newline at end of file +CONCURRENCY=2 diff --git a/.env.test b/.env.test index 32318f0..360f40e 100644 --- a/.env.test +++ b/.env.test @@ -5,4 +5,4 @@ BLOCKED_EMAILS='["blocked@example.com"]' DATABASE_PATH="sqlite:///auto-archiver.test.db" API_BEARER_TOKEN=this_is_the_test_api_token -USER_GROUPS_FILENAME=app/tests/user-groups.test.yaml \ No newline at end of file +USER_GROUPS_FILENAME=app/tests/user-groups.test.yaml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9b63544..4135ac2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,13 +1,9 @@ name: CI on: push: - branches: - - main - - dev + branches: [ main, dev ] pull_request: - branches: - - main - - dev + branches: [ main, dev ] jobs: test: @@ -41,4 +37,4 @@ jobs: run: poetry run coverage run -m pytest -v -ra --color=yes app/tests/ - name: Report coverage - run: poetry run coverage report \ No newline at end of file + run: poetry run coverage report diff --git a/.github/workflows/format-and-fail.yml b/.github/workflows/format-and-fail.yml new file mode 100644 index 0000000..f1c01f8 --- /dev/null +++ b/.github/workflows/format-and-fail.yml @@ -0,0 +1,16 @@ +name: Format and Fail +on: + push: + branches: [ main, dev ] + pull_request: + branches: [ main, dev ] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: "3.11" + - uses: pre-commit/action@v3.0.0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..73765d5 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,79 @@ +repos: + - repo: https://github.com/nbQA-dev/nbQA + rev: 1.8.5 + hooks: + - id: nbqa-ruff + args: + - --fix + - --target-version=py311 + - --ignore=E721,E722 + - --line-length=80 + - id: nbqa-black + args: + - --line-length=80 + - id: nbqa-isort + args: + - --float-to-top + - --profile=black + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: trailing-whitespace + - id: check-docstring-first + - id: check-executables-have-shebangs + - id: check-json + - id: check-case-conflict + - id: check-toml + - id: check-merge-conflict + - id: check-xml + - id: check-yaml + exclude: app/tests/user-groups.test.broken.yaml + - id: end-of-file-fixer + - id: check-symlinks + - id: mixed-line-ending + - id: sort-simple-yaml + - id: fix-encoding-pragma + args: + - --remove + - id: pretty-format-json + args: + - --autofix + + - repo: https://github.com/pre-commit/pygrep-hooks + rev: v1.10.0 + hooks: + - id: python-check-blanket-noqa + - id: python-check-mock-methods + - id: python-no-eval + - id: python-no-log-warn + + - repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort + name: Run isort to sort imports + files: \.py$ + # To keep consistent with the global isort skip config defined in setup.cfg + exclude: ^build/.*$|^.tox/.*$|^venv/.*$ + args: + - --lines-after-imports=2 + - --profile=black + - --line-length=80 + +# - repo: https://github.com/astral-sh/ruff-pre-commit +# rev: v0.4.10 +# hooks: +# - id: ruff +# types_or: [python,pyi] +# args: +# - --fix +# - --target-version=py311 +# - --select=B,C,E,F,W,B9 +# - --line-length=80 +# - --ignore=E203,E402,E501,E261 +# - id: ruff-format +# types_or: [ python,pyi] +# args: +# - --target-version=py311 +# - --line-length=80 diff --git a/LICENSE b/LICENSE index e10dcd9..c5bae4c 100644 --- a/LICENSE +++ b/LICENSE @@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file +SOFTWARE. diff --git a/Makefile b/Makefile index bfcb3d3..ddda086 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,7 @@ +.PHONY: lint +lint: + poetry run pre-commit run --all-files + .PHONY: clean-dev clean-dev: @echo -n "Are you sure? [yes/N] (this will delete volumes) " && read ans && [ $${ans:-N} = yes ] diff --git a/README.md b/README.md index 2c87342..da402e3 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ To properly set up the API you need to install `docker` and to have these files, 2. go through the example file and configure it according to your needs. 3. you will need to create and reference at least one `secrets/orchestration.yaml` file, you can do so by following the instructions in the [auto-archiver](https://github.com/bellingcat/auto-archiver#installation) that automatically generates one for you. If you use the archive sheets feature you will need to create a `orchestrationsheets-sheets.yaml` file as well that should have the `gsheet_feeder` and `gsheet_db` enabled and configured, the auto-archiver has [extensive documentation](https://auto-archiver.readthedocs.io/en/latest/) on how to set this up. -Do not commit those files, they are .gitignored by default. +Do not commit those files, they are .gitignored by default. We also advise you to keep any sensitive files in the `secrets/` folder which is pinned and gitignored. We have examples for both of those files (`.env.example` and `user-groups.example.yaml`), and here's how to set them up whether you're in development or production: diff --git a/app/migrations/env.py b/app/migrations/env.py index 870ef18..1f579ab 100644 --- a/app/migrations/env.py +++ b/app/migrations/env.py @@ -1,11 +1,11 @@ from logging.config import fileConfig -from sqlalchemy import engine_from_config -from sqlalchemy import pool from alembic import context +from sqlalchemy import engine_from_config, pool from app.shared.settings import get_settings + # this is the Alembic Config object, which provides # access to the values within the .ini file in use. config = context.config diff --git a/app/migrations/versions/02b2f6d17ed0_create_archives_store_until_column.py b/app/migrations/versions/02b2f6d17ed0_create_archives_store_until_column.py index d00fa2c..8642f2b 100644 --- a/app/migrations/versions/02b2f6d17ed0_create_archives_store_until_column.py +++ b/app/migrations/versions/02b2f6d17ed0_create_archives_store_until_column.py @@ -5,8 +5,8 @@ Revises: 1636724ec4b1 Create Date: 2025-02-08 15:22:20.392522 """ -from alembic import op import sqlalchemy as sa +from alembic import op # revision identifiers, used by Alembic. diff --git a/app/migrations/versions/1636724ec4b1_rename_sheets_last_archived_col.py b/app/migrations/versions/1636724ec4b1_rename_sheets_last_archived_col.py index 6c109f3..324f75a 100644 --- a/app/migrations/versions/1636724ec4b1_rename_sheets_last_archived_col.py +++ b/app/migrations/versions/1636724ec4b1_rename_sheets_last_archived_col.py @@ -5,8 +5,8 @@ Revises: a23aaf3ae930 Create Date: 2025-02-05 19:19:01.984396 """ -from alembic import op import sqlalchemy as sa +from alembic import op # revision identifiers, used by Alembic. diff --git a/app/migrations/versions/63ac79df4ad0_add_new_service_account_email_column_to_.py b/app/migrations/versions/63ac79df4ad0_add_new_service_account_email_column_to_.py index 7067746..572905d 100644 --- a/app/migrations/versions/63ac79df4ad0_add_new_service_account_email_column_to_.py +++ b/app/migrations/versions/63ac79df4ad0_add_new_service_account_email_column_to_.py @@ -5,8 +5,8 @@ Revises: 02b2f6d17ed0 Create Date: 2025-02-11 21:53:23.293274 """ -from alembic import op import sqlalchemy as sa +from alembic import op # revision identifiers, used by Alembic. diff --git a/app/migrations/versions/89121d2c96d8_add_sheet_id_to_archive_table.py b/app/migrations/versions/89121d2c96d8_add_sheet_id_to_archive_table.py index 3011cf6..892e853 100644 --- a/app/migrations/versions/89121d2c96d8_add_sheet_id_to_archive_table.py +++ b/app/migrations/versions/89121d2c96d8_add_sheet_id_to_archive_table.py @@ -5,8 +5,8 @@ Revises: fa012ec405b8 Create Date: 2024-11-04 11:12:30.237299 """ -from alembic import op import sqlalchemy as sa +from alembic import op from sqlalchemy.engine.reflection import Inspector diff --git a/app/migrations/versions/9369a264945b_modify_archive_url_to_have_uuid_id_.py b/app/migrations/versions/9369a264945b_modify_archive_url_to_have_uuid_id_.py index a2b708a..bdfe474 100644 --- a/app/migrations/versions/9369a264945b_modify_archive_url_to_have_uuid_id_.py +++ b/app/migrations/versions/9369a264945b_modify_archive_url_to_have_uuid_id_.py @@ -1,12 +1,13 @@ """modify archive url to have uuid id instead of url unique constraint Revision ID: 9369a264945b -Revises: +Revises: Create Date: 2023-12-20 17:24:59.320691 """ from alembic import op + # revision identifiers, used by Alembic. revision = '9369a264945b' down_revision = None diff --git a/app/migrations/versions/a23aaf3ae930_drop_active_column.py b/app/migrations/versions/a23aaf3ae930_drop_active_column.py index 912f408..ebc85e7 100644 --- a/app/migrations/versions/a23aaf3ae930_drop_active_column.py +++ b/app/migrations/versions/a23aaf3ae930_drop_active_column.py @@ -5,8 +5,8 @@ Revises: 89121d2c96d8 Create Date: 2025-02-04 12:19:20.753570 """ -from alembic import op import sqlalchemy as sa +from alembic import op # revision identifiers, used by Alembic. diff --git a/app/migrations/versions/fa012ec405b8_add_columns_to_groups_table.py b/app/migrations/versions/fa012ec405b8_add_columns_to_groups_table.py index f0577ea..c3169c3 100644 --- a/app/migrations/versions/fa012ec405b8_add_columns_to_groups_table.py +++ b/app/migrations/versions/fa012ec405b8_add_columns_to_groups_table.py @@ -5,8 +5,8 @@ Revises: 93a611e4c066 Create Date: 2024-10-31 09:36:50.360710 """ -from alembic import op import sqlalchemy as sa +from alembic import op from sqlalchemy.engine.reflection import Inspector diff --git a/app/shared/aa_utils.py b/app/shared/aa_utils.py index 393a975..466d003 100644 --- a/app/shared/aa_utils.py +++ b/app/shared/aa_utils.py @@ -1,11 +1,13 @@ # TODO: code in this file should eventually be moved to the auto-archiver code base from typing import List -from loguru import logger + from auto_archiver.core import Media, Metadata +from loguru import logger from app.shared.db import models + def get_all_urls(result: Metadata) -> List[models.ArchiveUrl]: db_urls = [] for m in result.media: @@ -29,4 +31,3 @@ def convert_if_media(media): except Exception as e: logger.debug(f"error parsing {media} : {e}") return False - diff --git a/app/shared/business_logic.py b/app/shared/business_logic.py index d179fda..b4291b6 100644 --- a/app/shared/business_logic.py +++ b/app/shared/business_logic.py @@ -2,6 +2,7 @@ import datetime + from sqlalchemy.orm import Session from app.shared.db import worker_crud diff --git a/app/shared/db/database.py b/app/shared/db/database.py index 171b97b..51c235a 100644 --- a/app/shared/db/database.py +++ b/app/shared/db/database.py @@ -1,8 +1,14 @@ -from functools import lru_cache -from sqlalchemy import Engine, create_engine, event, text -from sqlalchemy.orm import sessionmaker from contextlib import asynccontextmanager, contextmanager -from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, AsyncEngine, async_sessionmaker +from functools import lru_cache + +from sqlalchemy import Engine, create_engine, event, text +from sqlalchemy.ext.asyncio import ( + AsyncEngine, + AsyncSession, + async_sessionmaker, + create_async_engine, +) +from sqlalchemy.orm import sessionmaker from app.shared.settings import get_settings diff --git a/app/shared/db/models.py b/app/shared/db/models.py index 1736224..aa93034 100644 --- a/app/shared/db/models.py +++ b/app/shared/db/models.py @@ -1,8 +1,17 @@ -from sqlalchemy import Column, String, JSON, DateTime, Boolean, Table, ForeignKey -from sqlalchemy.sql import func -from sqlalchemy.orm import relationship, declarative_base import uuid +from sqlalchemy import ( + JSON, + Boolean, + Column, + DateTime, + ForeignKey, + String, + Table, +) +from sqlalchemy.orm import declarative_base, relationship +from sqlalchemy.sql import func + Base = declarative_base() diff --git a/app/shared/db/worker_crud.py b/app/shared/db/worker_crud.py index 814689a..82f3380 100644 --- a/app/shared/db/worker_crud.py +++ b/app/shared/db/worker_crud.py @@ -1,8 +1,10 @@ -from sqlalchemy.orm import Session from datetime import datetime -from app.shared.db import models +from sqlalchemy.orm import Session + from app.shared import schemas +from app.shared.db import models + # TODO: isolate database operations away from worker and into WEB # ONLY WORKER diff --git a/app/shared/log.py b/app/shared/log.py index 68587e2..734c368 100644 --- a/app/shared/log.py +++ b/app/shared/log.py @@ -1,4 +1,5 @@ import traceback + from loguru import logger @@ -6,7 +7,7 @@ from loguru import logger logger.add("logs/api_logs.log", retention="30 days") logger.add("logs/error_logs.log", retention="30 days", level="ERROR") - + def log_error(e: Exception, traceback_str: str = None, extra:str = ""): if not traceback_str: traceback_str = traceback.format_exc() if extra: extra = f"{extra}\n" diff --git a/app/shared/schemas.py b/app/shared/schemas.py index 66119f7..e8479f5 100644 --- a/app/shared/schemas.py +++ b/app/shared/schemas.py @@ -1,7 +1,8 @@ +from datetime import datetime from typing import Annotated + from annotated_types import Len from pydantic import BaseModel -from datetime import datetime class SubmitSheet(BaseModel): diff --git a/app/shared/settings.py b/app/shared/settings.py index d884f80..2f68ae7 100644 --- a/app/shared/settings.py +++ b/app/shared/settings.py @@ -1,14 +1,15 @@ -from functools import lru_cache import os +from functools import lru_cache +from typing import Annotated, Set + +from annotated_types import Len from fastapi_mail import ConnectionConfig from pydantic_settings import BaseSettings, SettingsConfigDict -from typing import Annotated, Set -from annotated_types import Len class Settings(BaseSettings): - + model_config = SettingsConfigDict(env_file=os.environ.get("ENVIRONMENT_FILE") , env_file_encoding='utf-8', extra='ignore', str_strip_whitespace=True) # general @@ -37,14 +38,14 @@ class Settings(BaseSettings): if self.REDIS_PASSWORD: return f"redis://:{self.REDIS_PASSWORD}@{self.REDIS_HOSTNAME}:6379" return f"redis://{self.REDIS_HOSTNAME}:6379" - + # cronjobs CRON_ARCHIVE_SHEETS: bool = False CRON_DELETE_STALE_SHEETS: bool = False DELETE_STALE_SHEETS_DAYS: int = 14 CRON_DELETE_SCHEDULED_ARCHIVES: bool = False DELETE_SCHEDULED_ARCHIVES_CHECK_EVERY_N_DAYS: int = 7 - + # observability REPEAT_COUNT_METRICS_SECONDS: int = 30 @@ -73,4 +74,4 @@ class Settings(BaseSettings): @lru_cache def get_settings(): - return Settings() \ No newline at end of file + return Settings() diff --git a/app/shared/task_messaging.py b/app/shared/task_messaging.py index 21fb3d1..88d0057 100644 --- a/app/shared/task_messaging.py +++ b/app/shared/task_messaging.py @@ -1,8 +1,9 @@ from functools import lru_cache -from celery import Celery -import redis +from celery import Celery + +import redis from app.shared.settings import get_settings diff --git a/app/shared/user_groups.py b/app/shared/user_groups.py index 592e012..480b84e 100644 --- a/app/shared/user_groups.py +++ b/app/shared/user_groups.py @@ -1,9 +1,16 @@ import json import os +from typing import Dict, List, Set + import yaml from loguru import logger -from pydantic import BaseModel, computed_field, field_validator, Field, model_validator -from typing import Dict, List, Set +from pydantic import ( + BaseModel, + Field, + computed_field, + field_validator, + model_validator, +) from typing_extensions import Self diff --git a/app/shared/utils/misc.py b/app/shared/utils/misc.py index 562b2c3..6c5940d 100644 --- a/app/shared/utils/misc.py +++ b/app/shared/utils/misc.py @@ -7,4 +7,4 @@ def fnv1a_hash_mod(s: str, modulo:int) -> int: hash ^= ord(char) hash *= fnv_prime hash &= 0xFFFFFFFF # Keep it 32-bit - return (hash if hash < 0x80000000 else hash - 0x100000000) % modulo \ No newline at end of file + return (hash if hash < 0x80000000 else hash - 0x100000000) % modulo diff --git a/app/tests/conftest.py b/app/tests/conftest.py index afa76f9..f7da39e 100644 --- a/app/tests/conftest.py +++ b/app/tests/conftest.py @@ -1,12 +1,14 @@ import os from typing import AsyncGenerator -from fastapi.testclient import TestClient -import pytest from unittest.mock import patch + +import pytest import pytest_asyncio -from sqlalchemy.ext.asyncio import AsyncSession, AsyncEngine -from app.web.config import ALLOW_ANY_EMAIL +from fastapi.testclient import TestClient +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession + from app.shared.settings import Settings +from app.web.config import ALLOW_ANY_EMAIL from app.web.db.user_state import UserState @@ -65,10 +67,11 @@ def db_session(test_db): @pytest_asyncio.fixture() async def async_test_db(get_settings: Settings): + import asyncio + from app.shared.db import models from app.shared.db.database import make_async_engine from app.web.db.crud import get_user_group_names - import asyncio get_user_group_names.cache_clear() engine = await make_async_engine(get_settings.ASYNC_DATABASE_PATH) @@ -108,8 +111,8 @@ async def async_db_session(async_test_db: AsyncEngine) -> AsyncGenerator[AsyncSe @pytest.fixture() def app(db_session): - from app.web.main import app_factory from app.web.db import crud + from app.web.main import app_factory app = app_factory() crud.upsert_user_groups(db_session) return app @@ -123,7 +126,11 @@ def client(app): @pytest.fixture() def app_with_auth(app, db_session): - from app.web.security import get_token_or_user_auth, get_user_auth, get_user_state + from app.web.security import ( + get_token_or_user_auth, + get_user_auth, + get_user_state, + ) app.dependency_overrides[get_token_or_user_auth] = lambda: "rick@example.com" app.dependency_overrides[get_user_auth] = lambda: "morty@example.com" app.dependency_overrides[get_user_state] = lambda: UserState(db_session, "MORTY@example.com") @@ -138,7 +145,7 @@ def client_with_auth(app_with_auth): @pytest.fixture() def app_with_token(app): - from app.web.security import token_api_key_auth, get_token_or_user_auth + from app.web.security import get_token_or_user_auth, token_api_key_auth app.dependency_overrides[token_api_key_auth] = lambda: ALLOW_ANY_EMAIL app.dependency_overrides[get_token_or_user_auth] = lambda: ALLOW_ANY_EMAIL return app diff --git a/app/tests/fake_service_account.json b/app/tests/fake_service_account.json index 3d41bd9..10c0585 100644 --- a/app/tests/fake_service_account.json +++ b/app/tests/fake_service_account.json @@ -1,3 +1,3 @@ { - "client_email": "fake_service_account@fake_service_account.iam.gserviceaccount.com" -} \ No newline at end of file + "client_email": "fake_service_account@fake_service_account.iam.gserviceaccount.com" +} diff --git a/app/tests/orchestration.test.yaml b/app/tests/orchestration.test.yaml index 4ee1880..ef7ed27 100644 --- a/app/tests/orchestration.test.yaml +++ b/app/tests/orchestration.test.yaml @@ -15,7 +15,7 @@ configurations: gsheet_feeder: service_account: "app/tests/fake_service_account.json" cli_feeder: - urls: + urls: - "url1" hash_enricher: algorithm: "SHA-256" diff --git a/app/tests/shared/db/test_models.py b/app/tests/shared/db/test_models.py index 35ba368..4da9571 100644 --- a/app/tests/shared/db/test_models.py +++ b/app/tests/shared/db/test_models.py @@ -3,4 +3,4 @@ def test_generate_uuid(): assert generate_uuid() != generate_uuid() assert len(generate_uuid()) == 36 - assert generate_uuid().count("-") == 4 \ No newline at end of file + assert generate_uuid().count("-") == 4 diff --git a/app/tests/shared/db/test_worker_crud.py b/app/tests/shared/db/test_worker_crud.py index 1098cbe..09e9a76 100644 --- a/app/tests/shared/db/test_worker_crud.py +++ b/app/tests/shared/db/test_worker_crud.py @@ -1,10 +1,9 @@ -from app.shared.db import models -from app.shared.db import worker_crud, models from datetime import datetime - +from app.shared.db import models, worker_crud from app.tests.web.db.test_crud import test_data + def test_update_sheet_last_url_archived_at(db_session): # Create test sheet @@ -19,7 +18,7 @@ def test_update_sheet_last_url_archived_at(db_session): db_session.refresh(test_sheet) assert isinstance(test_sheet.last_url_archived_at, datetime) assert test_sheet.last_url_archived_at > before - + # Test non-existent sheet assert worker_crud.update_sheet_last_url_archived_at(db_session, "non-existent-sheet") is False @@ -73,8 +72,8 @@ def test_create_tag(db_session): def test_create_task(db_session): - from app.shared.db import worker_crud from app.shared import schemas + from app.shared.db import worker_crud task = schemas.ArchiveCreate( id="archive-id-456-101", @@ -114,4 +113,4 @@ def test_create_task(db_session): assert nt.group_id == "spaceship" assert len(nt.tags) == 0 assert len(nt.urls) == 0 - assert nt.created_at is not None \ No newline at end of file + assert nt.created_at is not None diff --git a/app/tests/shared/test_business_logic.py b/app/tests/shared/test_business_logic.py index 225fb11..830fa7b 100644 --- a/app/tests/shared/test_business_logic.py +++ b/app/tests/shared/test_business_logic.py @@ -1,7 +1,12 @@ from datetime import datetime, timedelta from unittest.mock import MagicMock, patch + import pytest -from app.shared.business_logic import get_store_archive_until, get_store_archive_until_or_never + +from app.shared.business_logic import ( + get_store_archive_until, + get_store_archive_until_or_never, +) class Test_get_store_archive_until: diff --git a/app/tests/shared/utils/test_misc.py b/app/tests/shared/utils/test_misc.py index d7595c8..18db28d 100644 --- a/app/tests/shared/utils/test_misc.py +++ b/app/tests/shared/utils/test_misc.py @@ -11,7 +11,7 @@ def test_fnv1a_hash_mod(): # Test different modulos hash1 = fnv1a_hash_mod("test", 5) - hash2 = fnv1a_hash_mod("test", 10) + hash2 = fnv1a_hash_mod("test", 10) assert 0 <= hash1 < 5 assert 0 <= hash2 < 10 @@ -28,4 +28,4 @@ def test_fnv1a_hash_mod(): assert 0 <= fnv1a_hash_mod("测试", 10) < 10 # Test modulo = 1 edge case - assert fnv1a_hash_mod("test", 1) == 0 \ No newline at end of file + assert fnv1a_hash_mod("test", 1) == 0 diff --git a/app/tests/user-groups.test.broken.yaml b/app/tests/user-groups.test.broken.yaml index 8bc59c5..9b41741 100644 --- a/app/tests/user-groups.test.broken.yaml +++ b/app/tests/user-groups.test.broken.yaml @@ -3,4 +3,4 @@ This is just an invalid yaml for testing still broken: True - one - - two \ No newline at end of file + - two diff --git a/app/tests/user-groups.test.yaml b/app/tests/user-groups.test.yaml index 16a3ba7..e9a446f 100644 --- a/app/tests/user-groups.test.yaml +++ b/app/tests/user-groups.test.yaml @@ -84,4 +84,4 @@ groups: # max_archive_lifespan_months: 12 max_monthly_urls: 1 # max_monthly_mbs: 50 - priority: "low" \ No newline at end of file + priority: "low" diff --git a/app/tests/web/db/test_crud.py b/app/tests/web/db/test_crud.py index aad9d4c..298a087 100644 --- a/app/tests/web/db/test_crud.py +++ b/app/tests/web/db/test_crud.py @@ -3,10 +3,12 @@ from unittest.mock import patch import pytest import yaml + from app.shared.db import models from app.shared.settings import Settings - from app.web.db import crud + + authors = ["rick@example.com", "morty@example.com", "jerry@example.com"] @@ -373,6 +375,7 @@ async def test_get_sheets_by_id_hash(async_db_session): @pytest.mark.asyncio async def test_delete_stale_sheets(async_db_session): from datetime import datetime, timedelta + from sqlalchemy.sql import select now = datetime.now() @@ -435,4 +438,4 @@ async def test_delete_stale_sheets(async_db_session): # Running again should not delete anything deleted = await crud.delete_stale_sheets(async_db_session, 7) - assert len(deleted) == 0 \ No newline at end of file + assert len(deleted) == 0 diff --git a/app/tests/web/db/test_user_state.py b/app/tests/web/db/test_user_state.py index 42c61d1..665bf08 100644 --- a/app/tests/web/db/test_user_state.py +++ b/app/tests/web/db/test_user_state.py @@ -1,5 +1,6 @@ from unittest.mock import MagicMock, PropertyMock, patch + import pytest from app.shared.db import models diff --git a/app/tests/web/endpoints/test_default.py b/app/tests/web/endpoints/test_default.py index 401a164..e4e34cc 100644 --- a/app/tests/web/endpoints/test_default.py +++ b/app/tests/web/endpoints/test_default.py @@ -1,10 +1,12 @@ from unittest.mock import MagicMock -from fastapi.testclient import TestClient + import pytest +from fastapi.testclient import TestClient + from app.shared.schemas import Usage, UsageResponse from app.shared.user_groups import GroupInfo -from app.web.config import VERSION from app.tests.web.db.test_crud import test_data +from app.web.config import VERSION def test_endpoint_home(client_with_auth): diff --git a/app/tests/web/endpoints/test_interoperability.py b/app/tests/web/endpoints/test_interoperability.py index 31cf8f0..703f69a 100644 --- a/app/tests/web/endpoints/test_interoperability.py +++ b/app/tests/web/endpoints/test_interoperability.py @@ -1,5 +1,5 @@ -from datetime import datetime import json +from datetime import datetime from unittest.mock import MagicMock, patch from app.shared.db import models diff --git a/app/tests/web/endpoints/test_sheet.py b/app/tests/web/endpoints/test_sheet.py index 1396d85..9b47228 100644 --- a/app/tests/web/endpoints/test_sheet.py +++ b/app/tests/web/endpoints/test_sheet.py @@ -1,5 +1,5 @@ -from datetime import datetime import json +from datetime import datetime from unittest.mock import MagicMock, patch from fastapi.testclient import TestClient @@ -45,8 +45,8 @@ def test_create_sheet_endpoint(app_with_auth, db_session): assert response.json() == {"detail": "User does not have access to this group."} # switch to jerry who's got less quota/permissions - from app.web.security import get_user_state from app.web.db.user_state import UserState + from app.web.security import get_user_state app_with_auth.dependency_overrides[get_user_state] = lambda: UserState(db_session, "jerry@example.com") client_jerry = TestClient(app_with_auth) diff --git a/app/tests/web/test_main.py b/app/tests/web/test_main.py index f77d368..a4ddf1e 100644 --- a/app/tests/web/test_main.py +++ b/app/tests/web/test_main.py @@ -1,10 +1,10 @@ import os -from unittest.mock import patch -from fastapi.testclient import TestClient - import shutil +from unittest.mock import patch import pytest +from fastapi.testclient import TestClient + def test_lifespan(app): with TestClient(app) as client: @@ -25,7 +25,7 @@ def test_logging_middleware(m1, client_with_auth): client_with_auth.delete("/url/123") # creates one empty and one from above assert len(EXCEPTION_COUNTER.collect()[0].samples) == 2 - + def test_serve_local_archive_logic(get_settings): # create a test file first @@ -38,7 +38,7 @@ def test_serve_local_archive_logic(get_settings): get_settings.SERVE_LOCAL_ARCHIVE = "/app/local_archive_test" from app.web.main import app_factory app = app_factory(get_settings) - + # test client = TestClient(app) r = client.get("/app/local_archive_test/temp.txt") diff --git a/app/tests/web/test_security.py b/app/tests/web/test_security.py index 1a6c00b..55a434b 100644 --- a/app/tests/web/test_security.py +++ b/app/tests/web/test_security.py @@ -1,8 +1,8 @@ from unittest.mock import Mock, patch +import pytest from fastapi import HTTPException from fastapi.security import HTTPAuthorizationCredentials -import pytest from app.web.config import ALLOW_ANY_EMAIL @@ -108,8 +108,8 @@ async def test_authenticate_user_exception(): def test_get_user_state(): - from app.web.security import get_user_state from app.web.db.user_state import UserState + from app.web.security import get_user_state mock_session = Mock() test_email = "test@example.com" diff --git a/app/tests/worker/test_worker_main.py b/app/tests/worker/test_worker_main.py index d40c457..9a77528 100644 --- a/app/tests/worker/test_worker_main.py +++ b/app/tests/worker/test_worker_main.py @@ -1,13 +1,12 @@ from datetime import datetime - from unittest.mock import patch import pytest - -from app.shared.db import models -from app.shared import schemas from auto_archiver.core import Media, Metadata +from app.shared import schemas +from app.shared.db import models + class Test_create_archive_task(): URL = "https://example-live.com" diff --git a/app/web/__init__.py b/app/web/__init__.py index a817e9e..98a139a 100644 --- a/app/web/__init__.py +++ b/app/web/__init__.py @@ -1,3 +1,4 @@ from app.web.main import app_factory -app = app_factory \ No newline at end of file + +app = app_factory diff --git a/app/web/config.py b/app/web/config.py index 29a6806..b795d88 100644 --- a/app/web/config.py +++ b/app/web/config.py @@ -5,7 +5,7 @@ API_DESCRIPTION = """ **Usage notes:** - The API requires a Bearer token for most operations, which you can obtain by logging in with your Google account. -- You can use this API to archive single URLs or entire Google Sheets. +- You can use this API to archive single URLs or entire Google Sheets. - Once you submit a URL or Sheet for archiving, the API will return a task_id that you can use to check the status of the archiving process. It works asynchronously. """ BREAKING_CHANGES = {"minVersion": "0.4.0", "message": "The latest update has breaking changes, please update the extension to the most recent version."} diff --git a/app/web/db/crud.py b/app/web/db/crud.py index c16b09a..b33faa2 100644 --- a/app/web/db/crud.py +++ b/app/web/db/crud.py @@ -1,18 +1,19 @@ from collections import defaultdict -from functools import lru_cache -from sqlalchemy.orm import Session, load_only -from sqlalchemy import Column, or_, func, select -from loguru import logger from datetime import datetime, timedelta -from sqlalchemy.ext.asyncio import AsyncSession +from functools import lru_cache + from cachetools import LRUCache, cached from cachetools.keys import hashkey +from loguru import logger +from sqlalchemy import Column, func, or_, select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import Session, load_only -from app.web.config import ALLOW_ANY_EMAIL from app.shared.db import models from app.shared.settings import get_settings from app.shared.user_groups import UserGroups from app.shared.utils.misc import fnv1a_hash_mod +from app.web.config import ALLOW_ANY_EMAIL from app.web.utils.misc import convert_priority_to_queue_dict @@ -117,7 +118,7 @@ async def get_group_priority_async(db: AsyncSession, group_id: str) -> dict: @cached(cache=LRUCache(maxsize=128), key=lambda db, email: hashkey(email)) def get_user_group_names(db: Session, email: str) -> list[str]: """ - given an email retrieves the user groups from the DB and then the email-domain groups from a global variable, the email does not need to belong to an existing user. + given an email retrieves the user groups from the DB and then the email-domain groups from a global variable, the email does not need to belong to an existing user. """ # TODO: the read: [group1, group2] permissions don't currently work if not email or not len(email) or "@" not in email: return [] @@ -173,7 +174,7 @@ def upsert_user_groups(db: Session): def display_email_pii(email: str): return f"'{email[0:3]}...@{email.split('@')[1]}'" """ - reads the user_groups yaml file and inserts any new users, groups, + reads the user_groups yaml file and inserts any new users, groups, along with new participation of users in groups """ filename = get_settings().USER_GROUPS_FILENAME @@ -192,6 +193,7 @@ def upsert_user_groups(db: Session): for group in explicit_groups: group_domains[group].add(domain) import json + # upsert groups and save a map of groupid -> dbobject for group_id, g in ug.groups.items(): upsert_group(db, group_id, g.description, g.orchestrator, g.orchestrator_sheet, g.service_account_email, json.loads(g.permissions.model_dump_json()), list(group_domains.get(group_id, []))) diff --git a/app/web/db/user_state.py b/app/web/db/user_state.py index 968e1bd..384b0b6 100644 --- a/app/web/db/user_state.py +++ b/app/web/db/user_state.py @@ -1,13 +1,14 @@ -from typing import Dict, Set -import sqlalchemy -from sqlalchemy.orm import Session -from sqlalchemy import func from datetime import datetime +from typing import Dict, Set + +import sqlalchemy +from sqlalchemy import func +from sqlalchemy.orm import Session from app.shared.db import models -from app.shared.user_groups import GroupInfo, GroupPermissions from app.shared.schemas import Usage, UsageResponse +from app.shared.user_groups import GroupInfo, GroupPermissions from app.web.db import crud from app.web.utils.misc import convert_priority_to_queue_dict diff --git a/app/web/endpoints/default.py b/app/web/endpoints/default.py index 9271992..cd23d13 100644 --- a/app/web/endpoints/default.py +++ b/app/web/endpoints/default.py @@ -1,13 +1,15 @@ from typing import Dict + from fastapi import APIRouter, Depends, HTTPException from fastapi.responses import FileResponse, JSONResponse -from app.web.config import VERSION, BREAKING_CHANGES from app.shared.schemas import ActiveUser, UsageResponse +from app.shared.user_groups import GroupInfo +from app.web.config import BREAKING_CHANGES, VERSION from app.web.db.user_state import UserState from app.web.security import get_user_state -from app.shared.user_groups import GroupInfo + default_router = APIRouter() @@ -42,7 +44,7 @@ def get_user_usage( if not user.active: raise HTTPException(status_code=403, detail="User is not active.") return user.usage() - + @default_router.get('/favicon.ico', include_in_schema=False) diff --git a/app/web/endpoints/interoperability.py b/app/web/endpoints/interoperability.py index 06ea175..7892bde 100644 --- a/app/web/endpoints/interoperability.py +++ b/app/web/endpoints/interoperability.py @@ -1,19 +1,19 @@ import json + +import sqlalchemy +from auto_archiver.core import Metadata from fastapi import APIRouter, Depends, HTTPException from fastapi.responses import JSONResponse from loguru import logger -import sqlalchemy -from auto_archiver.core import Metadata from sqlalchemy.orm import Session -from app.shared.aa_utils import get_all_urls -from app.web.config import ALLOW_ANY_EMAIL from app.shared import business_logic, schemas -from app.shared.db import worker_crud +from app.shared.aa_utils import get_all_urls +from app.shared.db import models, worker_crud from app.shared.db.database import get_db_dependency -from app.web.security import token_api_key_auth -from app.shared.db import models from app.shared.log import log_error +from app.web.config import ALLOW_ANY_EMAIL +from app.web.security import token_api_key_auth interoperability_router = APIRouter(prefix="/interop", tags=["Interoperability endpoints."]) diff --git a/app/web/endpoints/sheet.py b/app/web/endpoints/sheet.py index 7848b5e..d8c089a 100644 --- a/app/web/endpoints/sheet.py +++ b/app/web/endpoints/sheet.py @@ -1,16 +1,16 @@ from fastapi import APIRouter, Depends, HTTPException from fastapi.responses import JSONResponse - from sqlalchemy import exc from sqlalchemy.orm import Session -from app.web.db.user_state import UserState from app.shared import schemas -from app.shared.task_messaging import get_celery -from app.web.security import get_user_state -from app.web.db import crud from app.shared.db.database import get_db_dependency +from app.shared.task_messaging import get_celery +from app.web.db import crud +from app.web.db.user_state import UserState +from app.web.security import get_user_state + sheet_router = APIRouter(prefix="/sheet", tags=["Google Spreadsheet operations"]) @@ -78,4 +78,4 @@ def archive_user_sheet( group_queue = user.priority_group(sheet.group_id) task = celery.signature("create_sheet_task", args=[schemas.SubmitSheet(sheet_id=id, author_id=user.email, group_id=sheet.group_id).model_dump_json()]).apply_async(**group_queue) - return JSONResponse({"id": task.id}, status_code=201) \ No newline at end of file + return JSONResponse({"id": task.id}, status_code=201) diff --git a/app/web/endpoints/task.py b/app/web/endpoints/task.py index 610c579..3f2ff94 100644 --- a/app/web/endpoints/task.py +++ b/app/web/endpoints/task.py @@ -3,10 +3,10 @@ from fastapi import APIRouter, Depends from fastapi.encoders import jsonable_encoder from fastapi.responses import JSONResponse -from app.shared.task_messaging import get_celery -from app.web.security import get_token_or_user_auth from app.shared import schemas from app.shared.log import log_error +from app.shared.task_messaging import get_celery +from app.web.security import get_token_or_user_auth from app.web.utils.misc import custom_jsonable_encoder diff --git a/app/web/endpoints/url.py b/app/web/endpoints/url.py index a7ac4b4..8307c2d 100644 --- a/app/web/endpoints/url.py +++ b/app/web/endpoints/url.py @@ -1,22 +1,22 @@ +from datetime import datetime +from urllib.parse import urlparse + from fastapi import APIRouter, Depends, HTTPException from fastapi.responses import JSONResponse -from datetime import datetime from loguru import logger from sqlalchemy.orm import Session -from app.web.config import ALLOW_ANY_EMAIL from app.shared import schemas +from app.shared.db.database import get_db_dependency from app.shared.task_messaging import get_celery -from app.web.security import get_token_or_user_auth, get_user_state +from app.web.config import ALLOW_ANY_EMAIL from app.web.db import crud from app.web.db.user_state import UserState -from app.shared.db.database import get_db_dependency - -from urllib.parse import urlparse - +from app.web.security import get_token_or_user_auth, get_user_state from app.web.utils.misc import convert_priority_to_queue_dict + url_router = APIRouter(prefix="/url", tags=["Single URL operations"]) celery = get_celery() @@ -47,7 +47,7 @@ def archive_url( else: archive_create.author_id = archive.author_id or email group_queue = convert_priority_to_queue_dict("high") - + task = celery.signature("create_archive_task", args=[archive_create.model_dump_json()]).apply_async(**group_queue) task_response = schemas.Task(id=task.id) @@ -74,8 +74,8 @@ def search_by_url( @url_router.delete("/{id}", summary="Delete a single URL archive by id.") def delete_archive( - id:str, - user: UserState = Depends(get_user_state), + id:str, + user: UserState = Depends(get_user_state), db: Session = Depends(get_db_dependency) ) -> schemas.DeleteResponse: logger.info(f"deleting url archive task {id} request by {user.email}") diff --git a/app/web/events.py b/app/web/events.py index 625731a..fa15614 100644 --- a/app/web/events.py +++ b/app/web/events.py @@ -1,22 +1,32 @@ import asyncio -from collections import defaultdict import datetime import logging +from collections import defaultdict +from contextlib import asynccontextmanager + import alembic.config from fastapi import FastAPI -from contextlib import asynccontextmanager +from fastapi_mail import FastMail, MessageSchema, MessageType from fastapi_utils.tasks import repeat_every from loguru import logger -from fastapi_mail import FastMail, MessageSchema, MessageType -from app.shared.db import models -from app.shared.db.database import get_db, get_db_async, make_engine, wal_checkpoint from app.shared import schemas +from app.shared.db import models +from app.shared.db.database import ( + get_db, + get_db_async, + make_engine, + wal_checkpoint, +) from app.shared.settings import get_settings from app.shared.task_messaging import get_celery from app.web.db import crud from app.web.middleware import increase_exceptions_counter -from app.web.utils.metrics import measure_regular_metrics, redis_subscribe_worker_exceptions +from app.web.utils.metrics import ( + measure_regular_metrics, + redis_subscribe_worker_exceptions, +) + celery = get_celery() @@ -183,4 +193,4 @@ async def delete_stale_sheets(): async def generate_users_export_csv(): #TODO: implement a cronjob that regularly requested user data to a CSV file # see https://colab.research.google.com/drive/1QDbo3QXHPBdiTuANlA1AWVvN-rqxuCPa?authuser=0#scrollTo=4nPXeSdK8RBT - pass \ No newline at end of file + pass diff --git a/app/web/main.py b/app/web/main.py index ff2266e..69af5c6 100644 --- a/app/web/main.py +++ b/app/web/main.py @@ -1,24 +1,23 @@ import os -from fastapi import FastAPI, Depends -from fastapi.staticfiles import StaticFiles + +from fastapi import Depends, FastAPI from fastapi.middleware.cors import CORSMiddleware -from prometheus_fastapi_instrumentator import Instrumentator +from fastapi.staticfiles import StaticFiles from loguru import logger +from prometheus_fastapi_instrumentator import Instrumentator -from app.web.middleware import logging_middleware -from app.shared.task_messaging import get_celery - -from app.web.security import token_api_key_auth -from app.web.config import VERSION, API_DESCRIPTION -from app.web.events import lifespan from app.shared.settings import get_settings - - +from app.shared.task_messaging import get_celery +from app.web.config import API_DESCRIPTION, VERSION from app.web.endpoints.default import default_router -from app.web.endpoints.url import url_router +from app.web.endpoints.interoperability import interoperability_router from app.web.endpoints.sheet import sheet_router from app.web.endpoints.task import task_router -from app.web.endpoints.interoperability import interoperability_router +from app.web.endpoints.url import url_router +from app.web.events import lifespan +from app.web.middleware import logging_middleware +from app.web.security import token_api_key_auth + celery = get_celery() @@ -57,4 +56,4 @@ def app_factory(settings = get_settings()): logger.warning(f"MOUNTing local archive, use this in development only {settings.SERVE_LOCAL_ARCHIVE}") app.mount(settings.SERVE_LOCAL_ARCHIVE, StaticFiles(directory=local_dir), name=settings.SERVE_LOCAL_ARCHIVE) - return app \ No newline at end of file + return app diff --git a/app/web/middleware.py b/app/web/middleware.py index 52da626..5ddca4b 100644 --- a/app/web/middleware.py +++ b/app/web/middleware.py @@ -1,7 +1,9 @@ import traceback -from loguru import logger + from fastapi import Request +from loguru import logger + from app.shared.log import log_error from app.web.utils.metrics import EXCEPTION_COUNTER @@ -25,7 +27,7 @@ async def increase_exceptions_counter(e: Exception, location:str="cronjob"): last_trace = traceback.extract_tb(e.__traceback__)[-1] _file, _line, func_name, _text = last_trace location = func_name - except Exception as e: + except Exception as e: logger.error(f"Unable to get function name from cronjob exception traceback: {e}") EXCEPTION_COUNTER.labels(type=e.__class__.__name__, location=location).inc() - log_error(e) \ No newline at end of file + log_error(e) diff --git a/app/web/security.py b/app/web/security.py index 12115af..494e094 100644 --- a/app/web/security.py +++ b/app/web/security.py @@ -1,14 +1,17 @@ +import secrets + +import requests +from fastapi import Depends, HTTPException, status +from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer from loguru import logger -import requests, secrets -from fastapi import HTTPException, status, Depends -from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials from sqlalchemy.orm import Session -from app.web.config import ALLOW_ANY_EMAIL -from app.shared.settings import get_settings from app.shared.db.database import get_db_dependency +from app.shared.settings import get_settings +from app.web.config import ALLOW_ANY_EMAIL from app.web.db.user_state import UserState + settings = get_settings() bearer_security = HTTPBearer() @@ -80,4 +83,4 @@ def authenticate_user(access_token): def get_user_state(email:str=Depends(get_user_auth), db:Session=Depends(get_db_dependency)): - return UserState(db, email) \ No newline at end of file + return UserState(db, email) diff --git a/app/web/utils/metrics.py b/app/web/utils/metrics.py index a885b9a..d8026a1 100644 --- a/app/web/utils/metrics.py +++ b/app/web/utils/metrics.py @@ -2,12 +2,13 @@ import asyncio import json import os import shutil + from prometheus_client import Counter, Gauge -from app.web.db import crud from app.shared.db.database import get_db from app.shared.log import log_error from app.shared.task_messaging import get_redis +from app.web.db import crud # Custom metrics diff --git a/app/web/utils/misc.py b/app/web/utils/misc.py index 870a60b..16a6591 100644 --- a/app/web/utils/misc.py +++ b/app/web/utils/misc.py @@ -1,4 +1,5 @@ import base64 + from fastapi.encoders import jsonable_encoder diff --git a/app/worker/main.py b/app/worker/main.py index 561245b..7b4826b 100644 --- a/app/worker/main.py +++ b/app/worker/main.py @@ -1,21 +1,22 @@ +import datetime import json +import traceback -import traceback, datetime +from auto_archiver.core.orchestrator import ArchivingOrchestrator from celery.signals import task_failure from loguru import logger from sqlalchemy import exc -from auto_archiver.core.orchestrator import ArchivingOrchestrator -from app.shared.db import models -from app.shared.db.database import get_db from app.shared import business_logic, schemas -from app.shared.task_messaging import get_celery, get_redis -from app.shared.settings import get_settings -from app.shared.log import log_error from app.shared.aa_utils import get_all_urls -from app.shared.db import worker_crud +from app.shared.db import models, worker_crud +from app.shared.db.database import get_db +from app.shared.log import log_error +from app.shared.settings import get_settings +from app.shared.task_messaging import get_celery, get_redis from app.worker.worker_log import setup_celery_logger + settings = get_settings() celery = get_celery("worker") @@ -26,7 +27,7 @@ USER_GROUPS_FILENAME = settings.USER_GROUPS_FILENAME setup_celery_logger(celery) # TODO: these are temporary PATCHES for new aa's functionality -# logger.add("app/worker/worker_log.log", level="DEBUG") +# logger.add("app/worker/worker_log.log", level="DEBUG") logger.remove = lambda x: print(f"logger.remove({x})") # TODO: after release, as it requires updating past entries with sheet_id where tag is used, drop tags diff --git a/app/worker/worker_log.py b/app/worker/worker_log.py index 221d824..022c63d 100644 --- a/app/worker/worker_log.py +++ b/app/worker/worker_log.py @@ -1,9 +1,11 @@ -from loguru import logger -from celery import Celery import sys +from celery import Celery +from loguru import logger + from app.shared.task_messaging import get_celery + celery = get_celery("worker") def setup_celery_logger(celery): @@ -22,7 +24,7 @@ def setup_celery_logger(celery): if message.strip(): logger.info(message.strip()) # Required to prevent issues with buffered output - def flush(self): pass + def flush(self): pass def isatty(self): return False sys.stdout = InterceptHandler() diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 29088c0..9c88a36 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -12,7 +12,7 @@ services: - ALLOWED_ORIGINS=["http://localhost:8000","http://localhost:8004","http://localhost:8081","chrome-extension://ojcimmjndnlmmlgnjaeojoebaceokpdp"] - USER_GROUPS_FILENAME=/aa-api/app/user-groups.dev.yaml - DATABASE_PATH=sqlite:////aa-api/database/auto-archiver.db - + worker: # command: watchmedo auto-restart --patterns="*.py" --recursive --ignore-directories -- celery -- --app=app.worker.main.celery worker -Q high_priority,low_priority --concurrency=${CONCURRENCY} --max-tasks-per-child=100 diff --git a/docker-compose.yml b/docker-compose.yml index 31969bc..737c346 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,7 +5,7 @@ volumes: name: "auto-archiver-api" services: web: - build: + build: context: . dockerfile: web.Dockerfile restart: always @@ -29,7 +29,7 @@ services: retries: 3 worker: - build: + build: context: . dockerfile: worker.Dockerfile restart: always @@ -68,4 +68,4 @@ services: test: ["CMD", "redis-cli", "-a", "${REDIS_PASSWORD}", "ping"] interval: 30s timeout: 10s - retries: 3 \ No newline at end of file + retries: 3 diff --git a/poetry.lock b/poetry.lock index ceb824a..9c1ea55 100644 --- a/poetry.lock +++ b/poetry.lock @@ -616,6 +616,18 @@ files = [ [package.dependencies] pycparser = "*" +[[package]] +name = "cfgv" +version = "3.4.0" +description = "Validate configuration and produce human readable error messages." +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"}, + {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, +] + [[package]] name = "charset-normalizer" version = "3.4.1" @@ -959,6 +971,18 @@ calendars = ["convertdate (>=2.2.1)", "hijridate"] fasttext = ["fasttext (>=0.9.1)", "numpy (>=1.19.3,<2)"] langdetect = ["langdetect (>=1.0.0)"] +[[package]] +name = "distlib" +version = "0.3.9" +description = "Distribution utilities" +optional = false +python-versions = "*" +groups = ["dev"] +files = [ + {file = "distlib-0.3.9-py2.py3-none-any.whl", hash = "sha256:47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87"}, + {file = "distlib-0.3.9.tar.gz", hash = "sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403"}, +] + [[package]] name = "dnspython" version = "2.7.0" @@ -1097,6 +1121,23 @@ future = "*" [package.extras] dev = ["Sphinx (==2.1.0)", "future (==0.17.1)", "numpy (==1.16.4)", "pytest (==4.6.1)", "pytest-mock (==1.10.4)", "tox (==3.12.1)"] +[[package]] +name = "filelock" +version = "3.17.0" +description = "A platform independent file lock." +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "filelock-3.17.0-py3-none-any.whl", hash = "sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338"}, + {file = "filelock-3.17.0.tar.gz", hash = "sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e"}, +] + +[package.extras] +docs = ["furo (>=2024.8.6)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.6.10)", "diff-cover (>=9.2.1)", "pytest (>=8.3.4)", "pytest-asyncio (>=0.25.2)", "pytest-cov (>=6)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.28.1)"] +typing = ["typing-extensions (>=4.12.2) ; python_version < \"3.11\""] + [[package]] name = "future" version = "1.0.0" @@ -1409,6 +1450,21 @@ http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] zstd = ["zstandard (>=0.18.0)"] +[[package]] +name = "identify" +version = "2.6.8" +description = "File identification library for Python" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "identify-2.6.8-py2.py3-none-any.whl", hash = "sha256:83657f0f766a3c8d0eaea16d4ef42494b39b34629a4b3192a9d020d349b3e255"}, + {file = "identify-2.6.8.tar.gz", hash = "sha256:61491417ea2c0c5c670484fd8abbb34de34cdae1e5f39a73ee65e48e4bb663fc"}, +] + +[package.extras] +license = ["ukkonen"] + [[package]] name = "idna" version = "3.10" @@ -1724,6 +1780,18 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] +[[package]] +name = "nodeenv" +version = "1.9.1" +description = "Node.js virtual environment builder" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["dev"] +files = [ + {file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"}, + {file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"}, +] + [[package]] name = "numpy" version = "2.1.3" @@ -1981,6 +2049,23 @@ tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "ole typing = ["typing-extensions ; python_version < \"3.10\""] xmp = ["defusedxml"] +[[package]] +name = "platformdirs" +version = "4.3.6" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"}, + {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"}, +] + +[package.extras] +docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"] +type = ["mypy (>=1.11.2)"] + [[package]] name = "pluggy" version = "1.5.0" @@ -1997,6 +2082,25 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "pre-commit" +version = "4.1.0" +description = "A framework for managing and maintaining multi-language pre-commit hooks." +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "pre_commit-4.1.0-py2.py3-none-any.whl", hash = "sha256:d29e7cb346295bcc1cc75fc3e92e343495e3ea0196c9ec6ba53f49f10ab6ae7b"}, + {file = "pre_commit-4.1.0.tar.gz", hash = "sha256:ae3f018575a588e30dfddfab9a05448bfbd6b73d78709617b5a2b853549716d4"}, +] + +[package.dependencies] +cfgv = ">=2.0.0" +identify = ">=1.0.0" +nodeenv = ">=0.11.1" +pyyaml = ">=5.1" +virtualenv = ">=20.10.0" + [[package]] name = "prometheus-client" version = "0.21.1" @@ -2557,7 +2661,7 @@ version = "6.0.2" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.8" -groups = ["web"] +groups = ["dev", "web"] files = [ {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, @@ -3415,6 +3519,27 @@ files = [ {file = "vine-5.1.0.tar.gz", hash = "sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0"}, ] +[[package]] +name = "virtualenv" +version = "20.29.2" +description = "Virtual Python Environment builder" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "virtualenv-20.29.2-py3-none-any.whl", hash = "sha256:febddfc3d1ea571bdb1dc0f98d7b45d24def7428214d4fb73cc486c9568cce6a"}, + {file = "virtualenv-20.29.2.tar.gz", hash = "sha256:fdaabebf6d03b5ba83ae0a02cfe96f48a716f4fae556461d180825866f75b728"}, +] + +[package.dependencies] +distlib = ">=0.3.7,<1" +filelock = ">=3.12.2,<4" +platformdirs = ">=3.9.1,<5" + +[package.extras] +docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] +test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8) ; platform_python_implementation == \"PyPy\" or platform_python_implementation == \"CPython\" and sys_platform == \"win32\" and python_version >= \"3.13\"", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10) ; platform_python_implementation == \"CPython\""] + [[package]] name = "vk-api" version = "11.9.9" @@ -3688,4 +3813,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "11d734f2ee32206214a7ecb8dc3ec8d19a7b6281ee98b509a5bb8bdb647c674a" +content-hash = "c4a5c50ac109c9912992ca86d2b5ec712c6bcfc84838bf42f90208b02cc27b3c" diff --git a/pyproject.toml b/pyproject.toml index ea1c87d..df7b789 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,4 +52,4 @@ pytest = ">=8.3.4,<9.0.0" httpx = ">=0.28.1,<0.29.0" coverage = ">=7.6.11,<8.0.0" pytest-asyncio = ">=0.25.3,<0.26.0" - +pre-commit = "^4.1.0" diff --git a/user-groups.example.yaml b/user-groups.example.yaml index 32f99b4..22a919d 100644 --- a/user-groups.example.yaml +++ b/user-groups.example.yaml @@ -59,4 +59,3 @@ groups: permissions: read: ["default"] read_public: true - \ No newline at end of file diff --git a/worker.Dockerfile b/worker.Dockerfile index 4e24f87..9154c32 100644 --- a/worker.Dockerfile +++ b/worker.Dockerfile @@ -30,4 +30,4 @@ COPY alembic.ini ./ COPY ./app/ ./app/ COPY user-groups.* ./app/ -ENTRYPOINT ["./poetry-venv/bin/poetry", "run"] \ No newline at end of file +ENTRYPOINT ["./poetry-venv/bin/poetry", "run"]