Add pre-commit with GitHub Action (#56)

* Update pyproject.toml

* add pre-commit

* Create .pre-commit-config.yaml

* Comment out ruff

* Update .pre-commit-config.yaml

* General formatting

* Create format-and-fail.yml

* Update ci.yml

* Add pre-commit to dev dependencies

* Update pyproject.toml
This commit is contained in:
Michael Plunkett
2025-02-26 10:42:28 -06:00
committed by GitHub
parent d9f36957cd
commit 7e48f706df
68 changed files with 473 additions and 182 deletions

View File

@@ -2,4 +2,4 @@ CHROME_APP_IDS='["1234567890"]'
ALLOWED_ORIGINS='["allowed"]' ALLOWED_ORIGINS='["allowed"]'
BLOCKED_EMAILS='[]' BLOCKED_EMAILS='[]'
DATABASE_PATH="sqlite:///./database/auto-archiver.db" DATABASE_PATH="sqlite:///./database/auto-archiver.db"
API_BEARER_TOKEN=THIS_API_TOKEN_SHOULD_NEVER_BE_USED API_BEARER_TOKEN=THIS_API_TOKEN_SHOULD_NEVER_BE_USED

View File

@@ -35,4 +35,4 @@ MAIL_SSL_TLS=True
# celery workers config # celery workers config
CONCURRENCY=2 CONCURRENCY=2

View File

@@ -5,4 +5,4 @@ BLOCKED_EMAILS='["blocked@example.com"]'
DATABASE_PATH="sqlite:///auto-archiver.test.db" DATABASE_PATH="sqlite:///auto-archiver.test.db"
API_BEARER_TOKEN=this_is_the_test_api_token API_BEARER_TOKEN=this_is_the_test_api_token
USER_GROUPS_FILENAME=app/tests/user-groups.test.yaml USER_GROUPS_FILENAME=app/tests/user-groups.test.yaml

View File

@@ -1,13 +1,9 @@
name: CI name: CI
on: on:
push: push:
branches: branches: [ main, dev ]
- main
- dev
pull_request: pull_request:
branches: branches: [ main, dev ]
- main
- dev
jobs: jobs:
test: test:
@@ -41,4 +37,4 @@ jobs:
run: poetry run coverage run -m pytest -v -ra --color=yes app/tests/ run: poetry run coverage run -m pytest -v -ra --color=yes app/tests/
- name: Report coverage - name: Report coverage
run: poetry run coverage report run: poetry run coverage report

16
.github/workflows/format-and-fail.yml vendored Normal file
View File

@@ -0,0 +1,16 @@
# Runs the repository's pre-commit hooks in CI on pushes and pull requests
# that target the main and dev branches.
name: Format and Fail
on:
  push:
    branches: [ main, dev ]
  pull_request:
    branches: [ main, dev ]
jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # v5 runs on the Node 20 runtime; v4 still targets the deprecated Node 16.
      - uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the version a string instead of a float.
          python-version: "3.11"
      # Patch release of the action; v3.0.0 depends on an older cache action.
      - uses: pre-commit/action@v3.0.1

79
.pre-commit-config.yaml Normal file
View File

@@ -0,0 +1,79 @@
# pre-commit hook configuration.
# `exclude` / `files` values are regular expressions matched against the file
# path, so literal dots are escaped.
repos:
  # Jupyter notebook hooks: nbQA wraps ruff, black and isort.
  - repo: https://github.com/nbQA-dev/nbQA
    rev: 1.8.5
    hooks:
      - id: nbqa-ruff
        args:
          - --fix
          - --target-version=py311
          - --ignore=E721,E722
          - --line-length=80
      - id: nbqa-black
        args:
          - --line-length=80
      - id: nbqa-isort
        args:
          - --float-to-top
          - --profile=black
  # General-purpose file hygiene and syntax checks.
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
      - id: trailing-whitespace
      - id: check-docstring-first
      - id: check-executables-have-shebangs
      - id: check-json
      - id: check-case-conflict
      - id: check-toml
      - id: check-merge-conflict
      - id: check-xml
      - id: check-yaml
        # This fixture is intentionally invalid YAML, so it must be skipped.
        exclude: app/tests/user-groups\.test\.broken\.yaml
      - id: end-of-file-fixer
      - id: check-symlinks
      - id: mixed-line-ending
      - id: sort-simple-yaml
      - id: fix-encoding-pragma
        args:
          - --remove
      - id: pretty-format-json
        args:
          - --autofix
  # Regex-based checks for common Python anti-patterns.
  - repo: https://github.com/pre-commit/pygrep-hooks
    rev: v1.10.0
    hooks:
      - id: python-check-blanket-noqa
      - id: python-check-mock-methods
      - id: python-no-eval
      - id: python-no-log-warn
  # Import sorting for Python sources.
  - repo: https://github.com/PyCQA/isort
    rev: 5.12.0
    hooks:
      - id: isort
        name: Run isort to sort imports
        files: \.py$
        # To keep consistent with the global isort skip config defined in setup.cfg
        exclude: ^build/.*$|^\.tox/.*$|^venv/.*$
        args:
          - --lines-after-imports=2
          - --profile=black
          - --line-length=80
  # The ruff hooks below are currently disabled (kept commented out).
  # - repo: https://github.com/astral-sh/ruff-pre-commit
  #   rev: v0.4.10
  #   hooks:
  #     - id: ruff
  #       types_or: [python,pyi]
  #       args:
  #         - --fix
  #         - --target-version=py311
  #         - --select=B,C,E,F,W,B9
  #         - --line-length=80
  #         - --ignore=E203,E402,E501,E261
  #     - id: ruff-format
  #       types_or: [ python,pyi]
  #       args:
  #         - --target-version=py311
  #         - --line-length=80

View File

@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE. SOFTWARE.

View File

@@ -1,3 +1,7 @@
.PHONY: lint
lint:
poetry run pre-commit run --all-files
.PHONY: clean-dev .PHONY: clean-dev
clean-dev: clean-dev:
@echo -n "Are you sure? [yes/N] (this will delete volumes) " && read ans && [ $${ans:-N} = yes ] @echo -n "Are you sure? [yes/N] (this will delete volumes) " && read ans && [ $${ans:-N} = yes ]

View File

@@ -14,7 +14,7 @@ To properly set up the API you need to install `docker` and to have these files,
2. go through the example file and configure it according to your needs. 2. go through the example file and configure it according to your needs.
3. you will need to create and reference at least one `secrets/orchestration.yaml` file, you can do so by following the instructions in the [auto-archiver](https://github.com/bellingcat/auto-archiver#installation) that automatically generates one for you. If you use the archive sheets feature you will need to create a `orchestrationsheets-sheets.yaml` file as well that should have the `gsheet_feeder` and `gsheet_db` enabled and configured, the auto-archiver has [extensive documentation](https://auto-archiver.readthedocs.io/en/latest/) on how to set this up. 3. you will need to create and reference at least one `secrets/orchestration.yaml` file, you can do so by following the instructions in the [auto-archiver](https://github.com/bellingcat/auto-archiver#installation) that automatically generates one for you. If you use the archive sheets feature you will need to create a `orchestrationsheets-sheets.yaml` file as well that should have the `gsheet_feeder` and `gsheet_db` enabled and configured, the auto-archiver has [extensive documentation](https://auto-archiver.readthedocs.io/en/latest/) on how to set this up.
Do not commit those files, they are .gitignored by default. Do not commit those files, they are .gitignored by default.
We also advise you to keep any sensitive files in the `secrets/` folder which is pinned and gitignored. We also advise you to keep any sensitive files in the `secrets/` folder which is pinned and gitignored.
We have examples for both of those files (`.env.example` and `user-groups.example.yaml`), and here's how to set them up whether you're in development or production: We have examples for both of those files (`.env.example` and `user-groups.example.yaml`), and here's how to set them up whether you're in development or production:

View File

@@ -1,11 +1,11 @@
from logging.config import fileConfig from logging.config import fileConfig
from sqlalchemy import engine_from_config
from sqlalchemy import pool
from alembic import context from alembic import context
from sqlalchemy import engine_from_config, pool
from app.shared.settings import get_settings from app.shared.settings import get_settings
# this is the Alembic Config object, which provides # this is the Alembic Config object, which provides
# access to the values within the .ini file in use. # access to the values within the .ini file in use.
config = context.config config = context.config

View File

@@ -5,8 +5,8 @@ Revises: 1636724ec4b1
Create Date: 2025-02-08 15:22:20.392522 Create Date: 2025-02-08 15:22:20.392522
""" """
from alembic import op
import sqlalchemy as sa import sqlalchemy as sa
from alembic import op
# revision identifiers, used by Alembic. # revision identifiers, used by Alembic.

View File

@@ -5,8 +5,8 @@ Revises: a23aaf3ae930
Create Date: 2025-02-05 19:19:01.984396 Create Date: 2025-02-05 19:19:01.984396
""" """
from alembic import op
import sqlalchemy as sa import sqlalchemy as sa
from alembic import op
# revision identifiers, used by Alembic. # revision identifiers, used by Alembic.

View File

@@ -5,8 +5,8 @@ Revises: 02b2f6d17ed0
Create Date: 2025-02-11 21:53:23.293274 Create Date: 2025-02-11 21:53:23.293274
""" """
from alembic import op
import sqlalchemy as sa import sqlalchemy as sa
from alembic import op
# revision identifiers, used by Alembic. # revision identifiers, used by Alembic.

View File

@@ -5,8 +5,8 @@ Revises: fa012ec405b8
Create Date: 2024-11-04 11:12:30.237299 Create Date: 2024-11-04 11:12:30.237299
""" """
from alembic import op
import sqlalchemy as sa import sqlalchemy as sa
from alembic import op
from sqlalchemy.engine.reflection import Inspector from sqlalchemy.engine.reflection import Inspector

View File

@@ -1,12 +1,13 @@
"""modify archive url to have uuid id instead of url unique constraint """modify archive url to have uuid id instead of url unique constraint
Revision ID: 9369a264945b Revision ID: 9369a264945b
Revises: Revises:
Create Date: 2023-12-20 17:24:59.320691 Create Date: 2023-12-20 17:24:59.320691
""" """
from alembic import op from alembic import op
# revision identifiers, used by Alembic. # revision identifiers, used by Alembic.
revision = '9369a264945b' revision = '9369a264945b'
down_revision = None down_revision = None

View File

@@ -5,8 +5,8 @@ Revises: 89121d2c96d8
Create Date: 2025-02-04 12:19:20.753570 Create Date: 2025-02-04 12:19:20.753570
""" """
from alembic import op
import sqlalchemy as sa import sqlalchemy as sa
from alembic import op
# revision identifiers, used by Alembic. # revision identifiers, used by Alembic.

View File

@@ -5,8 +5,8 @@ Revises: 93a611e4c066
Create Date: 2024-10-31 09:36:50.360710 Create Date: 2024-10-31 09:36:50.360710
""" """
from alembic import op
import sqlalchemy as sa import sqlalchemy as sa
from alembic import op
from sqlalchemy.engine.reflection import Inspector from sqlalchemy.engine.reflection import Inspector

View File

@@ -1,11 +1,13 @@
# TODO: code in this file should eventually be moved to the auto-archiver code base # TODO: code in this file should eventually be moved to the auto-archiver code base
from typing import List from typing import List
from loguru import logger
from auto_archiver.core import Media, Metadata from auto_archiver.core import Media, Metadata
from loguru import logger
from app.shared.db import models from app.shared.db import models
def get_all_urls(result: Metadata) -> List[models.ArchiveUrl]: def get_all_urls(result: Metadata) -> List[models.ArchiveUrl]:
db_urls = [] db_urls = []
for m in result.media: for m in result.media:
@@ -29,4 +31,3 @@ def convert_if_media(media):
except Exception as e: except Exception as e:
logger.debug(f"error parsing {media} : {e}") logger.debug(f"error parsing {media} : {e}")
return False return False

View File

@@ -2,6 +2,7 @@
import datetime import datetime
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from app.shared.db import worker_crud from app.shared.db import worker_crud

View File

@@ -1,8 +1,14 @@
from functools import lru_cache
from sqlalchemy import Engine, create_engine, event, text
from sqlalchemy.orm import sessionmaker
from contextlib import asynccontextmanager, contextmanager from contextlib import asynccontextmanager, contextmanager
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, AsyncEngine, async_sessionmaker from functools import lru_cache
from sqlalchemy import Engine, create_engine, event, text
from sqlalchemy.ext.asyncio import (
AsyncEngine,
AsyncSession,
async_sessionmaker,
create_async_engine,
)
from sqlalchemy.orm import sessionmaker
from app.shared.settings import get_settings from app.shared.settings import get_settings

View File

@@ -1,8 +1,17 @@
from sqlalchemy import Column, String, JSON, DateTime, Boolean, Table, ForeignKey
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship, declarative_base
import uuid import uuid
from sqlalchemy import (
JSON,
Boolean,
Column,
DateTime,
ForeignKey,
String,
Table,
)
from sqlalchemy.orm import declarative_base, relationship
from sqlalchemy.sql import func
Base = declarative_base() Base = declarative_base()

View File

@@ -1,8 +1,10 @@
from sqlalchemy.orm import Session
from datetime import datetime from datetime import datetime
from app.shared.db import models from sqlalchemy.orm import Session
from app.shared import schemas from app.shared import schemas
from app.shared.db import models
# TODO: isolate database operations away from worker and into WEB # TODO: isolate database operations away from worker and into WEB
# ONLY WORKER # ONLY WORKER

View File

@@ -1,4 +1,5 @@
import traceback import traceback
from loguru import logger from loguru import logger
@@ -6,7 +7,7 @@ from loguru import logger
logger.add("logs/api_logs.log", retention="30 days") logger.add("logs/api_logs.log", retention="30 days")
logger.add("logs/error_logs.log", retention="30 days", level="ERROR") logger.add("logs/error_logs.log", retention="30 days", level="ERROR")
def log_error(e: Exception, traceback_str: str = None, extra:str = ""): def log_error(e: Exception, traceback_str: str = None, extra:str = ""):
if not traceback_str: traceback_str = traceback.format_exc() if not traceback_str: traceback_str = traceback.format_exc()
if extra: extra = f"{extra}\n" if extra: extra = f"{extra}\n"

View File

@@ -1,7 +1,8 @@
from datetime import datetime
from typing import Annotated from typing import Annotated
from annotated_types import Len from annotated_types import Len
from pydantic import BaseModel from pydantic import BaseModel
from datetime import datetime
class SubmitSheet(BaseModel): class SubmitSheet(BaseModel):

View File

@@ -1,14 +1,15 @@
from functools import lru_cache
import os import os
from functools import lru_cache
from typing import Annotated, Set
from annotated_types import Len
from fastapi_mail import ConnectionConfig from fastapi_mail import ConnectionConfig
from pydantic_settings import BaseSettings, SettingsConfigDict from pydantic_settings import BaseSettings, SettingsConfigDict
from typing import Annotated, Set
from annotated_types import Len
class Settings(BaseSettings): class Settings(BaseSettings):
model_config = SettingsConfigDict(env_file=os.environ.get("ENVIRONMENT_FILE") , env_file_encoding='utf-8', extra='ignore', str_strip_whitespace=True) model_config = SettingsConfigDict(env_file=os.environ.get("ENVIRONMENT_FILE") , env_file_encoding='utf-8', extra='ignore', str_strip_whitespace=True)
# general # general
@@ -37,14 +38,14 @@ class Settings(BaseSettings):
if self.REDIS_PASSWORD: if self.REDIS_PASSWORD:
return f"redis://:{self.REDIS_PASSWORD}@{self.REDIS_HOSTNAME}:6379" return f"redis://:{self.REDIS_PASSWORD}@{self.REDIS_HOSTNAME}:6379"
return f"redis://{self.REDIS_HOSTNAME}:6379" return f"redis://{self.REDIS_HOSTNAME}:6379"
# cronjobs # cronjobs
CRON_ARCHIVE_SHEETS: bool = False CRON_ARCHIVE_SHEETS: bool = False
CRON_DELETE_STALE_SHEETS: bool = False CRON_DELETE_STALE_SHEETS: bool = False
DELETE_STALE_SHEETS_DAYS: int = 14 DELETE_STALE_SHEETS_DAYS: int = 14
CRON_DELETE_SCHEDULED_ARCHIVES: bool = False CRON_DELETE_SCHEDULED_ARCHIVES: bool = False
DELETE_SCHEDULED_ARCHIVES_CHECK_EVERY_N_DAYS: int = 7 DELETE_SCHEDULED_ARCHIVES_CHECK_EVERY_N_DAYS: int = 7
# observability # observability
REPEAT_COUNT_METRICS_SECONDS: int = 30 REPEAT_COUNT_METRICS_SECONDS: int = 30
@@ -73,4 +74,4 @@ class Settings(BaseSettings):
@lru_cache @lru_cache
def get_settings(): def get_settings():
return Settings() return Settings()

View File

@@ -1,8 +1,9 @@
from functools import lru_cache from functools import lru_cache
from celery import Celery
import redis
from celery import Celery
import redis
from app.shared.settings import get_settings from app.shared.settings import get_settings

View File

@@ -1,9 +1,16 @@
import json import json
import os import os
from typing import Dict, List, Set
import yaml import yaml
from loguru import logger from loguru import logger
from pydantic import BaseModel, computed_field, field_validator, Field, model_validator from pydantic import (
from typing import Dict, List, Set BaseModel,
Field,
computed_field,
field_validator,
model_validator,
)
from typing_extensions import Self from typing_extensions import Self

View File

@@ -7,4 +7,4 @@ def fnv1a_hash_mod(s: str, modulo:int) -> int:
hash ^= ord(char) hash ^= ord(char)
hash *= fnv_prime hash *= fnv_prime
hash &= 0xFFFFFFFF # Keep it 32-bit hash &= 0xFFFFFFFF # Keep it 32-bit
return (hash if hash < 0x80000000 else hash - 0x100000000) % modulo return (hash if hash < 0x80000000 else hash - 0x100000000) % modulo

View File

@@ -1,12 +1,14 @@
import os import os
from typing import AsyncGenerator from typing import AsyncGenerator
from fastapi.testclient import TestClient
import pytest
from unittest.mock import patch from unittest.mock import patch
import pytest
import pytest_asyncio import pytest_asyncio
from sqlalchemy.ext.asyncio import AsyncSession, AsyncEngine from fastapi.testclient import TestClient
from app.web.config import ALLOW_ANY_EMAIL from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession
from app.shared.settings import Settings from app.shared.settings import Settings
from app.web.config import ALLOW_ANY_EMAIL
from app.web.db.user_state import UserState from app.web.db.user_state import UserState
@@ -65,10 +67,11 @@ def db_session(test_db):
@pytest_asyncio.fixture() @pytest_asyncio.fixture()
async def async_test_db(get_settings: Settings): async def async_test_db(get_settings: Settings):
import asyncio
from app.shared.db import models from app.shared.db import models
from app.shared.db.database import make_async_engine from app.shared.db.database import make_async_engine
from app.web.db.crud import get_user_group_names from app.web.db.crud import get_user_group_names
import asyncio
get_user_group_names.cache_clear() get_user_group_names.cache_clear()
engine = await make_async_engine(get_settings.ASYNC_DATABASE_PATH) engine = await make_async_engine(get_settings.ASYNC_DATABASE_PATH)
@@ -108,8 +111,8 @@ async def async_db_session(async_test_db: AsyncEngine) -> AsyncGenerator[AsyncSe
@pytest.fixture() @pytest.fixture()
def app(db_session): def app(db_session):
from app.web.main import app_factory
from app.web.db import crud from app.web.db import crud
from app.web.main import app_factory
app = app_factory() app = app_factory()
crud.upsert_user_groups(db_session) crud.upsert_user_groups(db_session)
return app return app
@@ -123,7 +126,11 @@ def client(app):
@pytest.fixture() @pytest.fixture()
def app_with_auth(app, db_session): def app_with_auth(app, db_session):
from app.web.security import get_token_or_user_auth, get_user_auth, get_user_state from app.web.security import (
get_token_or_user_auth,
get_user_auth,
get_user_state,
)
app.dependency_overrides[get_token_or_user_auth] = lambda: "rick@example.com" app.dependency_overrides[get_token_or_user_auth] = lambda: "rick@example.com"
app.dependency_overrides[get_user_auth] = lambda: "morty@example.com" app.dependency_overrides[get_user_auth] = lambda: "morty@example.com"
app.dependency_overrides[get_user_state] = lambda: UserState(db_session, "MORTY@example.com") app.dependency_overrides[get_user_state] = lambda: UserState(db_session, "MORTY@example.com")
@@ -138,7 +145,7 @@ def client_with_auth(app_with_auth):
@pytest.fixture() @pytest.fixture()
def app_with_token(app): def app_with_token(app):
from app.web.security import token_api_key_auth, get_token_or_user_auth from app.web.security import get_token_or_user_auth, token_api_key_auth
app.dependency_overrides[token_api_key_auth] = lambda: ALLOW_ANY_EMAIL app.dependency_overrides[token_api_key_auth] = lambda: ALLOW_ANY_EMAIL
app.dependency_overrides[get_token_or_user_auth] = lambda: ALLOW_ANY_EMAIL app.dependency_overrides[get_token_or_user_auth] = lambda: ALLOW_ANY_EMAIL
return app return app

View File

@@ -1,3 +1,3 @@
{ {
"client_email": "fake_service_account@fake_service_account.iam.gserviceaccount.com" "client_email": "fake_service_account@fake_service_account.iam.gserviceaccount.com"
} }

View File

@@ -15,7 +15,7 @@ configurations:
gsheet_feeder: gsheet_feeder:
service_account: "app/tests/fake_service_account.json" service_account: "app/tests/fake_service_account.json"
cli_feeder: cli_feeder:
urls: urls:
- "url1" - "url1"
hash_enricher: hash_enricher:
algorithm: "SHA-256" algorithm: "SHA-256"

View File

@@ -3,4 +3,4 @@ def test_generate_uuid():
assert generate_uuid() != generate_uuid() assert generate_uuid() != generate_uuid()
assert len(generate_uuid()) == 36 assert len(generate_uuid()) == 36
assert generate_uuid().count("-") == 4 assert generate_uuid().count("-") == 4

View File

@@ -1,10 +1,9 @@
from app.shared.db import models
from app.shared.db import worker_crud, models
from datetime import datetime from datetime import datetime
from app.shared.db import models, worker_crud
from app.tests.web.db.test_crud import test_data from app.tests.web.db.test_crud import test_data
def test_update_sheet_last_url_archived_at(db_session): def test_update_sheet_last_url_archived_at(db_session):
# Create test sheet # Create test sheet
@@ -19,7 +18,7 @@ def test_update_sheet_last_url_archived_at(db_session):
db_session.refresh(test_sheet) db_session.refresh(test_sheet)
assert isinstance(test_sheet.last_url_archived_at, datetime) assert isinstance(test_sheet.last_url_archived_at, datetime)
assert test_sheet.last_url_archived_at > before assert test_sheet.last_url_archived_at > before
# Test non-existent sheet # Test non-existent sheet
assert worker_crud.update_sheet_last_url_archived_at(db_session, "non-existent-sheet") is False assert worker_crud.update_sheet_last_url_archived_at(db_session, "non-existent-sheet") is False
@@ -73,8 +72,8 @@ def test_create_tag(db_session):
def test_create_task(db_session): def test_create_task(db_session):
from app.shared.db import worker_crud
from app.shared import schemas from app.shared import schemas
from app.shared.db import worker_crud
task = schemas.ArchiveCreate( task = schemas.ArchiveCreate(
id="archive-id-456-101", id="archive-id-456-101",
@@ -114,4 +113,4 @@ def test_create_task(db_session):
assert nt.group_id == "spaceship" assert nt.group_id == "spaceship"
assert len(nt.tags) == 0 assert len(nt.tags) == 0
assert len(nt.urls) == 0 assert len(nt.urls) == 0
assert nt.created_at is not None assert nt.created_at is not None

View File

@@ -1,7 +1,12 @@
from datetime import datetime, timedelta from datetime import datetime, timedelta
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
import pytest import pytest
from app.shared.business_logic import get_store_archive_until, get_store_archive_until_or_never
from app.shared.business_logic import (
get_store_archive_until,
get_store_archive_until_or_never,
)
class Test_get_store_archive_until: class Test_get_store_archive_until:

View File

@@ -11,7 +11,7 @@ def test_fnv1a_hash_mod():
# Test different modulos # Test different modulos
hash1 = fnv1a_hash_mod("test", 5) hash1 = fnv1a_hash_mod("test", 5)
hash2 = fnv1a_hash_mod("test", 10) hash2 = fnv1a_hash_mod("test", 10)
assert 0 <= hash1 < 5 assert 0 <= hash1 < 5
assert 0 <= hash2 < 10 assert 0 <= hash2 < 10
@@ -28,4 +28,4 @@ def test_fnv1a_hash_mod():
assert 0 <= fnv1a_hash_mod("测试", 10) < 10 assert 0 <= fnv1a_hash_mod("测试", 10) < 10
# Test modulo = 1 edge case # Test modulo = 1 edge case
assert fnv1a_hash_mod("test", 1) == 0 assert fnv1a_hash_mod("test", 1) == 0

View File

@@ -3,4 +3,4 @@ This is just an invalid yaml for testing
still broken: True still broken: True
- one - one
- two - two

View File

@@ -84,4 +84,4 @@ groups:
# max_archive_lifespan_months: 12 # max_archive_lifespan_months: 12
max_monthly_urls: 1 max_monthly_urls: 1
# max_monthly_mbs: 50 # max_monthly_mbs: 50
priority: "low" priority: "low"

View File

@@ -3,10 +3,12 @@ from unittest.mock import patch
import pytest import pytest
import yaml import yaml
from app.shared.db import models from app.shared.db import models
from app.shared.settings import Settings from app.shared.settings import Settings
from app.web.db import crud from app.web.db import crud
authors = ["rick@example.com", "morty@example.com", "jerry@example.com"] authors = ["rick@example.com", "morty@example.com", "jerry@example.com"]
@@ -373,6 +375,7 @@ async def test_get_sheets_by_id_hash(async_db_session):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_delete_stale_sheets(async_db_session): async def test_delete_stale_sheets(async_db_session):
from datetime import datetime, timedelta from datetime import datetime, timedelta
from sqlalchemy.sql import select from sqlalchemy.sql import select
now = datetime.now() now = datetime.now()
@@ -435,4 +438,4 @@ async def test_delete_stale_sheets(async_db_session):
# Running again should not delete anything # Running again should not delete anything
deleted = await crud.delete_stale_sheets(async_db_session, 7) deleted = await crud.delete_stale_sheets(async_db_session, 7)
assert len(deleted) == 0 assert len(deleted) == 0

View File

@@ -1,5 +1,6 @@
from unittest.mock import MagicMock, PropertyMock, patch from unittest.mock import MagicMock, PropertyMock, patch
import pytest import pytest
from app.shared.db import models from app.shared.db import models

View File

@@ -1,10 +1,12 @@
from unittest.mock import MagicMock from unittest.mock import MagicMock
from fastapi.testclient import TestClient
import pytest import pytest
from fastapi.testclient import TestClient
from app.shared.schemas import Usage, UsageResponse from app.shared.schemas import Usage, UsageResponse
from app.shared.user_groups import GroupInfo from app.shared.user_groups import GroupInfo
from app.web.config import VERSION
from app.tests.web.db.test_crud import test_data from app.tests.web.db.test_crud import test_data
from app.web.config import VERSION
def test_endpoint_home(client_with_auth): def test_endpoint_home(client_with_auth):

View File

@@ -1,5 +1,5 @@
from datetime import datetime
import json import json
from datetime import datetime
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
from app.shared.db import models from app.shared.db import models

View File

@@ -1,5 +1,5 @@
from datetime import datetime
import json import json
from datetime import datetime
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
from fastapi.testclient import TestClient from fastapi.testclient import TestClient
@@ -45,8 +45,8 @@ def test_create_sheet_endpoint(app_with_auth, db_session):
assert response.json() == {"detail": "User does not have access to this group."} assert response.json() == {"detail": "User does not have access to this group."}
# switch to jerry who's got less quota/permissions # switch to jerry who's got less quota/permissions
from app.web.security import get_user_state
from app.web.db.user_state import UserState from app.web.db.user_state import UserState
from app.web.security import get_user_state
app_with_auth.dependency_overrides[get_user_state] = lambda: UserState(db_session, "jerry@example.com") app_with_auth.dependency_overrides[get_user_state] = lambda: UserState(db_session, "jerry@example.com")
client_jerry = TestClient(app_with_auth) client_jerry = TestClient(app_with_auth)

View File

@@ -1,10 +1,10 @@
import os import os
from unittest.mock import patch
from fastapi.testclient import TestClient
import shutil import shutil
from unittest.mock import patch
import pytest import pytest
from fastapi.testclient import TestClient
def test_lifespan(app): def test_lifespan(app):
with TestClient(app) as client: with TestClient(app) as client:
@@ -25,7 +25,7 @@ def test_logging_middleware(m1, client_with_auth):
client_with_auth.delete("/url/123") client_with_auth.delete("/url/123")
# creates one empty and one from above # creates one empty and one from above
assert len(EXCEPTION_COUNTER.collect()[0].samples) == 2 assert len(EXCEPTION_COUNTER.collect()[0].samples) == 2
def test_serve_local_archive_logic(get_settings): def test_serve_local_archive_logic(get_settings):
# create a test file first # create a test file first
@@ -38,7 +38,7 @@ def test_serve_local_archive_logic(get_settings):
get_settings.SERVE_LOCAL_ARCHIVE = "/app/local_archive_test" get_settings.SERVE_LOCAL_ARCHIVE = "/app/local_archive_test"
from app.web.main import app_factory from app.web.main import app_factory
app = app_factory(get_settings) app = app_factory(get_settings)
# test # test
client = TestClient(app) client = TestClient(app)
r = client.get("/app/local_archive_test/temp.txt") r = client.get("/app/local_archive_test/temp.txt")

View File

@@ -1,8 +1,8 @@
from unittest.mock import Mock, patch from unittest.mock import Mock, patch
import pytest
from fastapi import HTTPException from fastapi import HTTPException
from fastapi.security import HTTPAuthorizationCredentials from fastapi.security import HTTPAuthorizationCredentials
import pytest
from app.web.config import ALLOW_ANY_EMAIL from app.web.config import ALLOW_ANY_EMAIL
@@ -108,8 +108,8 @@ async def test_authenticate_user_exception():
def test_get_user_state(): def test_get_user_state():
from app.web.security import get_user_state
from app.web.db.user_state import UserState from app.web.db.user_state import UserState
from app.web.security import get_user_state
mock_session = Mock() mock_session = Mock()
test_email = "test@example.com" test_email = "test@example.com"

View File

@@ -1,13 +1,12 @@
from datetime import datetime from datetime import datetime
from unittest.mock import patch from unittest.mock import patch
import pytest import pytest
from app.shared.db import models
from app.shared import schemas
from auto_archiver.core import Media, Metadata from auto_archiver.core import Media, Metadata
from app.shared import schemas
from app.shared.db import models
class Test_create_archive_task(): class Test_create_archive_task():
URL = "https://example-live.com" URL = "https://example-live.com"

View File

@@ -1,3 +1,4 @@
from app.web.main import app_factory from app.web.main import app_factory
app = app_factory
app = app_factory

View File

@@ -5,7 +5,7 @@ API_DESCRIPTION = """
**Usage notes:** **Usage notes:**
- The API requires a Bearer token for most operations, which you can obtain by logging in with your Google account. - The API requires a Bearer token for most operations, which you can obtain by logging in with your Google account.
- You can use this API to archive single URLs or entire Google Sheets. - You can use this API to archive single URLs or entire Google Sheets.
- Once you submit a URL or Sheet for archiving, the API will return a task_id that you can use to check the status of the archiving process. It works asynchronously. - Once you submit a URL or Sheet for archiving, the API will return a task_id that you can use to check the status of the archiving process. It works asynchronously.
""" """
BREAKING_CHANGES = {"minVersion": "0.4.0", "message": "The latest update has breaking changes, please update the extension to the most recent version."} BREAKING_CHANGES = {"minVersion": "0.4.0", "message": "The latest update has breaking changes, please update the extension to the most recent version."}

View File

@@ -1,18 +1,19 @@
from collections import defaultdict from collections import defaultdict
from functools import lru_cache
from sqlalchemy.orm import Session, load_only
from sqlalchemy import Column, or_, func, select
from loguru import logger
from datetime import datetime, timedelta from datetime import datetime, timedelta
from sqlalchemy.ext.asyncio import AsyncSession from functools import lru_cache
from cachetools import LRUCache, cached from cachetools import LRUCache, cached
from cachetools.keys import hashkey from cachetools.keys import hashkey
from loguru import logger
from sqlalchemy import Column, func, or_, select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import Session, load_only
from app.web.config import ALLOW_ANY_EMAIL
from app.shared.db import models from app.shared.db import models
from app.shared.settings import get_settings from app.shared.settings import get_settings
from app.shared.user_groups import UserGroups from app.shared.user_groups import UserGroups
from app.shared.utils.misc import fnv1a_hash_mod from app.shared.utils.misc import fnv1a_hash_mod
from app.web.config import ALLOW_ANY_EMAIL
from app.web.utils.misc import convert_priority_to_queue_dict from app.web.utils.misc import convert_priority_to_queue_dict
@@ -117,7 +118,7 @@ async def get_group_priority_async(db: AsyncSession, group_id: str) -> dict:
@cached(cache=LRUCache(maxsize=128), key=lambda db, email: hashkey(email)) @cached(cache=LRUCache(maxsize=128), key=lambda db, email: hashkey(email))
def get_user_group_names(db: Session, email: str) -> list[str]: def get_user_group_names(db: Session, email: str) -> list[str]:
""" """
given an email retrieves the user groups from the DB and then the email-domain groups from a global variable, the email does not need to belong to an existing user. given an email retrieves the user groups from the DB and then the email-domain groups from a global variable, the email does not need to belong to an existing user.
""" """
# TODO: the read: [group1, group2] permissions don't currently work # TODO: the read: [group1, group2] permissions don't currently work
if not email or not len(email) or "@" not in email: return [] if not email or not len(email) or "@" not in email: return []
@@ -173,7 +174,7 @@ def upsert_user_groups(db: Session):
def display_email_pii(email: str): def display_email_pii(email: str):
return f"'{email[0:3]}...@{email.split('@')[1]}'" return f"'{email[0:3]}...@{email.split('@')[1]}'"
""" """
reads the user_groups yaml file and inserts any new users, groups, reads the user_groups yaml file and inserts any new users, groups,
along with new participation of users in groups along with new participation of users in groups
""" """
filename = get_settings().USER_GROUPS_FILENAME filename = get_settings().USER_GROUPS_FILENAME
@@ -192,6 +193,7 @@ def upsert_user_groups(db: Session):
for group in explicit_groups: for group in explicit_groups:
group_domains[group].add(domain) group_domains[group].add(domain)
import json import json
# upsert groups and save a map of groupid -> dbobject # upsert groups and save a map of groupid -> dbobject
for group_id, g in ug.groups.items(): for group_id, g in ug.groups.items():
upsert_group(db, group_id, g.description, g.orchestrator, g.orchestrator_sheet, g.service_account_email, json.loads(g.permissions.model_dump_json()), list(group_domains.get(group_id, []))) upsert_group(db, group_id, g.description, g.orchestrator, g.orchestrator_sheet, g.service_account_email, json.loads(g.permissions.model_dump_json()), list(group_domains.get(group_id, [])))

View File

@@ -1,13 +1,14 @@
from typing import Dict, Set
import sqlalchemy
from sqlalchemy.orm import Session
from sqlalchemy import func
from datetime import datetime from datetime import datetime
from typing import Dict, Set
import sqlalchemy
from sqlalchemy import func
from sqlalchemy.orm import Session
from app.shared.db import models from app.shared.db import models
from app.shared.user_groups import GroupInfo, GroupPermissions
from app.shared.schemas import Usage, UsageResponse from app.shared.schemas import Usage, UsageResponse
from app.shared.user_groups import GroupInfo, GroupPermissions
from app.web.db import crud from app.web.db import crud
from app.web.utils.misc import convert_priority_to_queue_dict from app.web.utils.misc import convert_priority_to_queue_dict

View File

@@ -1,13 +1,15 @@
from typing import Dict from typing import Dict
from fastapi import APIRouter, Depends, HTTPException from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import FileResponse, JSONResponse from fastapi.responses import FileResponse, JSONResponse
from app.web.config import VERSION, BREAKING_CHANGES
from app.shared.schemas import ActiveUser, UsageResponse from app.shared.schemas import ActiveUser, UsageResponse
from app.shared.user_groups import GroupInfo
from app.web.config import BREAKING_CHANGES, VERSION
from app.web.db.user_state import UserState from app.web.db.user_state import UserState
from app.web.security import get_user_state from app.web.security import get_user_state
from app.shared.user_groups import GroupInfo
default_router = APIRouter() default_router = APIRouter()
@@ -42,7 +44,7 @@ def get_user_usage(
if not user.active: if not user.active:
raise HTTPException(status_code=403, detail="User is not active.") raise HTTPException(status_code=403, detail="User is not active.")
return user.usage() return user.usage()
@default_router.get('/favicon.ico', include_in_schema=False) @default_router.get('/favicon.ico', include_in_schema=False)

View File

@@ -1,19 +1,19 @@
import json import json
import sqlalchemy
from auto_archiver.core import Metadata
from fastapi import APIRouter, Depends, HTTPException from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from loguru import logger from loguru import logger
import sqlalchemy
from auto_archiver.core import Metadata
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from app.shared.aa_utils import get_all_urls
from app.web.config import ALLOW_ANY_EMAIL
from app.shared import business_logic, schemas from app.shared import business_logic, schemas
from app.shared.db import worker_crud from app.shared.aa_utils import get_all_urls
from app.shared.db import models, worker_crud
from app.shared.db.database import get_db_dependency from app.shared.db.database import get_db_dependency
from app.web.security import token_api_key_auth
from app.shared.db import models
from app.shared.log import log_error from app.shared.log import log_error
from app.web.config import ALLOW_ANY_EMAIL
from app.web.security import token_api_key_auth
interoperability_router = APIRouter(prefix="/interop", tags=["Interoperability endpoints."]) interoperability_router = APIRouter(prefix="/interop", tags=["Interoperability endpoints."])

View File

@@ -1,16 +1,16 @@
from fastapi import APIRouter, Depends, HTTPException from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from sqlalchemy import exc from sqlalchemy import exc
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from app.web.db.user_state import UserState
from app.shared import schemas from app.shared import schemas
from app.shared.task_messaging import get_celery
from app.web.security import get_user_state
from app.web.db import crud
from app.shared.db.database import get_db_dependency from app.shared.db.database import get_db_dependency
from app.shared.task_messaging import get_celery
from app.web.db import crud
from app.web.db.user_state import UserState
from app.web.security import get_user_state
sheet_router = APIRouter(prefix="/sheet", tags=["Google Spreadsheet operations"]) sheet_router = APIRouter(prefix="/sheet", tags=["Google Spreadsheet operations"])
@@ -78,4 +78,4 @@ def archive_user_sheet(
group_queue = user.priority_group(sheet.group_id) group_queue = user.priority_group(sheet.group_id)
task = celery.signature("create_sheet_task", args=[schemas.SubmitSheet(sheet_id=id, author_id=user.email, group_id=sheet.group_id).model_dump_json()]).apply_async(**group_queue) task = celery.signature("create_sheet_task", args=[schemas.SubmitSheet(sheet_id=id, author_id=user.email, group_id=sheet.group_id).model_dump_json()]).apply_async(**group_queue)
return JSONResponse({"id": task.id}, status_code=201) return JSONResponse({"id": task.id}, status_code=201)

View File

@@ -3,10 +3,10 @@ from fastapi import APIRouter, Depends
from fastapi.encoders import jsonable_encoder from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from app.shared.task_messaging import get_celery
from app.web.security import get_token_or_user_auth
from app.shared import schemas from app.shared import schemas
from app.shared.log import log_error from app.shared.log import log_error
from app.shared.task_messaging import get_celery
from app.web.security import get_token_or_user_auth
from app.web.utils.misc import custom_jsonable_encoder from app.web.utils.misc import custom_jsonable_encoder

View File

@@ -1,22 +1,22 @@
from datetime import datetime
from urllib.parse import urlparse
from fastapi import APIRouter, Depends, HTTPException from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from datetime import datetime
from loguru import logger from loguru import logger
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from app.web.config import ALLOW_ANY_EMAIL
from app.shared import schemas from app.shared import schemas
from app.shared.db.database import get_db_dependency
from app.shared.task_messaging import get_celery from app.shared.task_messaging import get_celery
from app.web.security import get_token_or_user_auth, get_user_state from app.web.config import ALLOW_ANY_EMAIL
from app.web.db import crud from app.web.db import crud
from app.web.db.user_state import UserState from app.web.db.user_state import UserState
from app.shared.db.database import get_db_dependency from app.web.security import get_token_or_user_auth, get_user_state
from urllib.parse import urlparse
from app.web.utils.misc import convert_priority_to_queue_dict from app.web.utils.misc import convert_priority_to_queue_dict
url_router = APIRouter(prefix="/url", tags=["Single URL operations"]) url_router = APIRouter(prefix="/url", tags=["Single URL operations"])
celery = get_celery() celery = get_celery()
@@ -47,7 +47,7 @@ def archive_url(
else: else:
archive_create.author_id = archive.author_id or email archive_create.author_id = archive.author_id or email
group_queue = convert_priority_to_queue_dict("high") group_queue = convert_priority_to_queue_dict("high")
task = celery.signature("create_archive_task", args=[archive_create.model_dump_json()]).apply_async(**group_queue) task = celery.signature("create_archive_task", args=[archive_create.model_dump_json()]).apply_async(**group_queue)
task_response = schemas.Task(id=task.id) task_response = schemas.Task(id=task.id)
@@ -74,8 +74,8 @@ def search_by_url(
@url_router.delete("/{id}", summary="Delete a single URL archive by id.") @url_router.delete("/{id}", summary="Delete a single URL archive by id.")
def delete_archive( def delete_archive(
id:str, id:str,
user: UserState = Depends(get_user_state), user: UserState = Depends(get_user_state),
db: Session = Depends(get_db_dependency) db: Session = Depends(get_db_dependency)
) -> schemas.DeleteResponse: ) -> schemas.DeleteResponse:
logger.info(f"deleting url archive task {id} request by {user.email}") logger.info(f"deleting url archive task {id} request by {user.email}")

View File

@@ -1,22 +1,32 @@
import asyncio import asyncio
from collections import defaultdict
import datetime import datetime
import logging import logging
from collections import defaultdict
from contextlib import asynccontextmanager
import alembic.config import alembic.config
from fastapi import FastAPI from fastapi import FastAPI
from contextlib import asynccontextmanager from fastapi_mail import FastMail, MessageSchema, MessageType
from fastapi_utils.tasks import repeat_every from fastapi_utils.tasks import repeat_every
from loguru import logger from loguru import logger
from fastapi_mail import FastMail, MessageSchema, MessageType
from app.shared.db import models
from app.shared.db.database import get_db, get_db_async, make_engine, wal_checkpoint
from app.shared import schemas from app.shared import schemas
from app.shared.db import models
from app.shared.db.database import (
get_db,
get_db_async,
make_engine,
wal_checkpoint,
)
from app.shared.settings import get_settings from app.shared.settings import get_settings
from app.shared.task_messaging import get_celery from app.shared.task_messaging import get_celery
from app.web.db import crud from app.web.db import crud
from app.web.middleware import increase_exceptions_counter from app.web.middleware import increase_exceptions_counter
from app.web.utils.metrics import measure_regular_metrics, redis_subscribe_worker_exceptions from app.web.utils.metrics import (
measure_regular_metrics,
redis_subscribe_worker_exceptions,
)
celery = get_celery() celery = get_celery()
@@ -183,4 +193,4 @@ async def delete_stale_sheets():
async def generate_users_export_csv(): async def generate_users_export_csv():
#TODO: implement a cronjob that regularly requested user data to a CSV file #TODO: implement a cronjob that regularly requested user data to a CSV file
# see https://colab.research.google.com/drive/1QDbo3QXHPBdiTuANlA1AWVvN-rqxuCPa?authuser=0#scrollTo=4nPXeSdK8RBT # see https://colab.research.google.com/drive/1QDbo3QXHPBdiTuANlA1AWVvN-rqxuCPa?authuser=0#scrollTo=4nPXeSdK8RBT
pass pass

View File

@@ -1,24 +1,23 @@
import os import os
from fastapi import FastAPI, Depends
from fastapi.staticfiles import StaticFiles from fastapi import Depends, FastAPI
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from prometheus_fastapi_instrumentator import Instrumentator from fastapi.staticfiles import StaticFiles
from loguru import logger from loguru import logger
from prometheus_fastapi_instrumentator import Instrumentator
from app.web.middleware import logging_middleware
from app.shared.task_messaging import get_celery
from app.web.security import token_api_key_auth
from app.web.config import VERSION, API_DESCRIPTION
from app.web.events import lifespan
from app.shared.settings import get_settings from app.shared.settings import get_settings
from app.shared.task_messaging import get_celery
from app.web.config import API_DESCRIPTION, VERSION
from app.web.endpoints.default import default_router from app.web.endpoints.default import default_router
from app.web.endpoints.url import url_router from app.web.endpoints.interoperability import interoperability_router
from app.web.endpoints.sheet import sheet_router from app.web.endpoints.sheet import sheet_router
from app.web.endpoints.task import task_router from app.web.endpoints.task import task_router
from app.web.endpoints.interoperability import interoperability_router from app.web.endpoints.url import url_router
from app.web.events import lifespan
from app.web.middleware import logging_middleware
from app.web.security import token_api_key_auth
celery = get_celery() celery = get_celery()
@@ -57,4 +56,4 @@ def app_factory(settings = get_settings()):
logger.warning(f"MOUNTing local archive, use this in development only {settings.SERVE_LOCAL_ARCHIVE}") logger.warning(f"MOUNTing local archive, use this in development only {settings.SERVE_LOCAL_ARCHIVE}")
app.mount(settings.SERVE_LOCAL_ARCHIVE, StaticFiles(directory=local_dir), name=settings.SERVE_LOCAL_ARCHIVE) app.mount(settings.SERVE_LOCAL_ARCHIVE, StaticFiles(directory=local_dir), name=settings.SERVE_LOCAL_ARCHIVE)
return app return app

View File

@@ -1,7 +1,9 @@
import traceback import traceback
from loguru import logger
from fastapi import Request from fastapi import Request
from loguru import logger
from app.shared.log import log_error from app.shared.log import log_error
from app.web.utils.metrics import EXCEPTION_COUNTER from app.web.utils.metrics import EXCEPTION_COUNTER
@@ -25,7 +27,7 @@ async def increase_exceptions_counter(e: Exception, location:str="cronjob"):
last_trace = traceback.extract_tb(e.__traceback__)[-1] last_trace = traceback.extract_tb(e.__traceback__)[-1]
_file, _line, func_name, _text = last_trace _file, _line, func_name, _text = last_trace
location = func_name location = func_name
except Exception as e: except Exception as e:
logger.error(f"Unable to get function name from cronjob exception traceback: {e}") logger.error(f"Unable to get function name from cronjob exception traceback: {e}")
EXCEPTION_COUNTER.labels(type=e.__class__.__name__, location=location).inc() EXCEPTION_COUNTER.labels(type=e.__class__.__name__, location=location).inc()
log_error(e) log_error(e)

View File

@@ -1,14 +1,17 @@
import secrets
import requests
from fastapi import Depends, HTTPException, status
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from loguru import logger from loguru import logger
import requests, secrets
from fastapi import HTTPException, status, Depends
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from app.web.config import ALLOW_ANY_EMAIL
from app.shared.settings import get_settings
from app.shared.db.database import get_db_dependency from app.shared.db.database import get_db_dependency
from app.shared.settings import get_settings
from app.web.config import ALLOW_ANY_EMAIL
from app.web.db.user_state import UserState from app.web.db.user_state import UserState
settings = get_settings() settings = get_settings()
bearer_security = HTTPBearer() bearer_security = HTTPBearer()
@@ -80,4 +83,4 @@ def authenticate_user(access_token):
def get_user_state(email:str=Depends(get_user_auth), db:Session=Depends(get_db_dependency)): def get_user_state(email:str=Depends(get_user_auth), db:Session=Depends(get_db_dependency)):
return UserState(db, email) return UserState(db, email)

View File

@@ -2,12 +2,13 @@ import asyncio
import json import json
import os import os
import shutil import shutil
from prometheus_client import Counter, Gauge from prometheus_client import Counter, Gauge
from app.web.db import crud
from app.shared.db.database import get_db from app.shared.db.database import get_db
from app.shared.log import log_error from app.shared.log import log_error
from app.shared.task_messaging import get_redis from app.shared.task_messaging import get_redis
from app.web.db import crud
# Custom metrics # Custom metrics

View File

@@ -1,4 +1,5 @@
import base64 import base64
from fastapi.encoders import jsonable_encoder from fastapi.encoders import jsonable_encoder

View File

@@ -1,21 +1,22 @@
import datetime
import json import json
import traceback
import traceback, datetime from auto_archiver.core.orchestrator import ArchivingOrchestrator
from celery.signals import task_failure from celery.signals import task_failure
from loguru import logger from loguru import logger
from sqlalchemy import exc from sqlalchemy import exc
from auto_archiver.core.orchestrator import ArchivingOrchestrator
from app.shared.db import models
from app.shared.db.database import get_db
from app.shared import business_logic, schemas from app.shared import business_logic, schemas
from app.shared.task_messaging import get_celery, get_redis
from app.shared.settings import get_settings
from app.shared.log import log_error
from app.shared.aa_utils import get_all_urls from app.shared.aa_utils import get_all_urls
from app.shared.db import worker_crud from app.shared.db import models, worker_crud
from app.shared.db.database import get_db
from app.shared.log import log_error
from app.shared.settings import get_settings
from app.shared.task_messaging import get_celery, get_redis
from app.worker.worker_log import setup_celery_logger from app.worker.worker_log import setup_celery_logger
settings = get_settings() settings = get_settings()
celery = get_celery("worker") celery = get_celery("worker")
@@ -26,7 +27,7 @@ USER_GROUPS_FILENAME = settings.USER_GROUPS_FILENAME
setup_celery_logger(celery) setup_celery_logger(celery)
# TODO: these are temporary PATCHES for new aa's functionality # TODO: these are temporary PATCHES for new aa's functionality
# logger.add("app/worker/worker_log.log", level="DEBUG") # logger.add("app/worker/worker_log.log", level="DEBUG")
logger.remove = lambda x: print(f"logger.remove({x})") logger.remove = lambda x: print(f"logger.remove({x})")
# TODO: after release, as it requires updating past entries with sheet_id where tag is used, drop tags # TODO: after release, as it requires updating past entries with sheet_id where tag is used, drop tags

View File

@@ -1,9 +1,11 @@
from loguru import logger
from celery import Celery
import sys import sys
from celery import Celery
from loguru import logger
from app.shared.task_messaging import get_celery from app.shared.task_messaging import get_celery
celery = get_celery("worker") celery = get_celery("worker")
def setup_celery_logger(celery): def setup_celery_logger(celery):
@@ -22,7 +24,7 @@ def setup_celery_logger(celery):
if message.strip(): if message.strip():
logger.info(message.strip()) logger.info(message.strip())
# Required to prevent issues with buffered output # Required to prevent issues with buffered output
def flush(self): pass def flush(self): pass
def isatty(self): return False def isatty(self): return False
sys.stdout = InterceptHandler() sys.stdout = InterceptHandler()

View File

@@ -12,7 +12,7 @@ services:
- ALLOWED_ORIGINS=["http://localhost:8000","http://localhost:8004","http://localhost:8081","chrome-extension://ojcimmjndnlmmlgnjaeojoebaceokpdp"] - ALLOWED_ORIGINS=["http://localhost:8000","http://localhost:8004","http://localhost:8081","chrome-extension://ojcimmjndnlmmlgnjaeojoebaceokpdp"]
- USER_GROUPS_FILENAME=/aa-api/app/user-groups.dev.yaml - USER_GROUPS_FILENAME=/aa-api/app/user-groups.dev.yaml
- DATABASE_PATH=sqlite:////aa-api/database/auto-archiver.db - DATABASE_PATH=sqlite:////aa-api/database/auto-archiver.db
worker: worker:
# command: watchmedo auto-restart --patterns="*.py" --recursive --ignore-directories -- celery -- --app=app.worker.main.celery worker -Q high_priority,low_priority --concurrency=${CONCURRENCY} --max-tasks-per-child=100 # command: watchmedo auto-restart --patterns="*.py" --recursive --ignore-directories -- celery -- --app=app.worker.main.celery worker -Q high_priority,low_priority --concurrency=${CONCURRENCY} --max-tasks-per-child=100

View File

@@ -5,7 +5,7 @@ volumes:
name: "auto-archiver-api" name: "auto-archiver-api"
services: services:
web: web:
build: build:
context: . context: .
dockerfile: web.Dockerfile dockerfile: web.Dockerfile
restart: always restart: always
@@ -29,7 +29,7 @@ services:
retries: 3 retries: 3
worker: worker:
build: build:
context: . context: .
dockerfile: worker.Dockerfile dockerfile: worker.Dockerfile
restart: always restart: always
@@ -68,4 +68,4 @@ services:
test: ["CMD", "redis-cli", "-a", "${REDIS_PASSWORD}", "ping"] test: ["CMD", "redis-cli", "-a", "${REDIS_PASSWORD}", "ping"]
interval: 30s interval: 30s
timeout: 10s timeout: 10s
retries: 3 retries: 3

129
poetry.lock generated
View File

@@ -616,6 +616,18 @@ files = [
[package.dependencies] [package.dependencies]
pycparser = "*" pycparser = "*"
[[package]]
name = "cfgv"
version = "3.4.0"
description = "Validate configuration and produce human readable error messages."
optional = false
python-versions = ">=3.8"
groups = ["dev"]
files = [
{file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"},
{file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
]
[[package]] [[package]]
name = "charset-normalizer" name = "charset-normalizer"
version = "3.4.1" version = "3.4.1"
@@ -959,6 +971,18 @@ calendars = ["convertdate (>=2.2.1)", "hijridate"]
fasttext = ["fasttext (>=0.9.1)", "numpy (>=1.19.3,<2)"] fasttext = ["fasttext (>=0.9.1)", "numpy (>=1.19.3,<2)"]
langdetect = ["langdetect (>=1.0.0)"] langdetect = ["langdetect (>=1.0.0)"]
[[package]]
name = "distlib"
version = "0.3.9"
description = "Distribution utilities"
optional = false
python-versions = "*"
groups = ["dev"]
files = [
{file = "distlib-0.3.9-py2.py3-none-any.whl", hash = "sha256:47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87"},
{file = "distlib-0.3.9.tar.gz", hash = "sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403"},
]
[[package]] [[package]]
name = "dnspython" name = "dnspython"
version = "2.7.0" version = "2.7.0"
@@ -1097,6 +1121,23 @@ future = "*"
[package.extras] [package.extras]
dev = ["Sphinx (==2.1.0)", "future (==0.17.1)", "numpy (==1.16.4)", "pytest (==4.6.1)", "pytest-mock (==1.10.4)", "tox (==3.12.1)"] dev = ["Sphinx (==2.1.0)", "future (==0.17.1)", "numpy (==1.16.4)", "pytest (==4.6.1)", "pytest-mock (==1.10.4)", "tox (==3.12.1)"]
[[package]]
name = "filelock"
version = "3.17.0"
description = "A platform independent file lock."
optional = false
python-versions = ">=3.9"
groups = ["dev"]
files = [
{file = "filelock-3.17.0-py3-none-any.whl", hash = "sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338"},
{file = "filelock-3.17.0.tar.gz", hash = "sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e"},
]
[package.extras]
docs = ["furo (>=2024.8.6)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"]
testing = ["covdefaults (>=2.3)", "coverage (>=7.6.10)", "diff-cover (>=9.2.1)", "pytest (>=8.3.4)", "pytest-asyncio (>=0.25.2)", "pytest-cov (>=6)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.28.1)"]
typing = ["typing-extensions (>=4.12.2) ; python_version < \"3.11\""]
[[package]] [[package]]
name = "future" name = "future"
version = "1.0.0" version = "1.0.0"
@@ -1409,6 +1450,21 @@ http2 = ["h2 (>=3,<5)"]
socks = ["socksio (==1.*)"] socks = ["socksio (==1.*)"]
zstd = ["zstandard (>=0.18.0)"] zstd = ["zstandard (>=0.18.0)"]
[[package]]
name = "identify"
version = "2.6.8"
description = "File identification library for Python"
optional = false
python-versions = ">=3.9"
groups = ["dev"]
files = [
{file = "identify-2.6.8-py2.py3-none-any.whl", hash = "sha256:83657f0f766a3c8d0eaea16d4ef42494b39b34629a4b3192a9d020d349b3e255"},
{file = "identify-2.6.8.tar.gz", hash = "sha256:61491417ea2c0c5c670484fd8abbb34de34cdae1e5f39a73ee65e48e4bb663fc"},
]
[package.extras]
license = ["ukkonen"]
[[package]] [[package]]
name = "idna" name = "idna"
version = "3.10" version = "3.10"
@@ -1724,6 +1780,18 @@ files = [
{file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
] ]
[[package]]
name = "nodeenv"
version = "1.9.1"
description = "Node.js virtual environment builder"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
groups = ["dev"]
files = [
{file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"},
{file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"},
]
[[package]] [[package]]
name = "numpy" name = "numpy"
version = "2.1.3" version = "2.1.3"
@@ -1981,6 +2049,23 @@ tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "ole
typing = ["typing-extensions ; python_version < \"3.10\""] typing = ["typing-extensions ; python_version < \"3.10\""]
xmp = ["defusedxml"] xmp = ["defusedxml"]
[[package]]
name = "platformdirs"
version = "4.3.6"
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`."
optional = false
python-versions = ">=3.8"
groups = ["dev"]
files = [
{file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"},
{file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"},
]
[package.extras]
docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"]
test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"]
type = ["mypy (>=1.11.2)"]
[[package]] [[package]]
name = "pluggy" name = "pluggy"
version = "1.5.0" version = "1.5.0"
@@ -1997,6 +2082,25 @@ files = [
dev = ["pre-commit", "tox"] dev = ["pre-commit", "tox"]
testing = ["pytest", "pytest-benchmark"] testing = ["pytest", "pytest-benchmark"]
[[package]]
name = "pre-commit"
version = "4.1.0"
description = "A framework for managing and maintaining multi-language pre-commit hooks."
optional = false
python-versions = ">=3.9"
groups = ["dev"]
files = [
{file = "pre_commit-4.1.0-py2.py3-none-any.whl", hash = "sha256:d29e7cb346295bcc1cc75fc3e92e343495e3ea0196c9ec6ba53f49f10ab6ae7b"},
{file = "pre_commit-4.1.0.tar.gz", hash = "sha256:ae3f018575a588e30dfddfab9a05448bfbd6b73d78709617b5a2b853549716d4"},
]
[package.dependencies]
cfgv = ">=2.0.0"
identify = ">=1.0.0"
nodeenv = ">=0.11.1"
pyyaml = ">=5.1"
virtualenv = ">=20.10.0"
[[package]] [[package]]
name = "prometheus-client" name = "prometheus-client"
version = "0.21.1" version = "0.21.1"
@@ -2557,7 +2661,7 @@ version = "6.0.2"
description = "YAML parser and emitter for Python" description = "YAML parser and emitter for Python"
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
groups = ["web"] groups = ["dev", "web"]
files = [ files = [
{file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"},
{file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"},
@@ -3415,6 +3519,27 @@ files = [
{file = "vine-5.1.0.tar.gz", hash = "sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0"}, {file = "vine-5.1.0.tar.gz", hash = "sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0"},
] ]
[[package]]
name = "virtualenv"
version = "20.29.2"
description = "Virtual Python Environment builder"
optional = false
python-versions = ">=3.8"
groups = ["dev"]
files = [
{file = "virtualenv-20.29.2-py3-none-any.whl", hash = "sha256:febddfc3d1ea571bdb1dc0f98d7b45d24def7428214d4fb73cc486c9568cce6a"},
{file = "virtualenv-20.29.2.tar.gz", hash = "sha256:fdaabebf6d03b5ba83ae0a02cfe96f48a716f4fae556461d180825866f75b728"},
]
[package.dependencies]
distlib = ">=0.3.7,<1"
filelock = ">=3.12.2,<4"
platformdirs = ">=3.9.1,<5"
[package.extras]
docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8) ; platform_python_implementation == \"PyPy\" or platform_python_implementation == \"CPython\" and sys_platform == \"win32\" and python_version >= \"3.13\"", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10) ; platform_python_implementation == \"CPython\""]
[[package]] [[package]]
name = "vk-api" name = "vk-api"
version = "11.9.9" version = "11.9.9"
@@ -3688,4 +3813,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
[metadata] [metadata]
lock-version = "2.1" lock-version = "2.1"
python-versions = ">=3.10,<3.13" python-versions = ">=3.10,<3.13"
content-hash = "11d734f2ee32206214a7ecb8dc3ec8d19a7b6281ee98b509a5bb8bdb647c674a" content-hash = "c4a5c50ac109c9912992ca86d2b5ec712c6bcfc84838bf42f90208b02cc27b3c"

View File

@@ -52,4 +52,4 @@ pytest = ">=8.3.4,<9.0.0"
httpx = ">=0.28.1,<0.29.0" httpx = ">=0.28.1,<0.29.0"
coverage = ">=7.6.11,<8.0.0" coverage = ">=7.6.11,<8.0.0"
pytest-asyncio = ">=0.25.3,<0.26.0" pytest-asyncio = ">=0.25.3,<0.26.0"
pre-commit = "^4.1.0"

View File

@@ -59,4 +59,3 @@ groups:
permissions: permissions:
read: ["default"] read: ["default"]
read_public: true read_public: true

View File

@@ -30,4 +30,4 @@ COPY alembic.ini ./
COPY ./app/ ./app/ COPY ./app/ ./app/
COPY user-groups.* ./app/ COPY user-groups.* ./app/
ENTRYPOINT ["./poetry-venv/bin/poetry", "run"] ENTRYPOINT ["./poetry-venv/bin/poetry", "run"]