From fbfebd467192b467056cb4960235345772047ffb Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 12 Feb 2025 00:02:08 +0000 Subject: [PATCH] improves example files --- .env.example | 35 ++++++++++++++++++++ .env.test | 3 +- .example.env | 9 ------ app/shared/settings.py | 33 +++++++++---------- app/user-groups.example.yaml | 62 ++++++++++++++++++++++++++++++++++++ app/web/main.py | 2 +- database/.gitkeep | 0 docker-compose.yml | 4 +-- 8 files changed, 116 insertions(+), 32 deletions(-) create mode 100644 .env.example delete mode 100644 .example.env create mode 100644 app/user-groups.example.yaml create mode 100644 database/.gitkeep diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..46cc8cc --- /dev/null +++ b/.env.example @@ -0,0 +1,35 @@ +# main settings +USER_GROUPS_FILENAME=app/user-groups.yaml +# database +DATABASE_PATH="sqlite:///./database/auto-archiver.db" +DATABASE_QUERY_LIMIT=100 + +# security settings +API_BEARER_TOKEN=TODO-MODIFY-THIS-API-TOKEN +ALLOWED_ORIGINS='["http://localhost:8000","http://localhost:8004","http://localhost:8081","https://auto-archiver.bellingcat.com"]' +CHROME_APP_IDS='["PROJECT_ID.apps.googleusercontent.com"]' +BLOCKED_EMAILS='[]' +# redis configuration +REDIS_PASSWORD=TODO-MODIFY-THIS-REDIS-PASSWORD +REDIS_HOSTNAME="localhost" + +# cronjobs management, enable as needed +CRON_ARCHIVE_SHEETS=true +CRON_DELETE_STALE_SHEETS=true +DELETE_STALE_SHEETS_DAYS=7 +CRON_DELETE_SCHEDULED_ARCHIVES=false +DELETE_SCHEDULED_ARCHIVES_NOTIFY_DAYS=14 + +# observability for prometheus +REPEAT_COUNT_METRICS_SECONDS=30 + +# mail service settings, if you want to email users +MAIL_FROM="noreply@auto-archiver.com" +MAIL_FROM_NAME="My Auto Archiver deployment" +MAIL_USERNAME="USERNAME" +MAIL_PASSWORD="PASSWORD" +MAIL_SERVER="mail.server.com" +MAIL_PORT=587 +MAIL_STARTTLS=False +MAIL_SSL_TLS=True + diff --git a/.env.test b/.env.test index f7da607..32318f0 100644 --- a/.env.test +++ 
b/.env.test @@ -5,5 +5,4 @@ BLOCKED_EMAILS='["blocked@example.com"]' DATABASE_PATH="sqlite:///auto-archiver.test.db" API_BEARER_TOKEN=this_is_the_test_api_token -USER_GROUPS_FILENAME=app/tests/user-groups.test.yaml -SHEET_ORCHESTRATION_YAML=app/tests/orchestration.test.yaml \ No newline at end of file +USER_GROUPS_FILENAME=app/tests/user-groups.test.yaml \ No newline at end of file diff --git a/.example.env b/.example.env deleted file mode 100644 index b21cf10..0000000 --- a/.example.env +++ /dev/null @@ -1,9 +0,0 @@ -REDIS_PASSWORD=TODO - -DATABASE_PATH="sqlite:///./database/auto-archiver.db" -USER_GROUPS_FILENAME=app/user-groups.yaml -CHROME_APP_IDS=000000000000000000000000000000000000000000000.apps.googleusercontent.com,000000000000000000000000000000000000000000001.apps.googleusercontent.com -#ALLOWED_ORIGINS="http://localhost:8004" # dev only - - -API_BEARER_TOKEN=TODO \ No newline at end of file diff --git a/app/shared/settings.py b/app/shared/settings.py index 039f4fd..da277f2 100644 --- a/app/shared/settings.py +++ b/app/shared/settings.py @@ -13,16 +13,7 @@ class Settings(BaseSettings): # general SERVE_LOCAL_ARCHIVE: str = "" - USER_GROUPS_FILENAME: str = "user-groups.yaml" - SHEET_ORCHESTRATION_YAML : str = "secrets/orchestration-sheet.yaml" - - # cronjobs - #TODO: disable by default? 
- CRON_ARCHIVE_SHEETS: bool = False - CRON_DELETE_STALE_SHEETS: bool = True - DELETE_STALE_SHEETS_DAYS: int = 14 - CRON_DELETE_SCHEDULED_ARCHIVES: bool = True - DELETE_SCHEDULED_ARCHIVES_NOTIFY_DAYS: int = 14 + USER_GROUPS_FILENAME: str = "app/user-groups.yaml" # database DATABASE_PATH: str @@ -31,26 +22,32 @@ class Settings(BaseSettings): def ASYNC_DATABASE_PATH(self) -> str: return self.DATABASE_PATH.replace("sqlite://", "sqlite+aiosqlite://") + # security + API_BEARER_TOKEN: Annotated[str, Len(min_length=20)] + ALLOWED_ORIGINS: Annotated[Set[str], Len(min_length=1)] + CHROME_APP_IDS: Annotated[Set[Annotated[str, Len(min_length=10)]], Len(min_length=1)] + BLOCKED_EMAILS: Annotated[Set[str], Len(min_length=0)] = set() + # redis REDIS_PASSWORD: str = "" REDIS_HOSTNAME: str = "localhost" + REDIS_EXCEPTIONS_CHANNEL: str = "exceptions-channel" @property def CELERY_BROKER_URL(self)-> str: if self.REDIS_PASSWORD: return f"redis://:{self.REDIS_PASSWORD}@{self.REDIS_HOSTNAME}:6379" return f"redis://{self.REDIS_HOSTNAME}:6379" - REDIS_EXCEPTIONS_CHANNEL: str = "exceptions-channel" + + # cronjobs + CRON_ARCHIVE_SHEETS: bool = False + CRON_DELETE_STALE_SHEETS: bool = False + DELETE_STALE_SHEETS_DAYS: int = 14 + CRON_DELETE_SCHEDULED_ARCHIVES: bool = False + DELETE_SCHEDULED_ARCHIVES_NOTIFY_DAYS: int = 14 # observability REPEAT_COUNT_METRICS_SECONDS: int = 30 - # security - API_BEARER_TOKEN: Annotated[str, Len(min_length=20)] - ALLOWED_ORIGINS: Annotated[Set[str], Len(min_length=1)] - CHROME_APP_IDS: Annotated[Set[Annotated[str, Len(min_length=10)]], Len(min_length=1)] - #TODO: deprecate blocklist? 
- BLOCKED_EMAILS: Annotated[Set[str], Len(min_length=0)] = set() - # email configuration, if needed MAIL_FROM: str = "noreply@bellingcat.com" MAIL_FROM_NAME: str = "Bellingcat's Auto Archiver" diff --git a/app/user-groups.example.yaml b/app/user-groups.example.yaml new file mode 100644 index 0000000..ec67f86 --- /dev/null +++ b/app/user-groups.example.yaml @@ -0,0 +1,62 @@ +# NOTE: all emails should be lower-cased +users: + user01@example.com: + - group1 + user02@example.com: + - group2 + user03@example.com: + - group1 + - group2 + +domains: + example.com: + - group-for-friends + gmail-example.com: + - group1 + + +groups: + group1: + description: "Group 1 which can do everything, no limits" + orchestrator: secrets/orchestration.group1.yaml + orchestrator_sheet: secrets/orchestration.group1-sheet.yaml + permissions: + read: ["all"] + archive_url: true + archive_sheet: true + sheet_frequency: ["hourly", "daily"] + max_sheets: -1 + max_archive_lifespan_months: -1 + max_monthly_urls: -1 + max_monthly_mbs: -1 + manually_trigger_sheet: true + group2: + description: "Group that can only archive URLs, not sheets, they can search their own group and group-for-friends archives." 
+ orchestrator: secrets/orchestration.group2.yaml + orchestrator_sheet: secrets/orchestration-group2-sheet.yaml + permissions: + read: ["group2", "group-for-friends"] + archive_url: true + max_archive_lifespan_months: 12 + max_monthly_urls: 100 + max_monthly_mbs: 1000 + group-for-friends: + description: "Friends can have one sheet only which archives once a day" + orchestrator: secrets/orchestration.friends.yaml + orchestrator_sheet: secrets/orchestration.friends-sheet.yaml + permissions: + read: ["friends-1"] + archive_sheet: true + sheet_frequency: ["daily"] + max_sheets: 1 + max_archive_lifespan_months: 12 + max_monthly_urls: 1000 + max_monthly_mbs: 1000 + default: + description: "Public access, can only search public archives" + orchestrator: secrets/orchestration-default.yaml + orchestrator_sheet: secrets/orchestration-default.yaml + permissions: + read: ["default"] + read_public: true + \ No newline at end of file diff --git a/app/web/main.py b/app/web/main.py index 2c7c9ee..3bd8962 100644 --- a/app/web/main.py +++ b/app/web/main.py @@ -58,7 +58,7 @@ def app_factory(settings = get_settings()): # prometheus exposed in /metrics with authentication Instrumentator(should_group_status_codes=False, excluded_handlers=["/metrics", "/health", "/openapi.json", "/favicon.ico"]).instrument(app).expose(app, dependencies=[Depends(token_api_key_auth)]) - # TODO: recheck this for security, currently only needed for when local_storage is used + # TODO: recheck this for security, currently only needed for when local_storage is used in development local_dir = settings.SERVE_LOCAL_ARCHIVE if not os.path.isdir(local_dir) and os.path.isdir(local_dir.replace("/app", ".")): local_dir = local_dir.replace("/app", ".") diff --git a/database/.gitkeep b/database/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/docker-compose.yml b/docker-compose.yml index 074b311..723f225 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,7 +18,7 @@ services: command: 
uvicorn app.web:app --factory --host 0.0.0.0 volumes: - ./logs:/aa-api/logs - - ./app/database:/aa-api/database + - ./database:/aa-api/database depends_on: - redis healthcheck: @@ -36,7 +36,7 @@ services: command: celery --app=app.worker.main.celery worker --loglevel=warning --logfile=/aa-api/logs/celery.log volumes: - ./logs:/aa-api/logs - - ./app/database:/aa-api/database + - ./database:/aa-api/database - /var/run/docker.sock:/var/run/docker.sock - crawls:/crawls # BROWSERTRIX_HOME_HOST:BROWSERTRIX_HOME_CONTAINER, do not change /crawls environment: