From 39da4e4eb7c1425bf5226bf48c5af2ae45fc791a Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 27 Feb 2023 12:00:38 +0100 Subject: [PATCH] 0.1.5 --- .example.env | 3 +++ README.md | 6 ++++-- docker-compose.yml | 2 ++ src/.example.env | 6 ++++++ src/db/crud.py | 2 +- src/main.py | 12 ++++++++---- src/worker.py | 37 ++++++++++++++++++++++++++----------- 7 files changed, 50 insertions(+), 18 deletions(-) create mode 100644 .example.env create mode 100644 src/.example.env diff --git a/.example.env b/.example.env new file mode 100644 index 0000000..5e6f3c2 --- /dev/null +++ b/.example.env @@ -0,0 +1,3 @@ +FLOWER_USERNAME=TODO +FLOWER_PASSWORD=TODO +REDIS_PASSWORD=TODO \ No newline at end of file diff --git a/README.md b/README.md index 7e49dc9..47d2130 100644 --- a/README.md +++ b/README.md @@ -23,5 +23,7 @@ Copy `.env` and `src/.env` to deployment, along with the contents of `secrets/` Then `docker compose up -d`. -#### updating packages/app -If pipenv packages are updated: `pipenv lock --requirements -r > requirements.txt` (manually comment line `-i https://pypi.org/simple`) and then `docker compose down` + `docker compose up --build -d` to build images with new packages. \ No newline at end of file +#### updating packages/app/access +If pipenv packages are updated: `pipenv lock --requirements -r > requirements.txt` (manually comment line `-i https://pypi.org/simple`) and then `docker compose down` + `docker compose up --build -d` to build images with new packages. + +New users should be added to the `src/.env` file `ALLOWED_EMAILS` prop \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 04cec33..7318a02 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -10,6 +10,7 @@ services: command: uvicorn main:app --host 0.0.0.0 --reload volumes: - ./src:/usr/src/app + env_file: src/.env environment: - CELERY_BROKER_URL=redis://:${REDIS_PASSWORD}@redis:6379/0 - CELERY_RESULT_BACKEND=redis://:${REDIS_PASSWORD}@redis:6379/0 @@ -22,6 +23,7 @@ services: command: celery worker --app=worker.celery --loglevel=info --logfile=logs/celery.log volumes: - ./src:/usr/src/app + env_file: src/.env environment: - CELERY_BROKER_URL=redis://:${REDIS_PASSWORD}@redis:6379/0 - CELERY_RESULT_BACKEND=redis://:${REDIS_PASSWORD}@redis:6379/0 diff --git a/src/.example.env b/src/.example.env new file mode 100644 index 0000000..1b7f28d --- /dev/null +++ b/src/.example.env @@ -0,0 +1,6 @@ +GOOGLE_CHROME_APP_ID=0000000000000000000000000000000000.apps.googleusercontent.com + +ALLOWED_EMAILS=email1,email2 +ORCHESTRATION_CONFIG_DEFAULT=secrets/orchestration.yaml +# optional +# ORCHESTRATION_CONFIG_BELLINGCAT=secrets/orchestration-bcat.yaml \ No newline at end of file diff --git a/src/db/crud.py b/src/db/crud.py index 8ff473a..b3e4c0d 100644 --- a/src/db/crud.py +++ b/src/db/crud.py @@ -26,7 +26,7 @@ def create_task(db: Session, task: schemas.TaskCreate): db.refresh(db_task) return db_task - +# TODO: implement soft delete so that S3 content can be found ant not dangling def delete_task(db: Session, task_id: str, email:str)->bool: db_task = db.query(models.Task).filter(models.Task.id == task_id, models.Task.author==email).first() if db_task: diff --git a/src/main.py b/src/main.py index 457214c..a352be0 100644 --- a/src/main.py +++ b/src/main.py @@ -7,7 +7,7 @@ from fastapi.middleware.cors import CORSMiddleware # from fastapi.templating import Jinja2Templates # from pydantic.json import pydantic_encoder from dotenv import load_dotenv -import traceback, os, requests +import traceback, os, requests, re from loguru import logger from worker import create_archive_task, celery @@ -26,7 +26,7 @@ assert len(GOOGLE_CHROME_APP_ID)>10, "GOOGLE_CHROME_APP_ID env variable not set" ALLOWED_EMAILS = set(os.environ.get("ALLOWED_EMAILS", "").split(",")) assert len(GOOGLE_CHROME_APP_ID)>=1, "at least one ALLOWED_EMAILS is required from the env variable" ALLOWED_ORIGINS = os.environ.get("ALLOWED_ORIGINS", "chrome-extension://ondkcheoicfckabcnkdgbepofpjmjcmb,chrome-extension://ojcimmjndnlmmlgnjaeojoebaceokpdp").split(",") -VERSION = "0.1.4" +VERSION = "0.1.5" app = FastAPI() app.add_middleware( @@ -119,8 +119,9 @@ def authenticate_user(access_token): j = r.json() if j.get("azp") != GOOGLE_CHROME_APP_ID and j.get("aud")!=GOOGLE_CHROME_APP_ID: return False, f"token does not belong to correct APP_ID" - if j.get("email") not in ALLOWED_EMAILS: - return False, f"email '{j.get('email')}' not in ALLOWED" + # if j.get("email") not in ALLOWED_EMAILS: + if not custom_is_email_allowed(j.get("email"), any_bellingcat_email=True): + return False, f"email '{j.get('email')}' not allowed" if j.get("email_verified") != "true": return False, f"email '{j.get('email')}' not verified" if int(j.get("expires_in", -1)) <= 0: @@ -130,6 +131,9 @@ def authenticate_user(access_token): logger.warning(f"EXCEPTION occurred: {e}") return False, f"EXCEPTION occurred" +def custom_is_email_allowed(email, any_bellingcat_email=False): + return email in ALLOWED_EMAILS or (any_bellingcat_email and re.match(r'^[\w.]+@bellingcat\.com$', email)) + def validate_user_get_email(access_token): valid_user, info = authenticate_user(access_token) if valid_user != True: diff --git a/src/worker.py b/src/worker.py index 24ee914..1627d9a 100644 --- a/src/worker.py +++ b/src/worker.py @@ -1,19 +1,16 @@ -import os +import os, re from celery import Celery -from dataclasses import asdict from auto_archiver import Config, ArchivingOrchestrator, Metadata -from auto_archiver.enrichers import ScreenshotEnricher +# from auto_archiver.enrichers import ScreenshotEnricher from loguru import logger -from db import crud, models, schemas +from db import crud, schemas from db.database import engine, SessionLocal from contextlib import contextmanager import json -# models.Base.metadata.create_all(bind=engine) - celery = Celery(__name__) celery.conf.broker_url = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379") celery.conf.result_backend = os.environ.get("CELERY_RESULT_BACKEND", "redis://localhost:6379") @@ -24,17 +21,35 @@ def get_db(): try: yield session finally: session.close() -config = Config() -config.parse(use_cli=False, yaml_config_filename="secrets/orchestration.yaml") -orchestrator = None +config_default = Config() +config_default.parse(use_cli=False, yaml_config_filename=os.environ.get("ORCHESTRATION_CONFIG_DEFAULT", "secrets/orchestration.yaml")) + +config_bcat = None +if (config_bcat_file := os.environ.get("ORCHESTRATION_CONFIG_BELLINGCAT")): + config_bcat = Config() + config_bcat.parse(use_cli=False, yaml_config_filename=config_bcat_file) + +orchestrators = {"bellingcat": None, "default": None} @celery.task(name="create_archive_task", bind=True) def create_archive_task(self, url: str, email:str=""): assert type(url)==str and len(url)>5, f"Invalid URL received: {url}" - global orchestrator - if not orchestrator: orchestrator = ArchivingOrchestrator(config) + orchestrator = choose_orchestrator(email) result = orchestrator.feed_item(Metadata().set_url(url)).to_json() with get_db() as session: db_task = crud.create_task(session, task=schemas.TaskCreate(id=self.request.id, url=url, author=email, result=json.loads(result))) logger.debug(f"Added {db_task.id=} to database on {db_task.created_at}") return result + + +def choose_orchestrator(email): + global orchestrators, config_bcat + if re.match(r'^[\w.]+@bellingcat\.com$', email) and config_bcat: + logger.debug("Using bellingcat config for orchestration") + if not orchestrators["bellingcat"]: + orchestrators["bellingcat"] = ArchivingOrchestrator(config_bcat) + return orchestrators["bellingcat"] + logger.debug("Using default config for orchestration") + if not orchestrators["default"]: + orchestrators["default"] = ArchivingOrchestrator(config_default) + return orchestrators["default"] \ No newline at end of file