This commit is contained in:
msramalho
2023-02-27 12:00:38 +01:00
parent f39561f559
commit 39da4e4eb7
7 changed files with 50 additions and 18 deletions

3
.example.env Normal file
View File

@@ -0,0 +1,3 @@
FLOWER_USERNAME=TODO
FLOWER_PASSWORD=TODO
REDIS_PASSWORD=TODO

View File

@@ -23,5 +23,7 @@ Copy `.env` and `src/.env` to deployment, along with the contents of `secrets/`
Then `docker compose up -d`. Then `docker compose up -d`.
#### updating packages/app #### updating packages/app/access
If pipenv packages are updated: `pipenv lock --requirements -r > requirements.txt` (manually comment line `-i https://pypi.org/simple`) and then `docker compose down` + `docker compose up --build -d` to build images with new packages. If pipenv packages are updated: `pipenv lock --requirements -r > requirements.txt` (manually comment line `-i https://pypi.org/simple`) and then `docker compose down` + `docker compose up --build -d` to build images with new packages.
New users should be added to the comma-separated `ALLOWED_EMAILS` variable in the `src/.env` file.

View File

@@ -10,6 +10,7 @@ services:
command: uvicorn main:app --host 0.0.0.0 --reload command: uvicorn main:app --host 0.0.0.0 --reload
volumes: volumes:
- ./src:/usr/src/app - ./src:/usr/src/app
env_file: src/.env
environment: environment:
- CELERY_BROKER_URL=redis://:${REDIS_PASSWORD}@redis:6379/0 - CELERY_BROKER_URL=redis://:${REDIS_PASSWORD}@redis:6379/0
- CELERY_RESULT_BACKEND=redis://:${REDIS_PASSWORD}@redis:6379/0 - CELERY_RESULT_BACKEND=redis://:${REDIS_PASSWORD}@redis:6379/0
@@ -22,6 +23,7 @@ services:
command: celery worker --app=worker.celery --loglevel=info --logfile=logs/celery.log command: celery worker --app=worker.celery --loglevel=info --logfile=logs/celery.log
volumes: volumes:
- ./src:/usr/src/app - ./src:/usr/src/app
env_file: src/.env
environment: environment:
- CELERY_BROKER_URL=redis://:${REDIS_PASSWORD}@redis:6379/0 - CELERY_BROKER_URL=redis://:${REDIS_PASSWORD}@redis:6379/0
- CELERY_RESULT_BACKEND=redis://:${REDIS_PASSWORD}@redis:6379/0 - CELERY_RESULT_BACKEND=redis://:${REDIS_PASSWORD}@redis:6379/0

6
src/.example.env Normal file
View File

@@ -0,0 +1,6 @@
GOOGLE_CHROME_APP_ID=0000000000000000000000000000000000.apps.googleusercontent.com
ALLOWED_EMAILS=email1,email2
ORCHESTRATION_CONFIG_DEFAULT=secrets/orchestration.yaml
# optional
# ORCHESTRATION_CONFIG_BELLINGCAT=secrets/orchestration-bcat.yaml

View File

@@ -26,7 +26,7 @@ def create_task(db: Session, task: schemas.TaskCreate):
db.refresh(db_task) db.refresh(db_task)
return db_task return db_task
# TODO: implement soft delete so that S3 content can still be found and is not left dangling
def delete_task(db: Session, task_id: str, email:str)->bool: def delete_task(db: Session, task_id: str, email:str)->bool:
db_task = db.query(models.Task).filter(models.Task.id == task_id, models.Task.author==email).first() db_task = db.query(models.Task).filter(models.Task.id == task_id, models.Task.author==email).first()
if db_task: if db_task:

View File

@@ -7,7 +7,7 @@ from fastapi.middleware.cors import CORSMiddleware
# from fastapi.templating import Jinja2Templates # from fastapi.templating import Jinja2Templates
# from pydantic.json import pydantic_encoder # from pydantic.json import pydantic_encoder
from dotenv import load_dotenv from dotenv import load_dotenv
import traceback, os, requests import traceback, os, requests, re
from loguru import logger from loguru import logger
from worker import create_archive_task, celery from worker import create_archive_task, celery
@@ -26,7 +26,7 @@ assert len(GOOGLE_CHROME_APP_ID)>10, "GOOGLE_CHROME_APP_ID env variable not set"
ALLOWED_EMAILS = set(os.environ.get("ALLOWED_EMAILS", "").split(",")) ALLOWED_EMAILS = set(os.environ.get("ALLOWED_EMAILS", "").split(","))
assert len(GOOGLE_CHROME_APP_ID)>=1, "at least one ALLOWED_EMAILS is required from the env variable" assert len(GOOGLE_CHROME_APP_ID)>=1, "at least one ALLOWED_EMAILS is required from the env variable"
ALLOWED_ORIGINS = os.environ.get("ALLOWED_ORIGINS", "chrome-extension://ondkcheoicfckabcnkdgbepofpjmjcmb,chrome-extension://ojcimmjndnlmmlgnjaeojoebaceokpdp").split(",") ALLOWED_ORIGINS = os.environ.get("ALLOWED_ORIGINS", "chrome-extension://ondkcheoicfckabcnkdgbepofpjmjcmb,chrome-extension://ojcimmjndnlmmlgnjaeojoebaceokpdp").split(",")
VERSION = "0.1.4" VERSION = "0.1.5"
app = FastAPI() app = FastAPI()
app.add_middleware( app.add_middleware(
@@ -119,8 +119,9 @@ def authenticate_user(access_token):
j = r.json() j = r.json()
if j.get("azp") != GOOGLE_CHROME_APP_ID and j.get("aud")!=GOOGLE_CHROME_APP_ID: if j.get("azp") != GOOGLE_CHROME_APP_ID and j.get("aud")!=GOOGLE_CHROME_APP_ID:
return False, f"token does not belong to correct APP_ID" return False, f"token does not belong to correct APP_ID"
if j.get("email") not in ALLOWED_EMAILS: # if j.get("email") not in ALLOWED_EMAILS:
return False, f"email '{j.get('email')}' not in ALLOWED" if not custom_is_email_allowed(j.get("email"), any_bellingcat_email=True):
return False, f"email '{j.get('email')}' not allowed"
if j.get("email_verified") != "true": if j.get("email_verified") != "true":
return False, f"email '{j.get('email')}' not verified" return False, f"email '{j.get('email')}' not verified"
if int(j.get("expires_in", -1)) <= 0: if int(j.get("expires_in", -1)) <= 0:
@@ -130,6 +131,9 @@ def authenticate_user(access_token):
logger.warning(f"EXCEPTION occurred: {e}") logger.warning(f"EXCEPTION occurred: {e}")
return False, f"EXCEPTION occurred" return False, f"EXCEPTION occurred"
def custom_is_email_allowed(email, any_bellingcat_email=False):
    """Return True if `email` is permitted to use the API, else False.

    An email is allowed when it is listed in ALLOWED_EMAILS or, if
    `any_bellingcat_email` is True, when it is an address on the
    bellingcat.com domain (local part made of word characters and dots).

    Args:
        email: the address to check; a missing (None), non-string or empty
            value is rejected instead of raising.
        any_bellingcat_email: when True, any `<name>@bellingcat.com` address
            is accepted even if it is not listed in ALLOWED_EMAILS.

    Returns:
        bool: always a real boolean, never a regex match object.
    """
    # Reject None / non-strings up front: re would raise TypeError on them.
    # Reject "" as well: ALLOWED_EMAILS is built with "".split(","), so it can
    # contain an empty entry when the env variable is unset or has a stray comma.
    if not isinstance(email, str) or not email:
        return False
    if email in ALLOWED_EMAILS:
        return True
    if not any_bellingcat_email:
        return False
    # fullmatch instead of match + `$`: `$` also matches just before a
    # trailing newline, which would let "user@bellingcat.com\n" through.
    return re.fullmatch(r'[\w.]+@bellingcat\.com', email) is not None
def validate_user_get_email(access_token): def validate_user_get_email(access_token):
valid_user, info = authenticate_user(access_token) valid_user, info = authenticate_user(access_token)
if valid_user != True: if valid_user != True:

View File

@@ -1,19 +1,16 @@
import os import os, re
from celery import Celery from celery import Celery
from dataclasses import asdict
from auto_archiver import Config, ArchivingOrchestrator, Metadata from auto_archiver import Config, ArchivingOrchestrator, Metadata
from auto_archiver.enrichers import ScreenshotEnricher # from auto_archiver.enrichers import ScreenshotEnricher
from loguru import logger from loguru import logger
from db import crud, models, schemas from db import crud, schemas
from db.database import engine, SessionLocal from db.database import engine, SessionLocal
from contextlib import contextmanager from contextlib import contextmanager
import json import json
# models.Base.metadata.create_all(bind=engine)
celery = Celery(__name__) celery = Celery(__name__)
celery.conf.broker_url = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379") celery.conf.broker_url = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379")
celery.conf.result_backend = os.environ.get("CELERY_RESULT_BACKEND", "redis://localhost:6379") celery.conf.result_backend = os.environ.get("CELERY_RESULT_BACKEND", "redis://localhost:6379")
@@ -24,17 +21,35 @@ def get_db():
try: yield session try: yield session
finally: session.close() finally: session.close()
config = Config() config_default = Config()
config.parse(use_cli=False, yaml_config_filename="secrets/orchestration.yaml") config_default.parse(use_cli=False, yaml_config_filename=os.environ.get("ORCHESTRATION_CONFIG_DEFAULT", "secrets/orchestration.yaml"))
orchestrator = None
config_bcat = None
if (config_bcat_file := os.environ.get("ORCHESTRATION_CONFIG_BELLINGCAT")):
config_bcat = Config()
config_bcat.parse(use_cli=False, yaml_config_filename=config_bcat_file)
orchestrators = {"bellingcat": None, "default": None}
@celery.task(name="create_archive_task", bind=True) @celery.task(name="create_archive_task", bind=True)
def create_archive_task(self, url: str, email:str=""): def create_archive_task(self, url: str, email:str=""):
assert type(url)==str and len(url)>5, f"Invalid URL received: {url}" assert type(url)==str and len(url)>5, f"Invalid URL received: {url}"
global orchestrator orchestrator = choose_orchestrator(email)
if not orchestrator: orchestrator = ArchivingOrchestrator(config)
result = orchestrator.feed_item(Metadata().set_url(url)).to_json() result = orchestrator.feed_item(Metadata().set_url(url)).to_json()
with get_db() as session: with get_db() as session:
db_task = crud.create_task(session, task=schemas.TaskCreate(id=self.request.id, url=url, author=email, result=json.loads(result))) db_task = crud.create_task(session, task=schemas.TaskCreate(id=self.request.id, url=url, author=email, result=json.loads(result)))
logger.debug(f"Added {db_task.id=} to database on {db_task.created_at}") logger.debug(f"Added {db_task.id=} to database on {db_task.created_at}")
return result return result
def choose_orchestrator(email):
    """Return the ArchivingOrchestrator that should handle a task for `email`.

    A bellingcat.com address gets the bellingcat orchestrator, but only when
    a bellingcat config was loaded (`config_bcat` is truthy); every other
    case falls back to the default orchestrator. Orchestrators are created
    on first use and kept in the module-level `orchestrators` dict so later
    calls reuse the same instance.
    """
    is_bcat = re.match(r'^[\w.]+@bellingcat\.com$', email) and config_bcat
    if is_bcat:
        slot, selected_config = "bellingcat", config_bcat
        logger.debug("Using bellingcat config for orchestration")
    else:
        slot, selected_config = "default", config_default
        logger.debug("Using default config for orchestration")

    # Lazily build the orchestrator for this slot the first time it is needed.
    if not orchestrators[slot]:
        orchestrators[slot] = ArchivingOrchestrator(selected_config)
    return orchestrators[slot]