mirror of
https://github.com/bellingcat/auto-archiver-api.git
synced 2026-06-12 21:48:35 +03:00
0.1.5
This commit is contained in:
3
.example.env
Normal file
3
.example.env
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
FLOWER_USERNAME=TODO
|
||||||
|
FLOWER_PASSWORD=TODO
|
||||||
|
REDIS_PASSWORD=TODO
|
||||||
@@ -23,5 +23,7 @@ Copy `.env` and `src/.env` to deployment, along with the contents of `secrets/`
|
|||||||
|
|
||||||
Then `docker compose up -d`.
|
Then `docker compose up -d`.
|
||||||
|
|
||||||
#### updating packages/app
|
#### updating packages/app/access
|
||||||
If pipenv packages are updated: `pipenv lock --requirements -r > requirements.txt` (manually comment line `-i https://pypi.org/simple`) and then `docker compose down` + `docker compose up --build -d` to build images with new packages.
|
If pipenv packages are updated: `pipenv lock --requirements -r > requirements.txt` (manually comment line `-i https://pypi.org/simple`) and then `docker compose down` + `docker compose up --build -d` to build images with new packages.
|
||||||
|
|
||||||
|
New users should be added to the `ALLOWED_EMAILS` property in the `src/.env` file, as a comma-separated list of email addresses.
|
||||||
@@ -10,6 +10,7 @@ services:
|
|||||||
command: uvicorn main:app --host 0.0.0.0 --reload
|
command: uvicorn main:app --host 0.0.0.0 --reload
|
||||||
volumes:
|
volumes:
|
||||||
- ./src:/usr/src/app
|
- ./src:/usr/src/app
|
||||||
|
env_file: src/.env
|
||||||
environment:
|
environment:
|
||||||
- CELERY_BROKER_URL=redis://:${REDIS_PASSWORD}@redis:6379/0
|
- CELERY_BROKER_URL=redis://:${REDIS_PASSWORD}@redis:6379/0
|
||||||
- CELERY_RESULT_BACKEND=redis://:${REDIS_PASSWORD}@redis:6379/0
|
- CELERY_RESULT_BACKEND=redis://:${REDIS_PASSWORD}@redis:6379/0
|
||||||
@@ -22,6 +23,7 @@ services:
|
|||||||
command: celery worker --app=worker.celery --loglevel=info --logfile=logs/celery.log
|
command: celery worker --app=worker.celery --loglevel=info --logfile=logs/celery.log
|
||||||
volumes:
|
volumes:
|
||||||
- ./src:/usr/src/app
|
- ./src:/usr/src/app
|
||||||
|
env_file: src/.env
|
||||||
environment:
|
environment:
|
||||||
- CELERY_BROKER_URL=redis://:${REDIS_PASSWORD}@redis:6379/0
|
- CELERY_BROKER_URL=redis://:${REDIS_PASSWORD}@redis:6379/0
|
||||||
- CELERY_RESULT_BACKEND=redis://:${REDIS_PASSWORD}@redis:6379/0
|
- CELERY_RESULT_BACKEND=redis://:${REDIS_PASSWORD}@redis:6379/0
|
||||||
|
|||||||
6
src/.example.env
Normal file
6
src/.example.env
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
GOOGLE_CHROME_APP_ID=0000000000000000000000000000000000.apps.googleusercontent.com
|
||||||
|
|
||||||
|
ALLOWED_EMAILS=email1,email2
|
||||||
|
ORCHESTRATION_CONFIG_DEFAULT=secrets/orchestration.yaml
|
||||||
|
# optional
|
||||||
|
# ORCHESTRATION_CONFIG_BELLINGCAT=secrets/orchestration-bcat.yaml
|
||||||
@@ -26,7 +26,7 @@ def create_task(db: Session, task: schemas.TaskCreate):
|
|||||||
db.refresh(db_task)
|
db.refresh(db_task)
|
||||||
return db_task
|
return db_task
|
||||||
|
|
||||||
|
# TODO: implement soft delete so that S3 content can still be found and is not left dangling
|
||||||
def delete_task(db: Session, task_id: str, email:str)->bool:
|
def delete_task(db: Session, task_id: str, email:str)->bool:
|
||||||
db_task = db.query(models.Task).filter(models.Task.id == task_id, models.Task.author==email).first()
|
db_task = db.query(models.Task).filter(models.Task.id == task_id, models.Task.author==email).first()
|
||||||
if db_task:
|
if db_task:
|
||||||
|
|||||||
12
src/main.py
12
src/main.py
@@ -7,7 +7,7 @@ from fastapi.middleware.cors import CORSMiddleware
|
|||||||
# from fastapi.templating import Jinja2Templates
|
# from fastapi.templating import Jinja2Templates
|
||||||
# from pydantic.json import pydantic_encoder
|
# from pydantic.json import pydantic_encoder
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
import traceback, os, requests
|
import traceback, os, requests, re
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from worker import create_archive_task, celery
|
from worker import create_archive_task, celery
|
||||||
@@ -26,7 +26,7 @@ assert len(GOOGLE_CHROME_APP_ID)>10, "GOOGLE_CHROME_APP_ID env variable not set"
|
|||||||
ALLOWED_EMAILS = set(os.environ.get("ALLOWED_EMAILS", "").split(","))
|
ALLOWED_EMAILS = set(os.environ.get("ALLOWED_EMAILS", "").split(","))
|
||||||
assert len(GOOGLE_CHROME_APP_ID)>=1, "at least one ALLOWED_EMAILS is required from the env variable"
|
# Validate ALLOWED_EMAILS itself (not GOOGLE_CHROME_APP_ID): "".split(",") yields [""],
# so a plain length check would pass even when the variable is unset or empty.
assert any(email.strip() for email in ALLOWED_EMAILS), "at least one ALLOWED_EMAILS is required from the env variable"
|
||||||
ALLOWED_ORIGINS = os.environ.get("ALLOWED_ORIGINS", "chrome-extension://ondkcheoicfckabcnkdgbepofpjmjcmb,chrome-extension://ojcimmjndnlmmlgnjaeojoebaceokpdp").split(",")
|
ALLOWED_ORIGINS = os.environ.get("ALLOWED_ORIGINS", "chrome-extension://ondkcheoicfckabcnkdgbepofpjmjcmb,chrome-extension://ojcimmjndnlmmlgnjaeojoebaceokpdp").split(",")
|
||||||
VERSION = "0.1.4"
|
VERSION = "0.1.5"
|
||||||
|
|
||||||
app = FastAPI()
|
app = FastAPI()
|
||||||
app.add_middleware(
|
app.add_middleware(
|
||||||
@@ -119,8 +119,9 @@ def authenticate_user(access_token):
|
|||||||
j = r.json()
|
j = r.json()
|
||||||
if j.get("azp") != GOOGLE_CHROME_APP_ID and j.get("aud")!=GOOGLE_CHROME_APP_ID:
|
if j.get("azp") != GOOGLE_CHROME_APP_ID and j.get("aud")!=GOOGLE_CHROME_APP_ID:
|
||||||
return False, f"token does not belong to correct APP_ID"
|
return False, f"token does not belong to correct APP_ID"
|
||||||
if j.get("email") not in ALLOWED_EMAILS:
|
# if j.get("email") not in ALLOWED_EMAILS:
|
||||||
return False, f"email '{j.get('email')}' not in ALLOWED"
|
if not custom_is_email_allowed(j.get("email"), any_bellingcat_email=True):
|
||||||
|
return False, f"email '{j.get('email')}' not allowed"
|
||||||
if j.get("email_verified") != "true":
|
if j.get("email_verified") != "true":
|
||||||
return False, f"email '{j.get('email')}' not verified"
|
return False, f"email '{j.get('email')}' not verified"
|
||||||
if int(j.get("expires_in", -1)) <= 0:
|
if int(j.get("expires_in", -1)) <= 0:
|
||||||
@@ -130,6 +131,9 @@ def authenticate_user(access_token):
|
|||||||
logger.warning(f"EXCEPTION occurred: {e}")
|
logger.warning(f"EXCEPTION occurred: {e}")
|
||||||
return False, f"EXCEPTION occurred"
|
return False, f"EXCEPTION occurred"
|
||||||
|
|
||||||
|
def custom_is_email_allowed(email, any_bellingcat_email=False):
    """Tell whether `email` is allowed to use the API.

    An address is accepted when it is listed in ALLOWED_EMAILS or, if
    `any_bellingcat_email` is true, when it is a `<name>@bellingcat.com`
    address.

    Args:
        email: address to check; non-string values (e.g. None) are rejected.
        any_bellingcat_email: also accept any address on bellingcat.com.

    Returns:
        bool: True when the address is allowed, False otherwise.
    """
    # Reject non-string values up front instead of letting `re` raise a TypeError.
    if not isinstance(email, str):
        return False
    if email in ALLOWED_EMAILS:
        return True
    if not any_bellingcat_email:
        return False
    # fullmatch (unlike match with `$`) does not tolerate a trailing newline.
    return re.fullmatch(r'[\w.]+@bellingcat\.com', email) is not None
|
||||||
|
|
||||||
def validate_user_get_email(access_token):
|
def validate_user_get_email(access_token):
|
||||||
valid_user, info = authenticate_user(access_token)
|
valid_user, info = authenticate_user(access_token)
|
||||||
if valid_user != True:
|
if valid_user != True:
|
||||||
|
|||||||
@@ -1,19 +1,16 @@
|
|||||||
|
|
||||||
import os
|
import os, re
|
||||||
|
|
||||||
from celery import Celery
|
from celery import Celery
|
||||||
from dataclasses import asdict
|
|
||||||
from auto_archiver import Config, ArchivingOrchestrator, Metadata
|
from auto_archiver import Config, ArchivingOrchestrator, Metadata
|
||||||
from auto_archiver.enrichers import ScreenshotEnricher
|
# from auto_archiver.enrichers import ScreenshotEnricher
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from db import crud, models, schemas
|
from db import crud, schemas
|
||||||
from db.database import engine, SessionLocal
|
from db.database import engine, SessionLocal
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
import json
|
import json
|
||||||
|
|
||||||
# models.Base.metadata.create_all(bind=engine)
|
|
||||||
|
|
||||||
celery = Celery(__name__)
|
celery = Celery(__name__)
|
||||||
celery.conf.broker_url = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379")
|
celery.conf.broker_url = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379")
|
||||||
celery.conf.result_backend = os.environ.get("CELERY_RESULT_BACKEND", "redis://localhost:6379")
|
celery.conf.result_backend = os.environ.get("CELERY_RESULT_BACKEND", "redis://localhost:6379")
|
||||||
@@ -24,17 +21,35 @@ def get_db():
|
|||||||
try: yield session
|
try: yield session
|
||||||
finally: session.close()
|
finally: session.close()
|
||||||
|
|
||||||
config = Config()
|
config_default = Config()
|
||||||
config.parse(use_cli=False, yaml_config_filename="secrets/orchestration.yaml")
|
config_default.parse(use_cli=False, yaml_config_filename=os.environ.get("ORCHESTRATION_CONFIG_DEFAULT", "secrets/orchestration.yaml"))
|
||||||
orchestrator = None
|
|
||||||
|
config_bcat = None
|
||||||
|
if (config_bcat_file := os.environ.get("ORCHESTRATION_CONFIG_BELLINGCAT")):
|
||||||
|
config_bcat = Config()
|
||||||
|
config_bcat.parse(use_cli=False, yaml_config_filename=config_bcat_file)
|
||||||
|
|
||||||
|
orchestrators = {"bellingcat": None, "default": None}
|
||||||
|
|
||||||
@celery.task(name="create_archive_task", bind=True)
|
@celery.task(name="create_archive_task", bind=True)
|
||||||
def create_archive_task(self, url: str, email:str=""):
|
def create_archive_task(self, url: str, email:str=""):
|
||||||
assert type(url)==str and len(url)>5, f"Invalid URL received: {url}"
|
assert type(url)==str and len(url)>5, f"Invalid URL received: {url}"
|
||||||
global orchestrator
|
orchestrator = choose_orchestrator(email)
|
||||||
if not orchestrator: orchestrator = ArchivingOrchestrator(config)
|
|
||||||
result = orchestrator.feed_item(Metadata().set_url(url)).to_json()
|
result = orchestrator.feed_item(Metadata().set_url(url)).to_json()
|
||||||
with get_db() as session:
|
with get_db() as session:
|
||||||
db_task = crud.create_task(session, task=schemas.TaskCreate(id=self.request.id, url=url, author=email, result=json.loads(result)))
|
db_task = crud.create_task(session, task=schemas.TaskCreate(id=self.request.id, url=url, author=email, result=json.loads(result)))
|
||||||
logger.debug(f"Added {db_task.id=} to database on {db_task.created_at}")
|
logger.debug(f"Added {db_task.id=} to database on {db_task.created_at}")
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def choose_orchestrator(email):
    """Return the orchestrator to use for a task submitted by `email`.

    Orchestrators are built on first use and cached in the module-level
    `orchestrators` dict. The "bellingcat" entry is used for
    `<name>@bellingcat.com` addresses when `config_bcat` is set; every other
    case uses the "default" entry built from `config_default`.

    Args:
        email: address of the task author; non-string values fall back to
            the default orchestrator.

    Returns:
        The cached ArchivingOrchestrator instance for the chosen configuration.
    """
    # No `global` statement needed: the module-level names are only read or
    # mutated in place, never rebound.
    # Guard the type so a None/non-string email cannot make re.match raise.
    is_bcat_email = isinstance(email, str) and re.match(r'^[\w.]+@bellingcat\.com$', email)
    if is_bcat_email and config_bcat:
        logger.debug("Using bellingcat config for orchestration")
        if not orchestrators["bellingcat"]:
            orchestrators["bellingcat"] = ArchivingOrchestrator(config_bcat)
        return orchestrators["bellingcat"]

    logger.debug("Using default config for orchestration")
    if not orchestrators["default"]:
        orchestrators["default"] = ArchivingOrchestrator(config_default)
    return orchestrators["default"]
|
||||||
Reference in New Issue
Block a user