Merge pull request #3 from bellingcat/sheet-endpoint

This commit is contained in:
Miguel Sozinho Ramalho
2023-05-23 20:21:24 +01:00
committed by GitHub
11 changed files with 1078 additions and 662 deletions

View File

@@ -55,7 +55,13 @@ Copy `.env` and `src/.env` to deployment, along with the contents of `secrets/`
Then `docker compose up -d`. Then `docker compose up -d`.
#### updating packages/app/access #### updating packages/app/access
If pipenv packages are updated: `pipenv lock --requirements -r > requirements.txt` (manually comment line `-i https://pypi.org/simple`) and then `docker compose down` + `docker compose up --build -d` to build images with new packages. If pipenv packages are updated: `pipenv lock --requirements -r > requirements.txt` (or ` pipenv requirements > requirements.txt` depending on pipenv version) (manually comment line `-i https://pypi.org/simple`) and then `docker compose down` + `docker compose up --build -d` to build images with new packages.
New users should be added to the `src/.env` file `ALLOWED_EMAILS` prop New users should be added to the `src/.env` file `ALLOWED_EMAILS` prop
```bash
# CALL /sheet POST endpoint
curl -XPOST -H "Authorization: Bearer GOOGLE_OAUTH_TOKEN" -H "Content-type: application/json" -d '{"sheet_id": "SHEET_ID", "header": 1}' 'http://localhost:8004/sheet'
```

View File

@@ -33,6 +33,7 @@ services:
redis: redis:
image: redis:6-alpine image: redis:6-alpine
# command: redis-server /conf/redis.conf # DEV ONLY
command: redis-server /conf/redis.conf --requirepass ${REDIS_PASSWORD} command: redis-server /conf/redis.conf --requirepass ${REDIS_PASSWORD}
volumes: volumes:
- "./redis/data:/data" - "./redis/data:/data"

View File

@@ -1,4 +1,6 @@
GOOGLE_CHROME_APP_ID=0000000000000000000000000000000000.apps.googleusercontent.com GOOGLE_CHROME_APP_ID=0000000000000000000000000000000000.apps.googleusercontent.com
# for validating non-email limited JWT
GOOGLE_CHROME_APP_ID_PUBLIC=0000000000000000000000000000000000.apps.googleusercontent.com
ALLOWED_EMAILS=email1,email2 ALLOWED_EMAILS=email1,email2
ORCHESTRATION_CONFIG_DEFAULT=secrets/orchestration.yaml ORCHESTRATION_CONFIG_DEFAULT=secrets/orchestration.yaml

View File

@@ -8,7 +8,7 @@ aiofiles = "==0.6.0"
celery = "==4.4.7" celery = "==4.4.7"
fastapi = "*" fastapi = "*"
flower = "==0.9.7" flower = "==0.9.7"
jinja2 = ">=3.0.3" jinja2 = "*"
pytest = "==6.2.4" pytest = "==6.2.4"
redis = "==3.5.3" redis = "==3.5.3"
requests = ">=2.25.1" requests = ">=2.25.1"

1257
src/Pipfile.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +1,6 @@
from functools import cache from functools import cache
from sqlalchemy.orm import Session, load_only from sqlalchemy.orm import Session, load_only
from sqlalchemy import Column from sqlalchemy import Column, or_
from loguru import logger from loguru import logger
from . import models, schemas from . import models, schemas
import yaml import yaml
@@ -13,17 +13,17 @@ def get_task(db: Session, task_id: str):
def get_tasks(db: Session, skip: int = 0, limit: int = 100): def get_tasks(db: Session, skip: int = 0, limit: int = 100):
return base_query(db).offset(skip).limit(limit).all() return base_query(db).offset(skip).limit(limit).all()
def search_tasks_by_url(db: Session, url:str, skip: int = 0, limit: int = 100): def search_tasks_by_url(db: Session, url:str, email:str, skip: int = 0, limit: int = 100):
return base_query(db).filter(models.Archive.url.like(f'%{url}%')).offset(skip).limit(limit).all() groups = get_user_groups(db, email)
return base_query(db).filter(or_(models.Archive.public==True, models.Archive.author_id==email, models.Archive.group_id.in_(groups))).filter(models.Archive.url.like(f'%{url}%')).offset(skip).limit(limit).all()
def search_tasks_by_email(db: Session, email:str, skip: int = 0, limit: int = 100): def search_tasks_by_email(db: Session, email:str, skip: int = 0, limit: int = 100):
return base_query(db).filter(models.Archive.author.has(email=email)).offset(skip).limit(limit).all() return base_query(db).filter(models.Archive.author.has(email=email)).offset(skip).limit(limit).all()
def create_task(db: Session, task: schemas.ArchiveCreate, tags:list[models.Tag],urls:list[models.ArchiveUrl]): def create_task(db: Session, task: schemas.ArchiveCreate, tags:list[models.Tag],urls:list[models.ArchiveUrl]):
db_task = models.Archive(id=task.id, url=task.url, author_id=task.author_id, result=task.result, group_id=task.group_id) db_task = models.Archive(id=task.id, url=task.url, result=task.result, public=task.public, author_id=task.author_id, group_id=task.group_id)
logger.debug(tags) db_task.tags = tags
db_task.tags = tags # will this work? TODO: test if I don't call create tag before db_task.urls = urls
db_task.urls = urls # will this work to create ArchiveUrl? TODO: test
db.add(db_task) db.add(db_task)
db.commit() db.commit()
db.refresh(db_task) db.refresh(db_task)

View File

@@ -1,6 +1,7 @@
from pydantic import BaseModel from pydantic import BaseModel
from datetime import datetime from datetime import datetime
class ArchiveCreate(BaseModel): class ArchiveCreate(BaseModel):
id: str | None = None id: str | None = None
url: str url: str
@@ -8,11 +9,10 @@ class ArchiveCreate(BaseModel):
public: bool = True public: bool = True
author_id: str | None = None author_id: str | None = None
group_id: str | None = None group_id: str | None = None
tags: list = [] tags: set = set()
# urls: list = [] # urls: list = []
class Archive(ArchiveCreate): class Archive(ArchiveCreate):
created_at: datetime created_at: datetime
updated_at: datetime | None updated_at: datetime | None
@@ -22,10 +22,12 @@ class Archive(ArchiveCreate):
orm_mode = True orm_mode = True
# class TagCreate(BaseModel): class SubmitSheet(BaseModel):
# id: str sheet_name: str | None = None
sheet_id: str | None = None
# class Tag(TagCreate): header: int = 1
# created_at: datetime public: bool = False
# # class Config: author_id: str | None = None
# # orm_mode = True group_id: str | None = None
tags: set | None = set()
columns: dict | None = {} # TODO: implement

View File

@@ -10,7 +10,7 @@ from dotenv import load_dotenv
import traceback, os, logging import traceback, os, logging
from loguru import logger from loguru import logger
from worker import create_archive_task, celery from worker import create_archive_task, create_sheet_task, celery
from db import crud, models, schemas from db import crud, models, schemas
from db.database import engine, SessionLocal from db.database import engine, SessionLocal
@@ -21,11 +21,12 @@ load_dotenv()
# Configuration # Configuration
ALLOWED_ORIGINS = os.environ.get("ALLOWED_ORIGINS", "chrome-extension://ondkcheoicfckabcnkdgbepofpjmjcmb,chrome-extension://ojcimmjndnlmmlgnjaeojoebaceokpdp").split(",") ALLOWED_ORIGINS = os.environ.get("ALLOWED_ORIGINS", "chrome-extension://ondkcheoicfckabcnkdgbepofpjmjcmb,chrome-extension://ojcimmjndnlmmlgnjaeojoebaceokpdp").split(",")
VERSION = "0.4.0" VERSION = "0.5.0"
# min-version refers to the version of auto-archiver-extension on the webstore
BREAKING_CHANGES = {"minVersion": "0.3.0", "message": "The latest update has breaking changes, please update the extension to the most recent version."}
app = FastAPI() # min-version refers to the version of auto-archiver-extension on the webstore
BREAKING_CHANGES = {"minVersion": "0.3.1", "message": "The latest update has breaking changes, please update the extension to the most recent version."}
app = FastAPI(title="Auto-Archiver API", version=VERSION, contact={"name":"Bellingcat", "url":"https://github.com/bellingcat/auto-archiver-api"})
app.add_middleware( app.add_middleware(
CORSMiddleware, CORSMiddleware,
allow_origins=ALLOWED_ORIGINS, allow_origins=ALLOWED_ORIGINS,
@@ -40,6 +41,13 @@ def get_db():
try: yield session try: yield session
finally: session.close() finally: session.close()
# logging configurations
logger.add("logs/api_logs.log", retention="30 days", rotation="3 days")
@app.middleware("http")
async def logging_middleware(request: Request, call_next):
response = await call_next(request)
logger.info(f"{request.client.host}:{request.client.port} {request.method} {request.url._url} - HTTP {response.status_code}")
return response
@app.get("/") @app.get("/")
async def home(request: Request): async def home(request: Request):
@@ -53,14 +61,6 @@ async def home(request: Request):
except Exception as e: logger.error(e) except Exception as e: logger.error(e)
return JSONResponse(status) return JSONResponse(status)
# logging configurations
logger.add("logs/api_logs.log", retention="30 days", rotation="3 days")
@app.middleware("http")
async def logging_middleware(request: Request, call_next):
response = await call_next(request)
logger.info(f"{request.client.host}:{request.client.port} {request.method} {request.url._url} - HTTP {response.status_code}")
return response
# Bearer protected below # Bearer protected below
@@ -69,22 +69,17 @@ def get_user_groups(db: Session = Depends(get_db), email = Depends(get_bearer_au
return crud.get_user_groups(db, email) return crud.get_user_groups(db, email)
@app.get("/tasks/search-url", response_model=list[schemas.Archive]) @app.get("/tasks/search-url", response_model=list[schemas.Archive])
def search(url:str, skip: int = 0, limit: int = 100, db: Session = Depends(get_db), _email = Depends(get_bearer_auth)): def search(url:str, skip: int = 0, limit: int = 100, db: Session = Depends(get_db), email = Depends(get_bearer_auth)):
return crud.search_tasks_by_url(db, url, skip=skip, limit=limit) return crud.search_tasks_by_url(db, url, email, skip=skip, limit=limit)
# @app.get("/tasks/search", response_model=list[schemas.Task])
# def search(skip: int = 0, limit: int = 100, db: Session = Depends(get_db), email = Depends(get_bearer_auth)):
# return crud.get_tasks(db, skip=skip, limit=limit)
@app.get("/tasks/sync", response_model=list[schemas.Archive]) @app.get("/tasks/sync", response_model=list[schemas.Archive])
def search(skip: int = 0, limit: int = 100, db: Session = Depends(get_db), email = Depends(get_bearer_auth)): def search(skip: int = 0, limit: int = 100, db: Session = Depends(get_db), email = Depends(get_bearer_auth)):
return crud.search_tasks_by_email(db, email, skip=skip, limit=limit) return crud.search_tasks_by_email(db, email, skip=skip, limit=limit)
@app.post("/tasks", status_code=201) @app.post("/tasks", status_code=201)
def run_task(archive:schemas.ArchiveCreate, email = Depends(get_bearer_auth)): def archive_sheet(archive:schemas.ArchiveCreate, email = Depends(get_bearer_auth)):
archive.author_id = email archive.author_id = email
url = archive.url url = archive.url
logger.warning(archive)
logger.info(f"new {archive.public=} task for {email=} and {archive.group_id=}: {url}") logger.info(f"new {archive.public=} task for {email=} and {archive.group_id=}: {url}")
if type(url)!=str or len(url)<=5: if type(url)!=str or len(url)<=5:
raise HTTPException(status_code=422, detail=f"Invalid URL received: {url}") raise HTTPException(status_code=422, detail=f"Invalid URL received: {url}")
@@ -92,22 +87,10 @@ def run_task(archive:schemas.ArchiveCreate, email = Depends(get_bearer_auth)):
task = create_archive_task.delay(archive.json()) task = create_archive_task.delay(archive.json())
return JSONResponse({"id": task.id}) return JSONResponse({"id": task.id})
# @app.post("/tasks", status_code=201)
# def run_task(payload = Body(...), email = Depends(get_bearer_auth)):
# url = payload.get('url')
# public = payload.get('public', True)
# group = payload.get('group', None)
# logger.info(f"new {public=} task for {email=} and {group=}: {url}")
# if type(url)!=str or len(url)<=5:
# raise HTTPException(status_code=422, detail=f"Invalid URL received: {url}")
# task = create_archive_task.delay(url=payload.get('url'), email=email, public=public, group=group)
# return JSONResponse({"id": task.id})
@app.get("/tasks/{task_id}") @app.get("/tasks/{task_id}")
def get_status(task_id, email = Depends(get_bearer_auth)): def get_status(task_id, email = Depends(get_bearer_auth)):
logger.info(f"status check for user {email}") logger.info(f"status check for user {email}")
task_result = AsyncResult(task_id, app=celery) task_result = AsyncResult(task_id, app=celery)
logger.info(task_result)
result = { result = {
"id": task_id, "id": task_id,
"status": task_result.status, "status": task_result.status,
@@ -116,7 +99,10 @@ def get_status(task_id, email = Depends(get_bearer_auth)):
try: try:
if task_result.result and "error" in task_result.result: if task_result.result and "error" in task_result.result:
result["status"] = "FAILURE" result["status"] = "FAILURE"
except Exception as e: logger.error(traceback.format_exc()) except Exception as e:
logger.error(e)
logger.error(traceback.format_exc())
result["status"] = "FAILURE"
try: try:
json_result = jsonable_encoder(result, exclude_unset=True) json_result = jsonable_encoder(result, exclude_unset=True)
return JSONResponse(json_result) return JSONResponse(json_result)
@@ -131,13 +117,22 @@ def get_status(task_id, email = Depends(get_bearer_auth)):
@app.delete("/tasks/{task_id}") @app.delete("/tasks/{task_id}")
def get_status(task_id, db: Session = Depends(get_db), email = Depends(get_bearer_auth)): def delete_task(task_id, db: Session = Depends(get_db), email = Depends(get_bearer_auth)):
logger.info(f"deleting task {task_id} request by {email}") logger.info(f"deleting task {task_id} request by {email}")
return JSONResponse({ return JSONResponse({
"id": task_id, "id": task_id,
"deleted": crud.soft_delete_task(db, task_id, email) "deleted": crud.soft_delete_task(db, task_id, email)
}) })
@app.post("/sheet", status_code=201)
def archive_sheet(sheet:schemas.SubmitSheet, email = Depends(get_bearer_auth)):
logger.info(f"SHEET TASK for {sheet=}")
sheet.author_id = email
if not sheet.sheet_name and not sheet.sheet_id:
raise HTTPException(status_code=422, detail=f"sheet name or id is required")
task = create_sheet_task.delay(sheet.json())
return JSONResponse({"id": task.id})
# Basic protected logic to allow access to 1 static file # Basic protected logic to allow access to 1 static file
SF = os.environ.get("STATIC_FILE", "") SF = os.environ.get("STATIC_FILE", "")
if len(SF) > 1 and os.path.isfile(SF): if len(SF) > 1 and os.path.isfile(SF):

View File

@@ -1,133 +1,134 @@
#
# These requirements were autogenerated by pipenv
# To regenerate from the project's Pipfile, run:
#
# pipenv lock --requirements
#
# -i https://pypi.org/simple # -i https://pypi.org/simple
aiofiles==0.6.0 aiofiles==0.6.0
aiohttp==3.8.4 ; python_version >= '3.6'
aiosignal==1.3.1 ; python_version >= '3.7'
aiosqlite==0.19.0 aiosqlite==0.19.0
alembic==1.10.3 alembic==1.11.1
amqp==2.6.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' amqp==2.6.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
anyio==3.6.2; python_full_version >= '3.6.2' anyio==3.6.2 ; python_full_version >= '3.6.2'
argparse==1.4.0 argparse==1.4.0
async-generator==1.10; python_version >= '3.5' async-generator==1.10 ; python_version >= '3.5'
attrs==23.1.0; python_version >= '3.7' async-timeout==4.0.2 ; python_version >= '3.6'
attrs==23.1.0 ; python_version >= '3.7'
authlib==0.15.6 authlib==0.15.6
auto-archiver==0.5.6 auto-archiver==0.5.17
beautifulsoup4==4.12.2; python_version >= '3.6' beautifulsoup4==4.12.2 ; python_full_version >= '3.6.0'
billiard==3.6.4.0 billiard==3.6.4.0
boto3==1.26.115; python_version >= '3.7' blinker==1.6.2 ; python_version >= '3.7'
botocore==1.29.115; python_version >= '3.7' boto3==1.26.138 ; python_version >= '3.7'
brotli==1.0.9; platform_python_implementation == 'CPython' botocore==1.29.138 ; python_version >= '3.7'
brotli==1.0.9 ; platform_python_implementation == 'CPython'
bs4==0.0.1 bs4==0.0.1
cachetools==5.3.0; python_version ~= '3.7' cachetools==5.3.0 ; python_version ~= '3.7'
celery==4.4.7 celery==4.4.7
certifi==2022.12.7; python_version >= '3.6' certifi==2023.5.7 ; python_version >= '3.6'
cffi==1.15.1 cffi==1.15.1
charset-normalizer==3.0.1 charset-normalizer==3.1.0 ; python_full_version >= '3.7.0'
click==8.1.3; python_version >= '3.7' click==8.1.3 ; python_version >= '3.7'
cloudscraper==1.2.69 cloudscraper==1.2.71
cryptography==38.0.4; python_version >= '3.6' cryptography==38.0.4 ; python_version >= '3.6'
dataclasses-json==0.5.7; python_version >= '3.6' dataclasses-json==0.5.7 ; python_version >= '3.6'
dateparser==1.1.8; python_version >= '3.7' dateparser==1.1.8 ; python_version >= '3.7'
exceptiongroup==1.1.1; python_version < '3.11' exceptiongroup==1.1.1 ; python_version < '3.11'
fastapi==0.95.2
fastapi-utils==0.2.1 fastapi-utils==0.2.1
fastapi==0.95.1
ffmpeg-python==0.2.0 ffmpeg-python==0.2.0
filelock==3.12.0; python_version >= '3.7' filelock==3.12.0 ; python_version >= '3.7'
flask==2.2.3; python_version >= '3.7' flask==2.3.2 ; python_version >= '3.8'
flower==0.9.7 flower==0.9.7
future==0.18.3; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3' frozenlist==1.3.3 ; python_version >= '3.7'
google-api-core==2.11.0; python_version >= '3.7' future==0.18.3 ; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'
google-api-python-client==2.86.0; python_version >= '3.7' google-api-core==2.11.0 ; python_version >= '3.7'
google-api-python-client==2.86.0 ; python_version >= '3.7'
google-auth==2.18.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
google-auth-httplib2==0.1.0 google-auth-httplib2==0.1.0
google-auth-oauthlib==1.0.0; python_version >= '3.6' google-auth-oauthlib==1.0.0 ; python_version >= '3.6'
google-auth==2.17.3; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' googleapis-common-protos==1.59.0 ; python_version >= '3.7'
googleapis-common-protos==1.59.0; python_version >= '3.7' greenlet==2.0.2 ; python_version >= '3' and (platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32'))))))
greenlet==2.0.2; python_version >= '3' and platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32'))))) gspread==5.9.0 ; python_version not in '3.0, 3.1, 3.2, 3.3' and python_version >= '3.6'
gspread==5.8.0; python_version not in '3.0, 3.1, 3.2, 3.3' and python_version >= '3.6' h11==0.14.0 ; python_version >= '3.7'
h11==0.14.0; python_version >= '3.7' httpcore==0.17.2 ; python_version >= '3.7'
httplib2==0.22.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' httplib2==0.22.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
humanize==4.6.0; python_version >= '3.7' httpx==0.24.1 ; python_version >= '3.7'
idna==3.4; python_version >= '3.5' humanize==4.6.0 ; python_version >= '3.7'
iniconfig==2.0.0; python_version >= '3.7' idna==3.4 ; python_version >= '3.5'
instaloader==4.9.6; python_version >= '3.8' iniconfig==2.0.0 ; python_version >= '3.7'
itsdangerous==2.1.2; python_version >= '3.7' instaloader==4.9.6 ; python_version >= '3.8'
itsdangerous==2.1.2 ; python_version >= '3.7'
jinja2==3.1.2 jinja2==3.1.2
jmespath==1.0.1; python_version >= '3.7' jmespath==1.0.1 ; python_version >= '3.7'
kombu==4.6.11; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' kombu==4.6.11 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
loguru==0.7.0 loguru==0.7.0
lxml==4.9.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' lxml==4.9.2 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
mako==1.2.4; python_version >= '3.7' mako==1.2.4 ; python_version >= '3.7'
markdown-it-py==2.2.0; python_version >= '3.7' markdown-it-py==2.2.0 ; python_version >= '3.7'
markupsafe==2.1.2; python_version >= '3.7' markupsafe==2.1.2 ; python_version >= '3.7'
marshmallow==3.19.0 ; python_version >= '3.7'
marshmallow-enum==1.5.1 marshmallow-enum==1.5.1
marshmallow==3.19.0; python_version >= '3.7' mdurl==0.1.2 ; python_version >= '3.7'
mdurl==0.1.2; python_version >= '3.7' multidict==6.0.4 ; python_version >= '3.7'
mutagen==1.46.0; python_version >= '3.7' mutagen==1.46.0 ; python_version >= '3.7'
mypy-extensions==1.0.0; python_version >= '3.5' mypy-extensions==1.0.0 ; python_version >= '3.5'
oauth2client==4.1.3 oauth2client==4.1.3
oauthlib==3.2.2; python_version >= '3.6' oauthlib==3.2.2 ; python_version >= '3.6'
outcome==1.2.0; python_version >= '3.7' outcome==1.2.0 ; python_version >= '3.7'
packaging==23.1; python_version >= '3.7' packaging==23.1 ; python_version >= '3.7'
pluggy==0.13.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' pluggy==0.13.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
prometheus-client==0.8.0 prometheus-client==0.8.0
protobuf==4.22.3; python_version >= '3.7' protobuf==4.23.1 ; python_version >= '3.7'
py==1.11.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' py==1.11.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
pyaes==1.6.1 pyaes==1.6.1
pyasn1-modules==0.2.8 pyasn1==0.5.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
pyasn1==0.4.8 pyasn1-modules==0.3.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
pycparser==2.21 pycparser==2.21
pycryptodomex==3.17; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' pycryptodomex==3.18.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
pydantic==1.10.7; python_version >= '3.7' pydantic==1.10.7 ; python_version >= '3.7'
pygments==2.15.0; python_version >= '3.7' pygments==2.15.1 ; python_version >= '3.7'
pyparsing==3.0.9; python_version >= '3.1' pyparsing==3.0.9 ; python_version >= '3.1'
pysocks==1.7.1 pysocks==1.7.1
pytest==6.2.4 pytest==6.2.4
python-dateutil==2.8.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' python-dateutil==2.8.2 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
python-dotenv==1.0.0 python-dotenv==1.0.0
python-slugify==8.0.1; python_version >= '3.7' python-slugify==8.0.1 ; python_version >= '3.7'
python-twitter-v2==0.8.1; python_version >= '3.6' and python_version < '4' python-twitter-v2==0.8.1 ; python_version >= '3.6' and python_version < '4.0'
pytz-deprecation-shim==0.1.0.post0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
pytz==2023.3 pytz==2023.3
pyyaml==6.0; python_version >= '3.6' pyyaml==6.0 ; python_version >= '3.6'
redis==3.5.3 redis==3.5.3
regex==2023.3.23; python_version >= '3.8' regex==2023.5.5 ; python_version >= '3.6'
requests-oauthlib==1.3.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' requests==2.31.0
requests-toolbelt==0.10.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' requests-oauthlib==1.3.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
requests==2.28.2 requests-toolbelt==1.0.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
rich==13.3.4; python_version >= '3.7' rich==13.3.5 ; python_full_version >= '3.7.0'
rsa==4.9; python_version >= '3.6' and python_version < '4' rsa==4.9 ; python_version >= '3.6' and python_version < '4'
s3transfer==0.6.0; python_version >= '3.7' s3transfer==0.6.1 ; python_version >= '3.7'
selenium==4.8.3; python_version >= '3.7' selenium==4.9.1 ; python_version >= '3.7'
six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' six==1.16.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
sniffio==1.3.0; python_version >= '3.7' sniffio==1.3.0 ; python_version >= '3.7'
snscrape==0.6.2.20230320; python_version ~= '3.8' snscrape==0.6.2.20230320 ; python_version ~= '3.8'
sortedcontainers==2.4.0 sortedcontainers==2.4.0
soupsieve==2.4.1; python_version >= '3.7' soupsieve==2.4.1 ; python_version >= '3.7'
sqlalchemy==1.4.47 sqlalchemy==1.4.48
starlette==0.26.1; python_version >= '3.7' starlette==0.27.0 ; python_version >= '3.7'
telethon==1.28.5; python_version >= '3.5' telethon==1.28.5 ; python_version >= '3.5'
text-unidecode==1.3 text-unidecode==1.3
tiktok-downloader==0.3.4 tiktok-downloader==0.3.5
toml==0.10.2; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3' toml==0.10.2 ; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2'
tornado==6.3; python_full_version >= '3.5.2' tornado==6.3.2 ; python_version >= '3.5.2'
tqdm==4.65.0; python_version >= '3.7' tqdm==4.65.0 ; python_version >= '3.7'
trio-websocket==0.10.2; python_version >= '3.7' trio==0.22.0 ; python_version >= '3.7'
trio==0.22.0; python_version >= '3.7' trio-websocket==0.10.2 ; python_version >= '3.7'
typing-extensions==4.5.0; python_version >= '3.7' typing-extensions==4.6.0 ; python_version >= '3.7'
typing-inspect==0.8.0 typing-inspect==0.8.0
tzdata==2023.3; python_version >= '3.6' tzlocal==5.0.1 ; python_version >= '3.7'
tzlocal==4.3; python_version >= '3.7' uritemplate==4.1.1 ; python_version >= '3.6'
uritemplate==4.1.1; python_version >= '3.6' urllib3==1.26.16 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
urllib3==1.26.14; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' uvicorn==0.22.0
uvicorn==0.21.1 uwsgi==2.0.21
vine==1.3.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' vine==1.3.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
vk-api==11.9.9 vk-api==11.9.9
vk-url-scraper==0.3.15; python_version >= '3.7' vk-url-scraper==0.3.24 ; python_version >= '3.7'
websockets==10.4; python_version >= '3.7' websockets==11.0.3 ; python_version >= '3.7'
werkzeug==2.2.3; python_version >= '3.7' werkzeug==2.3.4 ; python_version >= '3.8'
wsproto==1.2.0; python_version >= '3.7' wsproto==1.2.0 ; python_full_version >= '3.7.0'
yt-dlp==2023.2.17; python_version >= '3.7' yarl==1.9.2 ; python_version >= '3.7'
yt-dlp==2023.3.4 ; python_version >= '3.7'

View File

@@ -4,18 +4,21 @@ from fastapi import HTTPException, status, Depends
from fastapi.security import HTTPBasic, HTTPBasicCredentials, HTTPBearer, HTTPAuthorizationCredentials from fastapi.security import HTTPBasic, HTTPBasicCredentials, HTTPBearer, HTTPAuthorizationCredentials
# Configuration # Configuration
GOOGLE_CHROME_APP_ID = os.environ.get("GOOGLE_CHROME_APP_ID") CHROME_APP_IDS = set([app_id.strip() for app_id in os.environ.get("CHROME_APP_IDS", "").split(",")])
assert len(GOOGLE_CHROME_APP_ID)>10, "GOOGLE_CHROME_APP_ID env variable not set" assert len(CHROME_APP_IDS) > 0, "CHROME_APP_IDS env variable not properly set, it's a csv"
ALLOWED_EMAILS = set([e.strip().lower() for e in os.environ.get("ALLOWED_EMAILS", "").split(",")]) for app_id in CHROME_APP_IDS:
assert len(GOOGLE_CHROME_APP_ID)>=1, "at least one ALLOWED_EMAILS is required from the env variable" assert len(app_id) > 10, f"CHROME_APP_IDS got invalid id: {app_id} env variable not set"
logger.info(f"{len(ALLOWED_EMAILS)=}") logger.info(f"{CHROME_APP_IDS=}")
BLOCKED_EMAILS = set([e.strip().lower() for e in os.environ.get("BLOCKED_EMAILS", "").split(",")])
logger.info(f"{len(BLOCKED_EMAILS)=}")
basic_security = HTTPBasic() basic_security = HTTPBasic()
bearer_security = HTTPBearer() bearer_security = HTTPBearer()
#--------------------- Bearer Auth # --------------------- Bearer Auth
async def get_bearer_auth(credentials: HTTPAuthorizationCredentials = Depends(bearer_security)): async def get_bearer_auth(credentials: HTTPAuthorizationCredentials = Depends(bearer_security)):
# validates the Bearer token in the case that it requires it # validates the Bearer token in the case that it requires it
@@ -29,17 +32,17 @@ async def get_bearer_auth(credentials: HTTPAuthorizationCredentials = Depends(be
headers={"WWW-Authenticate": "Bearer"}, headers={"WWW-Authenticate": "Bearer"},
) )
def authenticate_user(access_token): def authenticate_user(access_token):
# https://cloud.google.com/docs/authentication/token-types#access # https://cloud.google.com/docs/authentication/token-types#access
if type(access_token)!=str or len(access_token)<10: return False, "invalid access_token" if type(access_token) != str or len(access_token) < 10: return False, "invalid access_token"
r = requests.get("https://oauth2.googleapis.com/tokeninfo", {"access_token":access_token}) r = requests.get("https://oauth2.googleapis.com/tokeninfo", {"access_token": access_token})
if r.status_code!=200: return False, "error occurred" if r.status_code != 200: return False, "error occurred"
try: try:
j = r.json() j = r.json()
if j.get("azp") != GOOGLE_CHROME_APP_ID and j.get("aud")!=GOOGLE_CHROME_APP_ID: if j.get("azp") not in CHROME_APP_IDS and j.get("aud") not in CHROME_APP_IDS:
return False, f"token does not belong to correct APP_ID" return False, f"token does not belong to valid APP_ID"
# if j.get("email") not in ALLOWED_EMAILS: if j.get("email") in BLOCKED_EMAILS:
if not custom_is_email_allowed(j.get("email"), any_bellingcat_email=True):
return False, f"email '{j.get('email')}' not allowed" return False, f"email '{j.get('email')}' not allowed"
if j.get("email_verified") != "true": if j.get("email_verified") != "true":
return False, f"email '{j.get('email')}' not verified" return False, f"email '{j.get('email')}' not verified"
@@ -50,12 +53,11 @@ def authenticate_user(access_token):
logger.warning(f"EXCEPTION occurred: {e}") logger.warning(f"EXCEPTION occurred: {e}")
return False, f"EXCEPTION occurred" return False, f"EXCEPTION occurred"
def custom_is_email_allowed(email, any_bellingcat_email=False):
return email.lower() in ALLOWED_EMAILS or (any_bellingcat_email and re.match(r'^[\w.]+@bellingcat\.com$', email)) # --------------------- Basic Auth
SFP = os.environ.get("STATIC_FILE_PASSWORD", "") # min length is 20 chars
#--------------------- Basic Auth
SFP = os.environ.get("STATIC_FILE_PASSWORD", "") # min length is 20 chars
async def get_basic_auth(credentials: HTTPBasicCredentials = Depends(basic_security)): async def get_basic_auth(credentials: HTTPBasicCredentials = Depends(basic_security)):
# validates that the Basic token in the case that it requires it # validates that the Basic token in the case that it requires it
assert len(SFP) >= 20, "Invalid STATIC_FILE_PASSWORD, must be at least 20 chars" assert len(SFP) >= 20, "Invalid STATIC_FILE_PASSWORD, must be at least 20 chars"

View File

@@ -1,22 +1,21 @@
import os, re, traceback, yaml import os, traceback, yaml, datetime
from typing import List, Set
from celery import Celery, states from celery import Celery
from celery.exceptions import Ignore
from celery.signals import task_failure from celery.signals import task_failure
from auto_archiver import Config, ArchivingOrchestrator, Metadata from auto_archiver import Config, ArchivingOrchestrator, Metadata
# from auto_archiver.enrichers import ScreenshotEnricher
from loguru import logger from loguru import logger
from db import crud, schemas, models from db import crud, schemas, models
from db.database import engine, SessionLocal from db.database import SessionLocal
from contextlib import contextmanager from contextlib import contextmanager
import json import json
celery = Celery(__name__) celery = Celery(__name__)
celery.conf.broker_url = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379") celery.conf.broker_url = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379")
celery.conf.result_backend = os.environ.get("CELERY_RESULT_BACKEND", "redis://localhost:6379") celery.conf.result_backend = os.environ.get("CELERY_RESULT_BACKEND", "redis://localhost:6379")
USER_GROUPS_FILENAME=os.environ.get("USER_GROUPS_FILENAME", "user-groups.yaml") USER_GROUPS_FILENAME = os.environ.get("USER_GROUPS_FILENAME", "user-groups.yaml")
@contextmanager @contextmanager
@@ -25,38 +24,56 @@ def get_db():
try: yield session try: yield session
finally: session.close() finally: session.close()
@celery.task(name="create_archive_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 5}) @celery.task(name="create_archive_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 5})
def create_archive_task(self, archive_json: str): def create_archive_task(self, archive_json: str):
archive = schemas.ArchiveCreate.parse_raw(archive_json) archive = schemas.ArchiveCreate.parse_raw(archive_json)
if not archive.public and archive.group_id and len(archive.group_id) > 0:
# ensure group is valid for user if (em := is_group_invalid_for_user(archive.public, archive.group_id, archive.author_id)): return {"error": em}
with get_db() as session:
db_group = crud.get_group_for_user(session, archive.group_id, archive.author_id)
if not db_group:
logger.error(em := f"User {archive.author_id} is not part of {archive.group_id}, no permission")
return {"error": em}
url = archive.url url = archive.url
logger.info(f"{url=}") logger.info(f"{url=} {archive=}")
logger.info(f"{archive=}")
orchestrator = choose_orchestrator(archive.group_id, archive.author_id) orchestrator = choose_orchestrator(archive.group_id, archive.author_id)
result = orchestrator.feed_item(Metadata().set_url(url)) result = orchestrator.feed_item(Metadata().set_url(url))
if not result:
logger.error(f"UNABLE TO archive: {url}")
return {"error": "unable to archive"}
result_json = result.to_json() try:
with get_db() as session: insert_result_into_db(result, archive.tags, archive.public, archive.group_id, archive.author_id, self.request.id)
# create DB URLs except Exception as e:
db_urls = [models.ArchiveUrl(url=url, key=m.get("id", f"media_{i}")) for i, m in enumerate(result.media) for url in m.urls] logger.error(e)
# create DB TAGs if needed logger.error(traceback.format_exc())
db_tags = [crud.create_tag(session, tag) for tag in archive.tags] return {"error": e}
# insert archive return result.to_json()
db_task = crud.create_task(session, task=schemas.ArchiveCreate(id=self.request.id, url=url, result=json.loads(result_json), public=archive.public, author_id=archive.author_id, group_id=archive.group_id), tags=db_tags, urls=db_urls)
logger.debug(f"Added {db_task.id=} to database on {db_task.created_at}")
return result_json
@celery.task(name="create_sheet_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 0})
def create_sheet_task(self, sheet_json: str):
sheet = schemas.SubmitSheet.parse_raw(sheet_json)
sheet.tags.add("gsheet")
logger.info(f"SHEET START {sheet=}")
if (em := is_group_invalid_for_user(sheet.public, sheet.group_id, sheet.author_id)): return {"error": em}
config = Config()
#TODO: use choose_orchestrator and overwrite the feeder
config.parse(use_cli=False, yaml_config_filename="secrets/orchestration-sheet.yaml", overwrite_configs={"configurations": {"gsheet_feeder": {"sheet": sheet.sheet_name, "sheet_id": sheet.sheet_id, "header": sheet.header}}})
orchestrator = ArchivingOrchestrator(config)
stats = {"archived": 0, "failed": 0, "errors": []}
for result in orchestrator.feed():
try:
insert_result_into_db(result, sheet.tags, sheet.public, sheet.group_id, sheet.author_id, models.generate_uuid())
stats["archived"]+=1
except Exception as e:
logger.error(e)
logger.error(traceback.format_exc())
stats["failed"]+=1
stats["errors"].append(e)
logger.info(f"SHEET DONE {sheet=}")
return {"success": True, "sheet": sheet.sheet_name, "sheet_id": sheet.sheet_id, "time": datetime.datetime.now().isoformat(), **stats}
@task_failure.connect(sender=create_sheet_task)
@task_failure.connect(sender=create_archive_task) @task_failure.connect(sender=create_archive_task)
def task_failure_notifier(sender=None, **kwargs): def task_failure_notifier(sender=None, **kwargs):
logger.warning("😅 From task_failure_notifier ==> Task failed successfully! ") logger.warning("😅 From task_failure_notifier ==> Task failed successfully! ")
@@ -64,6 +81,7 @@ def task_failure_notifier(sender=None, **kwargs):
logger.error(kwargs['traceback']) logger.error(kwargs['traceback'])
logger.error("\n".join(traceback.format_list(traceback.extract_tb(kwargs['traceback'])))) logger.error("\n".join(traceback.format_list(traceback.extract_tb(kwargs['traceback']))))
def choose_orchestrator(group, email): def choose_orchestrator(group, email):
global ORCHESTRATORS global ORCHESTRATORS
if group not in ORCHESTRATORS: group = get_user_first_group(email) if group not in ORCHESTRATORS: group = get_user_first_group(email)
@@ -71,6 +89,7 @@ def choose_orchestrator(group, email):
logger.info(f"CHOOSE Orchestrator for {group=}, {email=}") logger.info(f"CHOOSE Orchestrator for {group=}, {email=}")
return ArchivingOrchestrator(ORCHESTRATORS.get(group)) return ArchivingOrchestrator(ORCHESTRATORS.get(group))
def read_user_groups(): def read_user_groups():
# read yaml safely # read yaml safely
with open(USER_GROUPS_FILENAME) as inf: with open(USER_GROUPS_FILENAME) as inf:
@@ -80,6 +99,7 @@ def read_user_groups():
logger.error(f"could not open user groups filename {USER_GROUPS_FILENAME}: {e}") logger.error(f"could not open user groups filename {USER_GROUPS_FILENAME}: {e}")
raise e raise e
def get_user_first_group(email): def get_user_first_group(email):
user_groups_yaml = read_user_groups() user_groups_yaml = read_user_groups()
groups = user_groups_yaml.get("users", {}).get(email, []) groups = user_groups_yaml.get("users", {}).get(email, [])
@@ -107,7 +127,35 @@ def load_orchestrators():
return ORCHESTRATORS return ORCHESTRATORS
## INIT def is_group_invalid_for_user(public: bool, group_id: str, author_id: str):
"""
ensures that, if a group is specified, the user belongs to it.
if public is true the requirement is not needed
returns an error message if invalid, or False if all is good.
"""
if not public and group_id and len(group_id) > 0:
# ensure group is valid for user
with get_db() as session:
db_group = crud.get_group_for_user(session, group_id, author_id)
if not db_group:
logger.error(em := f"User {author_id} is not part of {group_id}, no permission")
return em
return False
def insert_result_into_db(result: Metadata, tags: Set[str], public: bool, group_id: str, author_id: str, task_id:str):
logger.info(f"INSERTING {public=} {result} into {task_id}")
assert result, "UNABLE TO archive: {url}"
with get_db() as session:
# create DB URLs
db_urls = [models.ArchiveUrl(url=url, key=m.get("id", f"media_{i}")) for i, m in enumerate(result.media) for url in m.urls]
# create DB TAGs if needed
db_tags = [crud.create_tag(session, tag) for tag in tags]
# insert archive
db_task = crud.create_task(session, task=schemas.ArchiveCreate(id=task_id, url=result.get_url(), result=json.loads(result.to_json()), public=public, author_id=author_id, group_id=group_id), tags=db_tags, urls=db_urls)
logger.debug(f"Added {db_task.id=} to database on {db_task.created_at}")
# INIT
ORCHESTRATORS = {} ORCHESTRATORS = {}
load_orchestrators() load_orchestrators()