Merge pull request #3 from bellingcat/sheet-endpoint

This commit is contained in:
Miguel Sozinho Ramalho
2023-05-23 20:21:24 +01:00
committed by GitHub
11 changed files with 1078 additions and 662 deletions

View File

@@ -55,7 +55,13 @@ Copy `.env` and `src/.env` to deployment, along with the contents of `secrets/`
Then `docker compose up -d`.
#### updating packages/app/access
If pipenv packages are updated: `pipenv lock --requirements -r > requirements.txt` (manually comment line `-i https://pypi.org/simple`) and then `docker compose down` + `docker compose up --build -d` to build images with new packages.
If pipenv packages are updated, run `pipenv lock --requirements -r > requirements.txt` (or `pipenv requirements > requirements.txt`, depending on the pipenv version), manually comment out the line `-i https://pypi.org/simple`, and then run `docker compose down` + `docker compose up --build -d` to rebuild the images with the new packages.
New users should be added to the `src/.env` file `ALLOWED_EMAILS` prop
```bash
# CALL /sheet POST endpoint
curl -XPOST -H "Authorization: Bearer GOOGLE_OAUTH_TOKEN" -H "Content-type: application/json" -d '{"sheet_id": "SHEET_ID", "header": 1}' 'http://localhost:8004/sheet'
```

View File

@@ -33,6 +33,7 @@ services:
redis:
image: redis:6-alpine
# command: redis-server /conf/redis.conf # DEV ONLY
command: redis-server /conf/redis.conf --requirepass ${REDIS_PASSWORD}
volumes:
- "./redis/data:/data"

View File

@@ -1,4 +1,6 @@
GOOGLE_CHROME_APP_ID=0000000000000000000000000000000000.apps.googleusercontent.com
# for validating non-email limited JWT
GOOGLE_CHROME_APP_ID_PUBLIC=0000000000000000000000000000000000.apps.googleusercontent.com
ALLOWED_EMAILS=email1,email2
ORCHESTRATION_CONFIG_DEFAULT=secrets/orchestration.yaml

View File

@@ -8,7 +8,7 @@ aiofiles = "==0.6.0"
celery = "==4.4.7"
fastapi = "*"
flower = "==0.9.7"
jinja2 = ">=3.0.3"
jinja2 = "*"
pytest = "==6.2.4"
redis = "==3.5.3"
requests = ">=2.25.1"

1257
src/Pipfile.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +1,6 @@
from functools import cache
from sqlalchemy.orm import Session, load_only
from sqlalchemy import Column
from sqlalchemy import Column, or_
from loguru import logger
from . import models, schemas
import yaml
@@ -13,17 +13,17 @@ def get_task(db: Session, task_id: str):
def get_tasks(db: Session, skip: int = 0, limit: int = 100):
    """
    Return one page of rows from the shared base query.

    `skip` rows are skipped and at most `limit` rows are returned.
    """
    page = base_query(db).offset(skip).limit(limit)
    return page.all()
def search_tasks_by_url(db: Session, url:str, skip: int = 0, limit: int = 100):
return base_query(db).filter(models.Archive.url.like(f'%{url}%')).offset(skip).limit(limit).all()
def search_tasks_by_url(db: Session, url: str, email: str, skip: int = 0, limit: int = 100):
    """
    Return one page of archives whose URL contains `url` and that `email` may see.

    An archive is visible when it is public, when it was authored by `email`,
    or when it belongs to one of the groups returned by `get_user_groups`.
    `skip` rows are skipped and at most `limit` rows are returned.
    """
    groups = get_user_groups(db, email)
    visible_to_user = or_(
        models.Archive.public == True,  # noqa: E712 - SQLAlchemy overloads "==" to build the SQL comparison
        models.Archive.author_id == email,
        models.Archive.group_id.in_(groups),
    )
    # autoescape makes "%" and "_" inside the search term match literally instead of
    # acting as LIKE wildcards (underscores and percent-encoding are common in URLs)
    url_matches = models.Archive.url.contains(url, autoescape=True)
    return base_query(db).filter(visible_to_user).filter(url_matches).offset(skip).limit(limit).all()
def search_tasks_by_email(db: Session, email:str, skip: int = 0, limit: int = 100):
    """
    Return one page of archives whose author has the given `email`.

    `skip` rows are skipped and at most `limit` rows are returned.
    """
    authored_by_user = models.Archive.author.has(email=email)
    page = base_query(db).filter(authored_by_user).offset(skip).limit(limit)
    return page.all()
def create_task(db: Session, task: schemas.ArchiveCreate, tags:list[models.Tag],urls:list[models.ArchiveUrl]):
db_task = models.Archive(id=task.id, url=task.url, author_id=task.author_id, result=task.result, group_id=task.group_id)
logger.debug(tags)
db_task.tags = tags # will this work? TODO: test if I don't call create tag before
db_task.urls = urls # will this work to create ArchiveUrl? TODO: test
db_task = models.Archive(id=task.id, url=task.url, result=task.result, public=task.public, author_id=task.author_id, group_id=task.group_id)
db_task.tags = tags
db_task.urls = urls
db.add(db_task)
db.commit()
db.refresh(db_task)

View File

@@ -1,6 +1,7 @@
from pydantic import BaseModel
from datetime import datetime
class ArchiveCreate(BaseModel):
id: str | None = None
url: str
@@ -8,11 +9,10 @@ class ArchiveCreate(BaseModel):
public: bool = True
author_id: str | None = None
group_id: str | None = None
tags: list = []
tags: set = set()
# urls: list = []
class Archive(ArchiveCreate):
created_at: datetime
updated_at: datetime | None
@@ -22,10 +22,12 @@ class Archive(ArchiveCreate):
orm_mode = True
# class TagCreate(BaseModel):
# id: str
# class Tag(TagCreate):
# created_at: datetime
# # class Config:
# # orm_mode = True
# Request payload for archiving a whole Google Sheet (body of the POST /sheet endpoint,
# re-parsed by the create_sheet_task worker).
class SubmitSheet(BaseModel):
    # sheet_name / sheet_id identify the spreadsheet; the endpoint rejects the
    # request with HTTP 422 when both are missing. They are forwarded to the
    # gsheet_feeder configuration as "sheet" and "sheet_id".
    sheet_name: str | None = None
    sheet_id: str | None = None
    # forwarded to the gsheet_feeder "header" option
    # NOTE(review): presumably the 1-based row holding the column headers - confirm
    header: int = 1
    # unlike ArchiveCreate, sheet submissions default to non-public
    public: bool = False
    # overwritten by the endpoint with the authenticated user's email
    author_id: str | None = None
    # when the submission is not public and a group is given, the worker checks
    # that author_id belongs to this group before archiving
    group_id: str | None = None
    # the worker adds the "gsheet" tag to this set before storing results
    tags: set | None = set()
    columns: dict | None = {} # TODO: implement

View File

@@ -10,7 +10,7 @@ from dotenv import load_dotenv
import traceback, os, logging
from loguru import logger
from worker import create_archive_task, celery
from worker import create_archive_task, create_sheet_task, celery
from db import crud, models, schemas
from db.database import engine, SessionLocal
@@ -21,11 +21,12 @@ load_dotenv()
# Configuration
ALLOWED_ORIGINS = os.environ.get("ALLOWED_ORIGINS", "chrome-extension://ondkcheoicfckabcnkdgbepofpjmjcmb,chrome-extension://ojcimmjndnlmmlgnjaeojoebaceokpdp").split(",")
VERSION = "0.4.0"
# min-version refers to the version of auto-archiver-extension on the webstore
BREAKING_CHANGES = {"minVersion": "0.3.0", "message": "The latest update has breaking changes, please update the extension to the most recent version."}
VERSION = "0.5.0"
app = FastAPI()
# min-version refers to the version of auto-archiver-extension on the webstore
BREAKING_CHANGES = {"minVersion": "0.3.1", "message": "The latest update has breaking changes, please update the extension to the most recent version."}
app = FastAPI(title="Auto-Archiver API", version=VERSION, contact={"name":"Bellingcat", "url":"https://github.com/bellingcat/auto-archiver-api"})
app.add_middleware(
CORSMiddleware,
allow_origins=ALLOWED_ORIGINS,
@@ -39,7 +40,14 @@ def get_db():
session = SessionLocal()
try: yield session
finally: session.close()
# logging configurations
logger.add("logs/api_logs.log", retention="30 days", rotation="3 days")
@app.middleware("http")
async def logging_middleware(request: Request, call_next):
    """
    Log one line per HTTP request once the response is available.

    The line contains the client address, the HTTP method, the full request
    URL and the response status code. The response is returned unchanged.
    """
    response = await call_next(request)
    # request.client is None when the server provides no peer address,
    # so it must not be dereferenced unconditionally
    client = request.client
    origin = f"{client.host}:{client.port}" if client else "unknown"
    # formatting request.url yields the full URL without reaching into the private _url attribute
    logger.info(f"{origin} {request.method} {request.url} - HTTP {response.status_code}")
    return response
@app.get("/")
async def home(request: Request):
@@ -53,14 +61,6 @@ async def home(request: Request):
except Exception as e: logger.error(e)
return JSONResponse(status)
# logging configurations
logger.add("logs/api_logs.log", retention="30 days", rotation="3 days")
@app.middleware("http")
async def logging_middleware(request: Request, call_next):
response = await call_next(request)
logger.info(f"{request.client.host}:{request.client.port} {request.method} {request.url._url} - HTTP {response.status_code}")
return response
# Bearer protected below
@@ -69,22 +69,17 @@ def get_user_groups(db: Session = Depends(get_db), email = Depends(get_bearer_au
return crud.get_user_groups(db, email)
@app.get("/tasks/search-url", response_model=list[schemas.Archive])
def search(url:str, skip: int = 0, limit: int = 100, db: Session = Depends(get_db), _email = Depends(get_bearer_auth)):
return crud.search_tasks_by_url(db, url, skip=skip, limit=limit)
# @app.get("/tasks/search", response_model=list[schemas.Task])
# def search(skip: int = 0, limit: int = 100, db: Session = Depends(get_db), email = Depends(get_bearer_auth)):
# return crud.get_tasks(db, skip=skip, limit=limit)
def search(url:str, skip: int = 0, limit: int = 100, db: Session = Depends(get_db), email = Depends(get_bearer_auth)):
return crud.search_tasks_by_url(db, url, email, skip=skip, limit=limit)
@app.get("/tasks/sync", response_model=list[schemas.Archive])
def search(skip: int = 0, limit: int = 100, db: Session = Depends(get_db), email = Depends(get_bearer_auth)):
    # returns one page of the archives authored by the authenticated user
    own_archives = crud.search_tasks_by_email(db, email, skip=skip, limit=limit)
    return own_archives
@app.post("/tasks", status_code=201)
def run_task(archive:schemas.ArchiveCreate, email = Depends(get_bearer_auth)):
def archive_sheet(archive:schemas.ArchiveCreate, email = Depends(get_bearer_auth)):
archive.author_id = email
url = archive.url
logger.warning(archive)
logger.info(f"new {archive.public=} task for {email=} and {archive.group_id=}: {url}")
if type(url)!=str or len(url)<=5:
raise HTTPException(status_code=422, detail=f"Invalid URL received: {url}")
@@ -92,22 +87,10 @@ def run_task(archive:schemas.ArchiveCreate, email = Depends(get_bearer_auth)):
task = create_archive_task.delay(archive.json())
return JSONResponse({"id": task.id})
# @app.post("/tasks", status_code=201)
# def run_task(payload = Body(...), email = Depends(get_bearer_auth)):
# url = payload.get('url')
# public = payload.get('public', True)
# group = payload.get('group', None)
# logger.info(f"new {public=} task for {email=} and {group=}: {url}")
# if type(url)!=str or len(url)<=5:
# raise HTTPException(status_code=422, detail=f"Invalid URL received: {url}")
# task = create_archive_task.delay(url=payload.get('url'), email=email, public=public, group=group)
# return JSONResponse({"id": task.id})
@app.get("/tasks/{task_id}")
def get_status(task_id, email = Depends(get_bearer_auth)):
logger.info(f"status check for user {email}")
task_result = AsyncResult(task_id, app=celery)
logger.info(task_result)
result = {
"id": task_id,
"status": task_result.status,
@@ -116,7 +99,10 @@ def get_status(task_id, email = Depends(get_bearer_auth)):
try:
if task_result.result and "error" in task_result.result:
result["status"] = "FAILURE"
except Exception as e: logger.error(traceback.format_exc())
except Exception as e:
logger.error(e)
logger.error(traceback.format_exc())
result["status"] = "FAILURE"
try:
json_result = jsonable_encoder(result, exclude_unset=True)
return JSONResponse(json_result)
@@ -131,13 +117,22 @@ def get_status(task_id, email = Depends(get_bearer_auth)):
@app.delete("/tasks/{task_id}")
def get_status(task_id, db: Session = Depends(get_db), email = Depends(get_bearer_auth)):
def delete_task(task_id, db: Session = Depends(get_db), email = Depends(get_bearer_auth)):
logger.info(f"deleting task {task_id} request by {email}")
return JSONResponse({
"id": task_id,
"deleted": crud.soft_delete_task(db, task_id, email)
})
@app.post("/sheet", status_code=201)
def archive_sheet(sheet:schemas.SubmitSheet, email = Depends(get_bearer_auth)):
    # Queues a sheet-archiving task and responds with the id of the queued task.
    # The payload is logged as received, before the author is overwritten.
    logger.info(f"SHEET TASK for {sheet=}")
    # the author is always the authenticated user, whatever the payload says
    sheet.author_id = email
    has_target = bool(sheet.sheet_name or sheet.sheet_id)
    if not has_target:
        raise HTTPException(status_code=422, detail="sheet name or id is required")
    queued = create_sheet_task.delay(sheet.json())
    return JSONResponse({"id": queued.id})
# Basic protected logic to allow access to 1 static file
SF = os.environ.get("STATIC_FILE", "")
if len(SF) > 1 and os.path.isfile(SF):

View File

@@ -1,133 +1,134 @@
#
# These requirements were autogenerated by pipenv
# To regenerate from the project's Pipfile, run:
#
# pipenv lock --requirements
#
# -i https://pypi.org/simple
aiofiles==0.6.0
aiohttp==3.8.4 ; python_version >= '3.6'
aiosignal==1.3.1 ; python_version >= '3.7'
aiosqlite==0.19.0
alembic==1.10.3
amqp==2.6.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
anyio==3.6.2; python_full_version >= '3.6.2'
alembic==1.11.1
amqp==2.6.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
anyio==3.6.2 ; python_full_version >= '3.6.2'
argparse==1.4.0
async-generator==1.10; python_version >= '3.5'
attrs==23.1.0; python_version >= '3.7'
async-generator==1.10 ; python_version >= '3.5'
async-timeout==4.0.2 ; python_version >= '3.6'
attrs==23.1.0 ; python_version >= '3.7'
authlib==0.15.6
auto-archiver==0.5.6
beautifulsoup4==4.12.2; python_version >= '3.6'
auto-archiver==0.5.17
beautifulsoup4==4.12.2 ; python_full_version >= '3.6.0'
billiard==3.6.4.0
boto3==1.26.115; python_version >= '3.7'
botocore==1.29.115; python_version >= '3.7'
brotli==1.0.9; platform_python_implementation == 'CPython'
blinker==1.6.2 ; python_version >= '3.7'
boto3==1.26.138 ; python_version >= '3.7'
botocore==1.29.138 ; python_version >= '3.7'
brotli==1.0.9 ; platform_python_implementation == 'CPython'
bs4==0.0.1
cachetools==5.3.0; python_version ~= '3.7'
cachetools==5.3.0 ; python_version ~= '3.7'
celery==4.4.7
certifi==2022.12.7; python_version >= '3.6'
certifi==2023.5.7 ; python_version >= '3.6'
cffi==1.15.1
charset-normalizer==3.0.1
click==8.1.3; python_version >= '3.7'
cloudscraper==1.2.69
cryptography==38.0.4; python_version >= '3.6'
dataclasses-json==0.5.7; python_version >= '3.6'
dateparser==1.1.8; python_version >= '3.7'
exceptiongroup==1.1.1; python_version < '3.11'
charset-normalizer==3.1.0 ; python_full_version >= '3.7.0'
click==8.1.3 ; python_version >= '3.7'
cloudscraper==1.2.71
cryptography==38.0.4 ; python_version >= '3.6'
dataclasses-json==0.5.7 ; python_version >= '3.6'
dateparser==1.1.8 ; python_version >= '3.7'
exceptiongroup==1.1.1 ; python_version < '3.11'
fastapi==0.95.2
fastapi-utils==0.2.1
fastapi==0.95.1
ffmpeg-python==0.2.0
filelock==3.12.0; python_version >= '3.7'
flask==2.2.3; python_version >= '3.7'
filelock==3.12.0 ; python_version >= '3.7'
flask==2.3.2 ; python_version >= '3.8'
flower==0.9.7
future==0.18.3; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'
google-api-core==2.11.0; python_version >= '3.7'
google-api-python-client==2.86.0; python_version >= '3.7'
frozenlist==1.3.3 ; python_version >= '3.7'
future==0.18.3 ; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'
google-api-core==2.11.0 ; python_version >= '3.7'
google-api-python-client==2.86.0 ; python_version >= '3.7'
google-auth==2.18.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
google-auth-httplib2==0.1.0
google-auth-oauthlib==1.0.0; python_version >= '3.6'
google-auth==2.17.3; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
googleapis-common-protos==1.59.0; python_version >= '3.7'
greenlet==2.0.2; python_version >= '3' and platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32')))))
gspread==5.8.0; python_version not in '3.0, 3.1, 3.2, 3.3' and python_version >= '3.6'
h11==0.14.0; python_version >= '3.7'
httplib2==0.22.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
humanize==4.6.0; python_version >= '3.7'
idna==3.4; python_version >= '3.5'
iniconfig==2.0.0; python_version >= '3.7'
instaloader==4.9.6; python_version >= '3.8'
itsdangerous==2.1.2; python_version >= '3.7'
google-auth-oauthlib==1.0.0 ; python_version >= '3.6'
googleapis-common-protos==1.59.0 ; python_version >= '3.7'
greenlet==2.0.2 ; python_version >= '3' and (platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32'))))))
gspread==5.9.0 ; python_version not in '3.0, 3.1, 3.2, 3.3' and python_version >= '3.6'
h11==0.14.0 ; python_version >= '3.7'
httpcore==0.17.2 ; python_version >= '3.7'
httplib2==0.22.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
httpx==0.24.1 ; python_version >= '3.7'
humanize==4.6.0 ; python_version >= '3.7'
idna==3.4 ; python_version >= '3.5'
iniconfig==2.0.0 ; python_version >= '3.7'
instaloader==4.9.6 ; python_version >= '3.8'
itsdangerous==2.1.2 ; python_version >= '3.7'
jinja2==3.1.2
jmespath==1.0.1; python_version >= '3.7'
kombu==4.6.11; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
jmespath==1.0.1 ; python_version >= '3.7'
kombu==4.6.11 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
loguru==0.7.0
lxml==4.9.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
mako==1.2.4; python_version >= '3.7'
markdown-it-py==2.2.0; python_version >= '3.7'
markupsafe==2.1.2; python_version >= '3.7'
lxml==4.9.2 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
mako==1.2.4 ; python_version >= '3.7'
markdown-it-py==2.2.0 ; python_version >= '3.7'
markupsafe==2.1.2 ; python_version >= '3.7'
marshmallow==3.19.0 ; python_version >= '3.7'
marshmallow-enum==1.5.1
marshmallow==3.19.0; python_version >= '3.7'
mdurl==0.1.2; python_version >= '3.7'
mutagen==1.46.0; python_version >= '3.7'
mypy-extensions==1.0.0; python_version >= '3.5'
mdurl==0.1.2 ; python_version >= '3.7'
multidict==6.0.4 ; python_version >= '3.7'
mutagen==1.46.0 ; python_version >= '3.7'
mypy-extensions==1.0.0 ; python_version >= '3.5'
oauth2client==4.1.3
oauthlib==3.2.2; python_version >= '3.6'
outcome==1.2.0; python_version >= '3.7'
packaging==23.1; python_version >= '3.7'
pluggy==0.13.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
oauthlib==3.2.2 ; python_version >= '3.6'
outcome==1.2.0 ; python_version >= '3.7'
packaging==23.1 ; python_version >= '3.7'
pluggy==0.13.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
prometheus-client==0.8.0
protobuf==4.22.3; python_version >= '3.7'
py==1.11.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
protobuf==4.23.1 ; python_version >= '3.7'
py==1.11.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
pyaes==1.6.1
pyasn1-modules==0.2.8
pyasn1==0.4.8
pyasn1==0.5.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
pyasn1-modules==0.3.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
pycparser==2.21
pycryptodomex==3.17; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
pydantic==1.10.7; python_version >= '3.7'
pygments==2.15.0; python_version >= '3.7'
pyparsing==3.0.9; python_version >= '3.1'
pycryptodomex==3.18.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
pydantic==1.10.7 ; python_version >= '3.7'
pygments==2.15.1 ; python_version >= '3.7'
pyparsing==3.0.9 ; python_version >= '3.1'
pysocks==1.7.1
pytest==6.2.4
python-dateutil==2.8.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
python-dateutil==2.8.2 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
python-dotenv==1.0.0
python-slugify==8.0.1; python_version >= '3.7'
python-twitter-v2==0.8.1; python_version >= '3.6' and python_version < '4'
pytz-deprecation-shim==0.1.0.post0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
python-slugify==8.0.1 ; python_version >= '3.7'
python-twitter-v2==0.8.1 ; python_version >= '3.6' and python_version < '4.0'
pytz==2023.3
pyyaml==6.0; python_version >= '3.6'
pyyaml==6.0 ; python_version >= '3.6'
redis==3.5.3
regex==2023.3.23; python_version >= '3.8'
requests-oauthlib==1.3.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
requests-toolbelt==0.10.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
requests==2.28.2
rich==13.3.4; python_version >= '3.7'
rsa==4.9; python_version >= '3.6' and python_version < '4'
s3transfer==0.6.0; python_version >= '3.7'
selenium==4.8.3; python_version >= '3.7'
six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
sniffio==1.3.0; python_version >= '3.7'
snscrape==0.6.2.20230320; python_version ~= '3.8'
regex==2023.5.5 ; python_version >= '3.6'
requests==2.31.0
requests-oauthlib==1.3.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
requests-toolbelt==1.0.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
rich==13.3.5 ; python_full_version >= '3.7.0'
rsa==4.9 ; python_version >= '3.6' and python_version < '4'
s3transfer==0.6.1 ; python_version >= '3.7'
selenium==4.9.1 ; python_version >= '3.7'
six==1.16.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
sniffio==1.3.0 ; python_version >= '3.7'
snscrape==0.6.2.20230320 ; python_version ~= '3.8'
sortedcontainers==2.4.0
soupsieve==2.4.1; python_version >= '3.7'
sqlalchemy==1.4.47
starlette==0.26.1; python_version >= '3.7'
telethon==1.28.5; python_version >= '3.5'
soupsieve==2.4.1 ; python_version >= '3.7'
sqlalchemy==1.4.48
starlette==0.27.0 ; python_version >= '3.7'
telethon==1.28.5 ; python_version >= '3.5'
text-unidecode==1.3
tiktok-downloader==0.3.4
toml==0.10.2; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'
tornado==6.3; python_full_version >= '3.5.2'
tqdm==4.65.0; python_version >= '3.7'
trio-websocket==0.10.2; python_version >= '3.7'
trio==0.22.0; python_version >= '3.7'
typing-extensions==4.5.0; python_version >= '3.7'
tiktok-downloader==0.3.5
toml==0.10.2 ; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2'
tornado==6.3.2 ; python_version >= '3.5.2'
tqdm==4.65.0 ; python_version >= '3.7'
trio==0.22.0 ; python_version >= '3.7'
trio-websocket==0.10.2 ; python_version >= '3.7'
typing-extensions==4.6.0 ; python_version >= '3.7'
typing-inspect==0.8.0
tzdata==2023.3; python_version >= '3.6'
tzlocal==4.3; python_version >= '3.7'
uritemplate==4.1.1; python_version >= '3.6'
urllib3==1.26.14; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
uvicorn==0.21.1
vine==1.3.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
tzlocal==5.0.1 ; python_version >= '3.7'
uritemplate==4.1.1 ; python_version >= '3.6'
urllib3==1.26.16 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
uvicorn==0.22.0
uwsgi==2.0.21
vine==1.3.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
vk-api==11.9.9
vk-url-scraper==0.3.15; python_version >= '3.7'
websockets==10.4; python_version >= '3.7'
werkzeug==2.2.3; python_version >= '3.7'
wsproto==1.2.0; python_version >= '3.7'
yt-dlp==2023.2.17; python_version >= '3.7'
vk-url-scraper==0.3.24 ; python_version >= '3.7'
websockets==11.0.3 ; python_version >= '3.7'
werkzeug==2.3.4 ; python_version >= '3.8'
wsproto==1.2.0 ; python_full_version >= '3.7.0'
yarl==1.9.2 ; python_version >= '3.7'
yt-dlp==2023.3.4 ; python_version >= '3.7'

View File

@@ -4,18 +4,21 @@ from fastapi import HTTPException, status, Depends
from fastapi.security import HTTPBasic, HTTPBasicCredentials, HTTPBearer, HTTPAuthorizationCredentials
# Configuration
GOOGLE_CHROME_APP_ID = os.environ.get("GOOGLE_CHROME_APP_ID")
assert len(GOOGLE_CHROME_APP_ID)>10, "GOOGLE_CHROME_APP_ID env variable not set"
ALLOWED_EMAILS = set([e.strip().lower() for e in os.environ.get("ALLOWED_EMAILS", "").split(",")])
assert len(GOOGLE_CHROME_APP_ID)>=1, "at least one ALLOWED_EMAILS is required from the env variable"
logger.info(f"{len(ALLOWED_EMAILS)=}")
CHROME_APP_IDS = set([app_id.strip() for app_id in os.environ.get("CHROME_APP_IDS", "").split(",")])
assert len(CHROME_APP_IDS) > 0, "CHROME_APP_IDS env variable not properly set, it's a csv"
for app_id in CHROME_APP_IDS:
assert len(app_id) > 10, f"CHROME_APP_IDS got invalid id: {app_id} env variable not set"
logger.info(f"{CHROME_APP_IDS=}")
BLOCKED_EMAILS = set([e.strip().lower() for e in os.environ.get("BLOCKED_EMAILS", "").split(",")])
logger.info(f"{len(BLOCKED_EMAILS)=}")
basic_security = HTTPBasic()
bearer_security = HTTPBearer()
#--------------------- Bearer Auth
# --------------------- Bearer Auth
async def get_bearer_auth(credentials: HTTPAuthorizationCredentials = Depends(bearer_security)):
# validates the Bearer token in the case that it requires it
@@ -28,20 +31,20 @@ async def get_bearer_auth(credentials: HTTPAuthorizationCredentials = Depends(be
detail=info,
headers={"WWW-Authenticate": "Bearer"},
)
def authenticate_user(access_token):
# https://cloud.google.com/docs/authentication/token-types#access
if type(access_token)!=str or len(access_token)<10: return False, "invalid access_token"
r = requests.get("https://oauth2.googleapis.com/tokeninfo", {"access_token":access_token})
if r.status_code!=200: return False, "error occurred"
if type(access_token) != str or len(access_token) < 10: return False, "invalid access_token"
r = requests.get("https://oauth2.googleapis.com/tokeninfo", {"access_token": access_token})
if r.status_code != 200: return False, "error occurred"
try:
j = r.json()
if j.get("azp") != GOOGLE_CHROME_APP_ID and j.get("aud")!=GOOGLE_CHROME_APP_ID:
return False, f"token does not belong to correct APP_ID"
# if j.get("email") not in ALLOWED_EMAILS:
if not custom_is_email_allowed(j.get("email"), any_bellingcat_email=True):
if j.get("azp") not in CHROME_APP_IDS and j.get("aud") not in CHROME_APP_IDS:
return False, f"token does not belong to valid APP_ID"
if j.get("email") in BLOCKED_EMAILS:
return False, f"email '{j.get('email')}' not allowed"
if j.get("email_verified") != "true":
if j.get("email_verified") != "true":
return False, f"email '{j.get('email')}' not verified"
if int(j.get("expires_in", -1)) <= 0:
return False, "Token expired"
@@ -50,12 +53,11 @@ def authenticate_user(access_token):
logger.warning(f"EXCEPTION occurred: {e}")
return False, f"EXCEPTION occurred"
def custom_is_email_allowed(email, any_bellingcat_email=False):
return email.lower() in ALLOWED_EMAILS or (any_bellingcat_email and re.match(r'^[\w.]+@bellingcat\.com$', email))
# --------------------- Basic Auth
SFP = os.environ.get("STATIC_FILE_PASSWORD", "") # min length is 20 chars
#--------------------- Basic Auth
SFP = os.environ.get("STATIC_FILE_PASSWORD", "") # min length is 20 chars
async def get_basic_auth(credentials: HTTPBasicCredentials = Depends(basic_security)):
# validates that the Basic token in the case that it requires it
assert len(SFP) >= 20, "Invalid STATIC_FILE_PASSWORD, must be at least 20 chars"
@@ -66,4 +68,4 @@ async def get_basic_auth(credentials: HTTPBasicCredentials = Depends(basic_secur
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Wrong static file access credentials",
headers={"WWW-Authenticate": "Basic"}
)
)

View File

@@ -1,22 +1,21 @@
import os, re, traceback, yaml
import os, traceback, yaml, datetime
from typing import List, Set
from celery import Celery, states
from celery.exceptions import Ignore
from celery import Celery
from celery.signals import task_failure
from auto_archiver import Config, ArchivingOrchestrator, Metadata
# from auto_archiver.enrichers import ScreenshotEnricher
from loguru import logger
from db import crud, schemas, models
from db.database import engine, SessionLocal
from db.database import SessionLocal
from contextlib import contextmanager
import json
celery = Celery(__name__)
celery.conf.broker_url = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379")
celery.conf.result_backend = os.environ.get("CELERY_RESULT_BACKEND", "redis://localhost:6379")
USER_GROUPS_FILENAME=os.environ.get("USER_GROUPS_FILENAME", "user-groups.yaml")
USER_GROUPS_FILENAME = os.environ.get("USER_GROUPS_FILENAME", "user-groups.yaml")
@contextmanager
@@ -25,38 +24,56 @@ def get_db():
try: yield session
finally: session.close()
@celery.task(name="create_archive_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 5})
@celery.task(name="create_archive_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 5})
def create_archive_task(self, archive_json: str):
archive = schemas.ArchiveCreate.parse_raw(archive_json)
if not archive.public and archive.group_id and len(archive.group_id) > 0:
# ensure group is valid for user
with get_db() as session:
db_group = crud.get_group_for_user(session, archive.group_id, archive.author_id)
if not db_group:
logger.error(em := f"User {archive.author_id} is not part of {archive.group_id}, no permission")
return {"error": em}
if (em := is_group_invalid_for_user(archive.public, archive.group_id, archive.author_id)): return {"error": em}
url = archive.url
logger.info(f"{url=}")
logger.info(f"{archive=}")
logger.info(f"{url=} {archive=}")
orchestrator = choose_orchestrator(archive.group_id, archive.author_id)
result = orchestrator.feed_item(Metadata().set_url(url))
if not result:
logger.error(f"UNABLE TO archive: {url}")
return {"error": "unable to archive"}
result_json = result.to_json()
with get_db() as session:
# create DB URLs
db_urls = [models.ArchiveUrl(url=url, key=m.get("id", f"media_{i}")) for i, m in enumerate(result.media) for url in m.urls]
# create DB TAGs if needed
db_tags = [crud.create_tag(session, tag) for tag in archive.tags]
# insert archive
db_task = crud.create_task(session, task=schemas.ArchiveCreate(id=self.request.id, url=url, result=json.loads(result_json), public=archive.public, author_id=archive.author_id, group_id=archive.group_id), tags=db_tags, urls=db_urls)
logger.debug(f"Added {db_task.id=} to database on {db_task.created_at}")
return result_json
try:
insert_result_into_db(result, archive.tags, archive.public, archive.group_id, archive.author_id, self.request.id)
except Exception as e:
logger.error(e)
logger.error(traceback.format_exc())
return {"error": e}
return result.to_json()
@celery.task(name="create_sheet_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 0})
def create_sheet_task(self, sheet_json: str):
    """
    Archive every item fed from a Google Sheet and store each result in the database.

    `sheet_json` is a JSON-serialized `schemas.SubmitSheet`.

    Returns `{"error": message}` when the author is not allowed to use the
    requested group. Otherwise returns a summary dict with the keys
    "success", "sheet", "sheet_id", "time", "archived", "failed" and
    "errors", where "errors" holds one message string per failed insert.
    """
    sheet = schemas.SubmitSheet.parse_raw(sheet_json)
    # tags is Optional in the schema, so an explicit null must not break tagging
    sheet.tags = set(sheet.tags or ()) | {"gsheet"}
    logger.info(f"SHEET START {sheet=}")

    if (em := is_group_invalid_for_user(sheet.public, sheet.group_id, sheet.author_id)):
        return {"error": em}

    config = Config()
    # TODO: use choose_orchestrator and overwrite the feeder
    feeder_overrides = {"gsheet_feeder": {"sheet": sheet.sheet_name, "sheet_id": sheet.sheet_id, "header": sheet.header}}
    config.parse(use_cli=False, yaml_config_filename="secrets/orchestration-sheet.yaml", overwrite_configs={"configurations": feeder_overrides})
    orchestrator = ArchivingOrchestrator(config)

    stats = {"archived": 0, "failed": 0, "errors": []}
    for result in orchestrator.feed():
        try:
            insert_result_into_db(result, sheet.tags, sheet.public, sheet.group_id, sheet.author_id, models.generate_uuid())
            stats["archived"] += 1
        except Exception as e:
            # one failing row must not abort the rest of the sheet
            logger.error(e)
            logger.error(traceback.format_exc())
            stats["failed"] += 1
            # store the message rather than the exception object: the returned dict is the
            # task result and exception instances cannot be JSON-serialized
            stats["errors"].append(str(e))

    logger.info(f"SHEET DONE {sheet=}")
    return {"success": True, "sheet": sheet.sheet_name, "sheet_id": sheet.sheet_id, "time": datetime.datetime.now().isoformat(), **stats}
@task_failure.connect(sender=create_sheet_task)
@task_failure.connect(sender=create_archive_task)
def task_failure_notifier(sender=None, **kwargs):
logger.warning("😅 From task_failure_notifier ==> Task failed successfully! ")
@@ -64,6 +81,7 @@ def task_failure_notifier(sender=None, **kwargs):
logger.error(kwargs['traceback'])
logger.error("\n".join(traceback.format_list(traceback.extract_tb(kwargs['traceback']))))
def choose_orchestrator(group, email):
global ORCHESTRATORS
if group not in ORCHESTRATORS: group = get_user_first_group(email)
@@ -71,6 +89,7 @@ def choose_orchestrator(group, email):
logger.info(f"CHOOSE Orchestrator for {group=}, {email=}")
return ArchivingOrchestrator(ORCHESTRATORS.get(group))
def read_user_groups():
# read yaml safely
with open(USER_GROUPS_FILENAME) as inf:
@@ -80,6 +99,7 @@ def read_user_groups():
logger.error(f"could not open user groups filename {USER_GROUPS_FILENAME}: {e}")
raise e
def get_user_first_group(email):
user_groups_yaml = read_user_groups()
groups = user_groups_yaml.get("users", {}).get(email, [])
@@ -94,12 +114,12 @@ def load_orchestrators():
reads the orchestrators key in the config file to load different orchestrators for different groups
"""
user_groups_yaml = read_user_groups()
orchestrators_config = user_groups_yaml.get("orchestrators", {})
assert len(orchestrators_config), f"No orchestrators key found in {USER_GROUPS_FILENAME}. please see the example file"
assert "default" in orchestrators_config, "please include a 'default' orchestrator to be used when the user has no group"
logger.debug(f"Found {len(orchestrators_config)} group orchestrators.")
for group, config_filename in orchestrators_config.items():
config = Config()
config.parse(use_cli=False, yaml_config_filename=config_filename)
@@ -107,7 +127,35 @@ def load_orchestrators():
return ORCHESTRATORS
## INIT
def is_group_invalid_for_user(public: bool, group_id: str, author_id: str):
    """
    Check whether `author_id` is allowed to archive into `group_id`.

    The membership lookup only happens for non-public archives that name a
    group; public archives and archives without a group are always accepted.

    Returns False when everything is fine, otherwise the error message
    (which is also logged).
    """
    # nothing to verify for public archives or when no group was requested
    if public or not group_id:
        return False

    with get_db() as session:
        if crud.get_group_for_user(session, group_id, author_id):
            return False
        message = f"User {author_id} is not part of {group_id}, no permission"
        logger.error(message)
        return message
def insert_result_into_db(result: Metadata, tags: Set[str], public: bool, group_id: str, author_id: str, task_id: str):
    """
    Store one archiving result, with its media URLs and tags, as an archive row.

    `result` is the archiving outcome; its URL, JSON form and media entries
    are persisted. `tags` are created through `crud.create_tag` as needed
    and attached to the archive; `None` is treated as no tags. `task_id`
    becomes the id of the stored archive.

    Raises ValueError when `result` is falsy, i.e. there is nothing to store.
    """
    # validate before logging or opening a session; an explicit raise also survives "python -O"
    if not result:
        raise ValueError(f"UNABLE TO archive: no result to insert for task {task_id}")
    logger.info(f"INSERTING {public=} {result} into {task_id}")

    with get_db() as session:
        # one ArchiveUrl per stored URL; media without an explicit id get a positional key
        db_urls = [
            models.ArchiveUrl(url=url, key=m.get("id", f"media_{i}"))
            for i, m in enumerate(result.media)
            for url in m.urls
        ]
        # create DB tags if needed
        db_tags = [crud.create_tag(session, tag) for tag in (tags or ())]
        # insert archive
        archive = schemas.ArchiveCreate(
            id=task_id,
            url=result.get_url(),
            result=json.loads(result.to_json()),
            public=public,
            author_id=author_id,
            group_id=group_id,
        )
        db_task = crud.create_task(session, task=archive, tags=db_tags, urls=db_urls)
        logger.debug(f"Added {db_task.id=} to database on {db_task.created_at}")
# INIT
ORCHESTRATORS = {}
load_orchestrators()
load_orchestrators()