mirror of
https://github.com/bellingcat/auto-archiver-api.git
synced 2026-06-12 21:48:35 +03:00
fully isolate worker from web via shared
This commit is contained in:
@@ -1,3 +1,11 @@
|
|||||||
|
from typing import List
|
||||||
|
|
||||||
|
from auto_archiver.core import Media, Metadata
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from app.shared.db import models
|
||||||
|
|
||||||
|
|
||||||
def fnv1a_hash_mod(s: str, modulo: int) -> int:
|
def fnv1a_hash_mod(s: str, modulo: int) -> int:
|
||||||
# receives a string and returns a number in [0:modulo-1], ensures an even
|
# receives a string and returns a number in [0:modulo-1], ensures an even
|
||||||
# distribution over the modulo range
|
# distribution over the modulo range
|
||||||
@@ -12,3 +20,44 @@ def fnv1a_hash_mod(s: str, modulo: int) -> int:
|
|||||||
if offset_basis_hash < 0x80000000
|
if offset_basis_hash < 0x80000000
|
||||||
else offset_basis_hash - 0x100000000
|
else offset_basis_hash - 0x100000000
|
||||||
) % modulo
|
) % modulo
|
||||||
|
|
||||||
|
|
||||||
|
def convert_if_media(media):
    """Return a ``Media`` for ``media`` when one can be obtained, else ``False``.

    A ``Media`` instance is handed back unchanged. A dict is passed to
    ``Media.from_dict``; if that call raises, the error is logged at debug
    level and ``False`` is returned.
    """
    # NOTE(review): the trailing ``return False`` is treated as the
    # function-level fallback (also reached for non-Media, non-dict input);
    # confirm against the upstream file's indentation.
    if isinstance(media, Media):
        return media
    if not isinstance(media, dict):
        return False
    try:
        parsed = Media.from_dict(media)
    except Exception as e:
        logger.debug(f"error parsing {media} : {e}")
        return False
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_urls(result: Metadata) -> List[models.ArchiveUrl]:
    """Build an ``ArchiveUrl`` row for every URL reachable from ``result``.

    For each media item in ``result.media`` this collects, in order:
    the item's own URLs, the URLs of every property value that
    ``convert_if_media`` turns into a ``Media``, and the URLs of every
    convertible entry inside list-valued properties. Each row's key is the
    media's ``"id"`` value when ``get`` finds one, otherwise a positional
    fallback string.
    """
    collected = []
    for item in result.media:
        # URLs attached directly to the media item.
        collected.extend(
            models.ArchiveUrl(url=link, key=item.get("id", f"media_{i}"))
            for i, link in enumerate(item.urls)
        )

        for k, value in item.properties.items():
            # Property that is itself a Media (or a dict describing one).
            single = convert_if_media(value)
            if single:
                collected.extend(
                    models.ArchiveUrl(
                        url=link, key=single.get("id", f"{k}_{i}")
                    )
                    for i, link in enumerate(single.urls)
                )

            # Property holding a list whose entries may be Media.
            if not isinstance(value, list):
                continue
            for i, candidate in enumerate(value):
                prop_media = convert_if_media(candidate)
                if not prop_media:
                    continue
                collected.extend(
                    models.ArchiveUrl(
                        url=link,
                        key=prop_media.get(
                            "id", f"{k}{prop_media.key}_{i}.{j}"
                        ),
                    )
                    for j, link in enumerate(prop_media.urls)
                )
    return collected
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from app.shared.utils.misc import fnv1a_hash_mod
|
from app.shared.utils.misc import fnv1a_hash_mod, get_all_urls
|
||||||
|
|
||||||
|
|
||||||
def test_fnv1a_hash_mod():
|
def test_fnv1a_hash_mod():
|
||||||
@@ -29,3 +29,32 @@ def test_fnv1a_hash_mod():
|
|||||||
|
|
||||||
# Test modulo = 1 edge case
|
# Test modulo = 1 edge case
|
||||||
assert fnv1a_hash_mod("test", 1) == 0
|
assert fnv1a_hash_mod("test", 1) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_all_urls(db_session):
    """get_all_urls returns URLs from media and from Media nested in properties."""
    from auto_archiver.core import Media, Metadata

    meta = Metadata().set_url("https://example.com")
    first = meta.add_media(Media("fn1.txt", urls=["outcome1.com"]))
    second = meta.add_media(Media("fn2.txt", urls=["outcome2.com"]))
    third = meta.add_media(Media("fn3.txt", urls=["outcome3.com"]))

    # A Media stored directly as a property value.
    first.set("screenshot", Media("screenshot.png", urls=["screenshot.com"]))

    # A list of Media stored as a property value.
    thumbnails = [
        Media("thumb1.png", urls=["thumb1.com"]),
        Media("thumb2.png", urls=["thumb2.com"]),
    ]
    second.set("thumbnails", thumbnails)

    # A Media serialised to a dict, plus a dict that is not a Media.
    ssl_as_dict = Media("ssl_data.txt", urls=["ssl_data.com"]).to_dict()
    third.set("ssl_data", ssl_as_dict)
    third.set("bad_data", {"bad": "dict is ignored"})

    found = [entry.url for entry in get_all_urls(meta)]

    assert len(found) == 7
    for expected in (
        "outcome1.com",
        "outcome2.com",
        "outcome3.com",
        "screenshot.com",
        "thumb1.com",
        "thumb2.com",
        "ssl_data.com",
    ):
        assert expected in found
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ from auto_archiver.core import Media, Metadata
|
|||||||
|
|
||||||
from app.shared import constants, schemas
|
from app.shared import constants, schemas
|
||||||
from app.shared.db import models
|
from app.shared.db import models
|
||||||
from app.web.utils.misc import get_all_urls
|
|
||||||
from app.worker.main import create_archive_task, create_sheet_task
|
from app.worker.main import create_archive_task, create_sheet_task
|
||||||
|
|
||||||
|
|
||||||
@@ -147,30 +146,3 @@ class TestCreateSheetTask:
|
|||||||
assert inserted.group_id == "interstellar"
|
assert inserted.group_id == "interstellar"
|
||||||
assert inserted.author_id == "rick@example.com"
|
assert inserted.author_id == "rick@example.com"
|
||||||
assert inserted.public is False
|
assert inserted.public is False
|
||||||
|
|
||||||
|
|
||||||
def test_get_all_urls(db_session):
    """get_all_urls returns URLs from media and from Media nested in properties."""
    meta = Metadata().set_url("https://example.com")
    first = meta.add_media(Media("fn1.txt", urls=["outcome1.com"]))
    second = meta.add_media(Media("fn2.txt", urls=["outcome2.com"]))
    third = meta.add_media(Media("fn3.txt", urls=["outcome3.com"]))

    # A Media stored directly as a property value.
    first.set("screenshot", Media("screenshot.png", urls=["screenshot.com"]))

    # A list of Media stored as a property value.
    thumbnails = [
        Media("thumb1.png", urls=["thumb1.com"]),
        Media("thumb2.png", urls=["thumb2.com"]),
    ]
    second.set("thumbnails", thumbnails)

    # A Media serialised to a dict, plus a dict that is not a Media.
    ssl_as_dict = Media("ssl_data.txt", urls=["ssl_data.com"]).to_dict()
    third.set("ssl_data", ssl_as_dict)
    third.set("bad_data", {"bad": "dict is ignored"})

    found = [entry.url for entry in get_all_urls(meta)]

    assert len(found) == 7
    for expected in (
        "outcome1.com",
        "outcome2.com",
        "outcome3.com",
        "screenshot.com",
        "thumb1.com",
        "thumb2.com",
        "ssl_data.com",
    ):
        assert expected in found
|
|
||||||
|
|||||||
@@ -12,9 +12,9 @@ from app.shared import business_logic, schemas
|
|||||||
from app.shared.db import models, worker_crud
|
from app.shared.db import models, worker_crud
|
||||||
from app.shared.db.database import get_db_dependency
|
from app.shared.db.database import get_db_dependency
|
||||||
from app.shared.log import log_error
|
from app.shared.log import log_error
|
||||||
|
from app.shared.utils.misc import get_all_urls
|
||||||
from app.web.config import ALLOW_ANY_EMAIL
|
from app.web.config import ALLOW_ANY_EMAIL
|
||||||
from app.web.security import token_api_key_auth
|
from app.web.security import token_api_key_auth
|
||||||
from app.web.utils.misc import get_all_urls
|
|
||||||
|
|
||||||
|
|
||||||
router = APIRouter(prefix="/interop", tags=["Interoperability endpoints."])
|
router = APIRouter(prefix="/interop", tags=["Interoperability endpoints."])
|
||||||
|
|||||||
@@ -1,11 +1,6 @@
|
|||||||
import base64
|
import base64
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from auto_archiver.core import Media, Metadata
|
|
||||||
from fastapi.encoders import jsonable_encoder
|
from fastapi.encoders import jsonable_encoder
|
||||||
from loguru import logger
|
|
||||||
|
|
||||||
from app.shared.db import models
|
|
||||||
|
|
||||||
|
|
||||||
def custom_jsonable_encoder(obj):
|
def custom_jsonable_encoder(obj):
|
||||||
@@ -19,44 +14,3 @@ def convert_priority_to_queue_dict(priority: str) -> dict:
|
|||||||
"priority": 0 if priority == "high" else 10,
|
"priority": 0 if priority == "high" else 10,
|
||||||
"queue": f"{priority}_priority",
|
"queue": f"{priority}_priority",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def convert_if_media(media):
    """Return a ``Media`` for ``media`` when one can be obtained, else ``False``.

    A ``Media`` instance is handed back unchanged. A dict is passed to
    ``Media.from_dict``; if that call raises, the error is logged at debug
    level and ``False`` is returned.
    """
    # NOTE(review): the trailing ``return False`` is treated as the
    # function-level fallback (also reached for non-Media, non-dict input);
    # confirm against the upstream file's indentation.
    if isinstance(media, Media):
        return media
    if not isinstance(media, dict):
        return False
    try:
        parsed = Media.from_dict(media)
    except Exception as e:
        logger.debug(f"error parsing {media} : {e}")
        return False
    return parsed
|
|
||||||
|
|
||||||
|
|
||||||
def get_all_urls(result: Metadata) -> List[models.ArchiveUrl]:
    """Build an ``ArchiveUrl`` row for every URL reachable from ``result``.

    For each media item in ``result.media`` this collects, in order:
    the item's own URLs, the URLs of every property value that
    ``convert_if_media`` turns into a ``Media``, and the URLs of every
    convertible entry inside list-valued properties. Each row's key is the
    media's ``"id"`` value when ``get`` finds one, otherwise a positional
    fallback string.
    """
    collected = []
    for item in result.media:
        # URLs attached directly to the media item.
        collected.extend(
            models.ArchiveUrl(url=link, key=item.get("id", f"media_{i}"))
            for i, link in enumerate(item.urls)
        )

        for k, value in item.properties.items():
            # Property that is itself a Media (or a dict describing one).
            single = convert_if_media(value)
            if single:
                collected.extend(
                    models.ArchiveUrl(
                        url=link, key=single.get("id", f"{k}_{i}")
                    )
                    for i, link in enumerate(single.urls)
                )

            # Property holding a list whose entries may be Media.
            if not isinstance(value, list):
                continue
            for i, candidate in enumerate(value):
                prop_media = convert_if_media(candidate)
                if not prop_media:
                    continue
                collected.extend(
                    models.ArchiveUrl(
                        url=link,
                        key=prop_media.get(
                            "id", f"{k}{prop_media.key}_{i}.{j}"
                        ),
                    )
                    for j, link in enumerate(prop_media.urls)
                )
    return collected
|
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ from app.shared.db.database import get_db
|
|||||||
from app.shared.log import log_error
|
from app.shared.log import log_error
|
||||||
from app.shared.settings import get_settings
|
from app.shared.settings import get_settings
|
||||||
from app.shared.task_messaging import get_celery, get_redis
|
from app.shared.task_messaging import get_celery, get_redis
|
||||||
from app.web.utils.misc import get_all_urls
|
from app.shared.utils.misc import get_all_urls
|
||||||
from app.worker.worker_log import setup_celery_logger
|
from app.worker.worker_log import setup_celery_logger
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user