diff --git a/app/shared/utils/misc.py b/app/shared/utils/misc.py index 21e349a..7fd250d 100644 --- a/app/shared/utils/misc.py +++ b/app/shared/utils/misc.py @@ -1,3 +1,11 @@ +from typing import List + +from auto_archiver.core import Media, Metadata +from loguru import logger + +from app.shared.db import models + + def fnv1a_hash_mod(s: str, modulo: int) -> int: # receives a string and returns a number in [0:modulo-1], ensures an even # distribution over the modulo range @@ -12,3 +20,44 @@ def fnv1a_hash_mod(s: str, modulo: int) -> int: if offset_basis_hash < 0x80000000 else offset_basis_hash - 0x100000000 ) % modulo + + +def convert_if_media(media): + if isinstance(media, Media): + return media + elif isinstance(media, dict): + try: + return Media.from_dict(media) + except Exception as e: + logger.debug(f"error parsing {media} : {e}") + return False + + +def get_all_urls(result: Metadata) -> List[models.ArchiveUrl]: + db_urls = [] + for m in result.media: + for i, url in enumerate(m.urls): + db_urls.append( + models.ArchiveUrl(url=url, key=m.get("id", f"media_{i}")) + ) + for k, prop in m.properties.items(): + if prop_converted := convert_if_media(prop): + for i, url in enumerate(prop_converted.urls): + db_urls.append( + models.ArchiveUrl( + url=url, key=prop_converted.get("id", f"{k}_{i}") + ) + ) + if isinstance(prop, list): + for i, prop_media in enumerate(prop): + if prop_media := convert_if_media(prop_media): + for j, url in enumerate(prop_media.urls): + db_urls.append( + models.ArchiveUrl( + url=url, + key=prop_media.get( + "id", f"{k}{prop_media.key}_{i}.{j}" + ), + ) + ) + return db_urls diff --git a/app/tests/shared/utils/test_misc.py b/app/tests/shared/utils/test_misc.py index 18db28d..0bf0e17 100644 --- a/app/tests/shared/utils/test_misc.py +++ b/app/tests/shared/utils/test_misc.py @@ -1,4 +1,4 @@ -from app.shared.utils.misc import fnv1a_hash_mod +from app.shared.utils.misc import fnv1a_hash_mod, get_all_urls def test_fnv1a_hash_mod(): @@ -29,3 +29,32 @@ def test_fnv1a_hash_mod(): # Test modulo = 1 edge case assert fnv1a_hash_mod("test", 1) == 0 + + +def test_get_all_urls(db_session): + from auto_archiver.core import Media, Metadata + + meta = Metadata().set_url("https://example.com") + m1 = meta.add_media(Media("fn1.txt", urls=["outcome1.com"])) + m2 = meta.add_media(Media("fn2.txt", urls=["outcome2.com"])) + m3 = meta.add_media(Media("fn3.txt", urls=["outcome3.com"])) + m1.set("screenshot", Media("screenshot.png", urls=["screenshot.com"])) + m2.set( + "thumbnails", + [ + Media("thumb1.png", urls=["thumb1.com"]), + Media("thumb2.png", urls=["thumb2.com"]), + ], + ) + m3.set("ssl_data", Media("ssl_data.txt", urls=["ssl_data.com"]).to_dict()) + m3.set("bad_data", {"bad": "dict is ignored"}) + + urls = [u.url for u in get_all_urls(meta)] + assert len(urls) == 7 + assert "outcome1.com" in urls + assert "outcome2.com" in urls + assert "outcome3.com" in urls + assert "screenshot.com" in urls + assert "thumb1.com" in urls + assert "thumb2.com" in urls + assert "ssl_data.com" in urls diff --git a/app/tests/worker/test_worker_main.py b/app/tests/worker/test_worker_main.py index 39b2b17..948f554 100644 --- a/app/tests/worker/test_worker_main.py +++ b/app/tests/worker/test_worker_main.py @@ -6,7 +6,6 @@ from auto_archiver.core import Media, Metadata from app.shared import constants, schemas from app.shared.db import models -from app.web.utils.misc import get_all_urls from app.worker.main import create_archive_task, create_sheet_task @@ -147,30 +146,3 @@ class TestCreateSheetTask: assert inserted.group_id == "interstellar" assert inserted.author_id == "rick@example.com" assert inserted.public is False - - -def test_get_all_urls(db_session): - meta = Metadata().set_url("https://example.com") - m1 = meta.add_media(Media("fn1.txt", urls=["outcome1.com"])) - m2 = meta.add_media(Media("fn2.txt", urls=["outcome2.com"])) - m3 = meta.add_media(Media("fn3.txt", urls=["outcome3.com"])) - m1.set("screenshot", Media("screenshot.png", urls=["screenshot.com"])) - m2.set( - "thumbnails", - [ - Media("thumb1.png", urls=["thumb1.com"]), - Media("thumb2.png", urls=["thumb2.com"]), - ], - ) - m3.set("ssl_data", Media("ssl_data.txt", urls=["ssl_data.com"]).to_dict()) - m3.set("bad_data", {"bad": "dict is ignored"}) - - urls = [u.url for u in get_all_urls(meta)] - assert len(urls) == 7 - assert "outcome1.com" in urls - assert "outcome2.com" in urls - assert "outcome3.com" in urls - assert "screenshot.com" in urls - assert "thumb1.com" in urls - assert "thumb2.com" in urls - assert "ssl_data.com" in urls diff --git a/app/web/routers/interoperability.py b/app/web/routers/interoperability.py index 1292698..008d248 100644 --- a/app/web/routers/interoperability.py +++ b/app/web/routers/interoperability.py @@ -12,9 +12,9 @@ from app.shared import business_logic, schemas from app.shared.db import models, worker_crud from app.shared.db.database import get_db_dependency from app.shared.log import log_error +from app.shared.utils.misc import get_all_urls from app.web.config import ALLOW_ANY_EMAIL from app.web.security import token_api_key_auth -from app.web.utils.misc import get_all_urls router = APIRouter(prefix="/interop", tags=["Interoperability endpoints."]) diff --git a/app/web/utils/misc.py b/app/web/utils/misc.py index b561975..f78ae1e 100644 --- a/app/web/utils/misc.py +++ b/app/web/utils/misc.py @@ -1,11 +1,6 @@ import base64 -from typing import List -from auto_archiver.core import Media, Metadata from fastapi.encoders import jsonable_encoder -from loguru import logger - -from app.shared.db import models def custom_jsonable_encoder(obj): @@ -19,44 +14,3 @@ def convert_priority_to_queue_dict(priority: str) -> dict: "priority": 0 if priority == "high" else 10, "queue": f"{priority}_priority", } - - -def convert_if_media(media): - if isinstance(media, Media): - return media - elif isinstance(media, dict): - try: - return Media.from_dict(media) - except Exception as e: - logger.debug(f"error parsing {media} : {e}") - return False - - -def get_all_urls(result: Metadata) -> List[models.ArchiveUrl]: - db_urls = [] - for m in result.media: - for i, url in enumerate(m.urls): - db_urls.append( - models.ArchiveUrl(url=url, key=m.get("id", f"media_{i}")) - ) - for k, prop in m.properties.items(): - if prop_converted := convert_if_media(prop): - for i, url in enumerate(prop_converted.urls): - db_urls.append( - models.ArchiveUrl( - url=url, key=prop_converted.get("id", f"{k}_{i}") - ) - ) - if isinstance(prop, list): - for i, prop_media in enumerate(prop): - if prop_media := convert_if_media(prop_media): - for j, url in enumerate(prop_media.urls): - db_urls.append( - models.ArchiveUrl( - url=url, - key=prop_media.get( - "id", f"{k}{prop_media.key}_{i}.{j}" - ), - ) - ) - return db_urls diff --git a/app/worker/main.py b/app/worker/main.py index ad93122..20b7252 100644 --- a/app/worker/main.py +++ b/app/worker/main.py @@ -13,7 +13,7 @@ from app.shared.db.database import get_db from app.shared.log import log_error from app.shared.settings import get_settings from app.shared.task_messaging import get_celery, get_redis -from app.web.utils.misc import get_all_urls +from app.shared.utils.misc import get_all_urls from app.worker.worker_log import setup_celery_logger