fully isolate worker from web via shared

2026-06-07 19:18:34 +03:00 · 2025-06-30 02:48:27 +01:00
parent f27fb12b27
commit 7d66f2b6ac
6 changed files with 81 additions and 77 deletions
--- a/app/shared/utils/misc.py
+++ b/app/shared/utils/misc.py
@@ -1,3 +1,11 @@
+from typing import List
+
+from auto_archiver.core import Media, Metadata
+from loguru import logger
+
+from app.shared.db import models
+
+
 def fnv1a_hash_mod(s: str, modulo: int) -> int:
    # receives a string and returns a number in [0:modulo-1], ensures an even
    # distribution over the modulo range
@@ -12,3 +20,44 @@ def fnv1a_hash_mod(s: str, modulo: int) -> int:
        if offset_basis_hash < 0x80000000
        else offset_basis_hash - 0x100000000
    ) % modulo
+
+
+def convert_if_media(media):  # coerce to Media when possible; returns False otherwise
+    if isinstance(media, Media):
+        return media  # already a Media: handed back unchanged
+    elif isinstance(media, dict):
+        try:
+            return Media.from_dict(media)
+        except Exception as e:  # best-effort: an unparseable dict is logged, never raised
+            logger.debug(f"error parsing {media} : {e}")
+    return False  # reached for non-Media/non-dict input and for dicts that failed to parse
+
+
+def get_all_urls(result: Metadata) -> List[models.ArchiveUrl]:  # one ArchiveUrl per URL on each media and on media nested in its properties
+    db_urls = []
+    for m in result.media:
+        for i, url in enumerate(m.urls):  # i indexes URLs within this media item, not the media list
+            db_urls.append(
+                models.ArchiveUrl(url=url, key=m.get("id", f"media_{i}"))  # NOTE(review): fallback key repeats across media items lacking "id" - confirm uniqueness is not required
+            )
+        for k, prop in m.properties.items():
+            if prop_converted := convert_if_media(prop):  # property value is a Media, or a dict that parses as one
+                for i, url in enumerate(prop_converted.urls):
+                    db_urls.append(
+                        models.ArchiveUrl(
+                            url=url, key=prop_converted.get("id", f"{k}_{i}")  # fallback key: property name + URL index
+                        )
+                    )
+            if isinstance(prop, list):  # a property may also hold a list of Media/dict entries
+                for i, prop_media in enumerate(prop):
+                    if prop_media := convert_if_media(prop_media):  # rebinds the loop variable; entries that do not convert are skipped
+                        for j, url in enumerate(prop_media.urls):
+                            db_urls.append(
+                                models.ArchiveUrl(
+                                    url=url,
+                                    key=prop_media.get(
+                                        "id", f"{k}{prop_media.key}_{i}.{j}"  # i = position in the list, j = URL index within that entry
+                                    ),
+                                )
+                            )
+    return db_urls
--- a/app/tests/shared/utils/test_misc.py
+++ b/app/tests/shared/utils/test_misc.py
@@ -1,4 +1,4 @@
-from app.shared.utils.misc import fnv1a_hash_mod
+from app.shared.utils.misc import fnv1a_hash_mod, get_all_urls


 def test_fnv1a_hash_mod():
@@ -29,3 +29,32 @@ def test_fnv1a_hash_mod():

    # Test modulo = 1 edge case
    assert fnv1a_hash_mod("test", 1) == 0
+
+
+def test_get_all_urls(db_session):  # get_all_urls gathers URLs from media and from Media-valued properties
+    from auto_archiver.core import Media, Metadata
+
+    meta = Metadata().set_url("https://example.com")
+    m1 = meta.add_media(Media("fn1.txt", urls=["outcome1.com"]))
+    m2 = meta.add_media(Media("fn2.txt", urls=["outcome2.com"]))
+    m3 = meta.add_media(Media("fn3.txt", urls=["outcome3.com"]))
+    m1.set("screenshot", Media("screenshot.png", urls=["screenshot.com"]))  # property holding a single Media
+    m2.set(
+        "thumbnails",
+        [
+            Media("thumb1.png", urls=["thumb1.com"]),
+            Media("thumb2.png", urls=["thumb2.com"]),
+        ],
+    )  # property holding a list of Media
+    m3.set("ssl_data", Media("ssl_data.txt", urls=["ssl_data.com"]).to_dict())  # property holding the dict form of a Media
+    m3.set("bad_data", {"bad": "dict is ignored"})  # expected to contribute no URL
+
+    urls = [u.url for u in get_all_urls(meta)]
+    assert len(urls) == 7  # 3 media URLs + 1 screenshot + 2 thumbnails + 1 ssl_data
+    assert "outcome1.com" in urls
+    assert "outcome2.com" in urls
+    assert "outcome3.com" in urls
+    assert "screenshot.com" in urls
+    assert "thumb1.com" in urls
+    assert "thumb2.com" in urls
+    assert "ssl_data.com" in urls
--- a/app/tests/worker/test_worker_main.py
+++ b/app/tests/worker/test_worker_main.py
@@ -6,7 +6,6 @@ from auto_archiver.core import Media, Metadata

 from app.shared import constants, schemas
 from app.shared.db import models
-from app.web.utils.misc import get_all_urls
 from app.worker.main import create_archive_task, create_sheet_task


@@ -147,30 +146,3 @@ class TestCreateSheetTask:
        assert inserted.group_id == "interstellar"
        assert inserted.author_id == "rick@example.com"
        assert inserted.public is False
-
-
-def test_get_all_urls(db_session):
-    meta = Metadata().set_url("https://example.com")
-    m1 = meta.add_media(Media("fn1.txt", urls=["outcome1.com"]))
-    m2 = meta.add_media(Media("fn2.txt", urls=["outcome2.com"]))
-    m3 = meta.add_media(Media("fn3.txt", urls=["outcome3.com"]))
-    m1.set("screenshot", Media("screenshot.png", urls=["screenshot.com"]))
-    m2.set(
-        "thumbnails",
-        [
-            Media("thumb1.png", urls=["thumb1.com"]),
-            Media("thumb2.png", urls=["thumb2.com"]),
-        ],
-    )
-    m3.set("ssl_data", Media("ssl_data.txt", urls=["ssl_data.com"]).to_dict())
-    m3.set("bad_data", {"bad": "dict is ignored"})
-
-    urls = [u.url for u in get_all_urls(meta)]
-    assert len(urls) == 7
-    assert "outcome1.com" in urls
-    assert "outcome2.com" in urls
-    assert "outcome3.com" in urls
-    assert "screenshot.com" in urls
-    assert "thumb1.com" in urls
-    assert "thumb2.com" in urls
-    assert "ssl_data.com" in urls
--- a/app/web/routers/interoperability.py
+++ b/app/web/routers/interoperability.py
@@ -12,9 +12,9 @@ from app.shared import business_logic, schemas
 from app.shared.db import models, worker_crud
 from app.shared.db.database import get_db_dependency
 from app.shared.log import log_error
+from app.shared.utils.misc import get_all_urls
 from app.web.config import ALLOW_ANY_EMAIL
 from app.web.security import token_api_key_auth
-from app.web.utils.misc import get_all_urls


 router = APIRouter(prefix="/interop", tags=["Interoperability endpoints."])
--- a/app/web/utils/misc.py
+++ b/app/web/utils/misc.py
@@ -1,11 +1,6 @@
 import base64
-from typing import List

-from auto_archiver.core import Media, Metadata
 from fastapi.encoders import jsonable_encoder
-from loguru import logger
-
-from app.shared.db import models


 def custom_jsonable_encoder(obj):
@@ -19,44 +14,3 @@ def convert_priority_to_queue_dict(priority: str) -> dict:
        "priority": 0 if priority == "high" else 10,
        "queue": f"{priority}_priority",
    }
-
-
-def convert_if_media(media):
-    if isinstance(media, Media):
-        return media
-    elif isinstance(media, dict):
-        try:
-            return Media.from_dict(media)
-        except Exception as e:
-            logger.debug(f"error parsing {media} : {e}")
-    return False
-
-
-def get_all_urls(result: Metadata) -> List[models.ArchiveUrl]:
-    db_urls = []
-    for m in result.media:
-        for i, url in enumerate(m.urls):
-            db_urls.append(
-                models.ArchiveUrl(url=url, key=m.get("id", f"media_{i}"))
-            )
-        for k, prop in m.properties.items():
-            if prop_converted := convert_if_media(prop):
-                for i, url in enumerate(prop_converted.urls):
-                    db_urls.append(
-                        models.ArchiveUrl(
-                            url=url, key=prop_converted.get("id", f"{k}_{i}")
-                        )
-                    )
-            if isinstance(prop, list):
-                for i, prop_media in enumerate(prop):
-                    if prop_media := convert_if_media(prop_media):
-                        for j, url in enumerate(prop_media.urls):
-                            db_urls.append(
-                                models.ArchiveUrl(
-                                    url=url,
-                                    key=prop_media.get(
-                                        "id", f"{k}{prop_media.key}_{i}.{j}"
-                                    ),
-                                )
-                            )
-    return db_urls
--- a/app/worker/main.py
+++ b/app/worker/main.py
@@ -13,7 +13,7 @@ from app.shared.db.database import get_db
 from app.shared.log import log_error
 from app.shared.settings import get_settings
 from app.shared.task_messaging import get_celery, get_redis
-from app.web.utils.misc import get_all_urls
+from app.shared.utils.misc import get_all_urls
 from app.worker.worker_log import setup_celery_logger