diff --git a/.github/workflows/ruff.yaml b/.github/workflows/ruff.yaml index fc81582..48d5288 100644 --- a/.github/workflows/ruff.yaml +++ b/.github/workflows/ruff.yaml @@ -24,7 +24,7 @@ jobs: - name: Install Python uses: actions/setup-python@v6 with: - python-version: "3.11" + python-version: "3.12" - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/poetry.lock b/poetry.lock index ae7d05a..45b36fc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -755,23 +755,25 @@ files = [ [[package]] name = "curl-cffi" -version = "0.13.0" +version = "0.14.0" description = "libcurl ffi bindings for Python, with impersonation support." optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" groups = ["main"] markers = "implementation_name == \"cpython\"" files = [ - {file = "curl_cffi-0.13.0-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:434cadbe8df2f08b2fc2c16dff2779fb40b984af99c06aa700af898e185bb9db"}, - {file = "curl_cffi-0.13.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:59afa877a9ae09efa04646a7d068eeea48915a95d9add0a29854e7781679fcd7"}, - {file = "curl_cffi-0.13.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d06ed389e45a7ca97b17c275dbedd3d6524560270e675c720e93a2018a766076"}, - {file = "curl_cffi-0.13.0-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b4e0de45ab3b7a835c72bd53640c2347415111b43421b5c7a1a0b18deae2e541"}, - {file = "curl_cffi-0.13.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8eb4083371bbb94e9470d782de235fb5268bf43520de020c9e5e6be8f395443f"}, - {file = "curl_cffi-0.13.0-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:28911b526e8cd4aa0e5e38401bfe6887e8093907272f1f67ca22e6beb2933a51"}, - {file = "curl_cffi-0.13.0-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6d433ffcb455ab01dd0d7bde47109083aa38b59863aa183d29c668ae4c96bf8e"}, - {file = "curl_cffi-0.13.0-cp39-abi3-win_amd64.whl", hash = "sha256:66a6b75ce971de9af64f1b6812e275f60b88880577bac47ef1fa19694fa21cd3"}, - {file = "curl_cffi-0.13.0-cp39-abi3-win_arm64.whl", hash = "sha256:d438a3b45244e874794bc4081dc1e356d2bb926dcc7021e5a8fef2e2105ef1d8"}, - {file = "curl_cffi-0.13.0.tar.gz", hash = "sha256:62ecd90a382bd5023750e3606e0aa7cb1a3a8ba41c14270b8e5e149ebf72c5ca"}, + {file = "curl_cffi-0.14.0-cp39-abi3-macosx_14_0_arm64.whl", hash = "sha256:e35e89c6a69872f9749d6d5fda642ed4fc159619329e99d577d0104c9aad5893"}, + {file = "curl_cffi-0.14.0-cp39-abi3-macosx_15_0_x86_64.whl", hash = "sha256:5945478cd28ad7dfb5c54473bcfb6743ee1d66554d57951fdf8fc0e7d8cf4e45"}, + {file = "curl_cffi-0.14.0-cp39-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c42e8fa3c667db9ccd2e696ee47adcd3cd5b0838d7282f3fc45f6c0ef3cfdfa7"}, + {file = "curl_cffi-0.14.0-cp39-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:060fe2c99c41d3cb7f894de318ddf4b0301b08dca70453d769bd4e74b36b8483"}, + {file = "curl_cffi-0.14.0-cp39-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b158c41a25388690dd0d40b5bc38d1e0f512135f17fdb8029868cbc1993d2e5b"}, + {file = "curl_cffi-0.14.0-cp39-abi3-manylinux_2_28_i686.whl", hash = "sha256:1439fbef3500fb723333c826adf0efb0e2e5065a703fb5eccce637a2250db34a"}, + {file = "curl_cffi-0.14.0-cp39-abi3-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e7176f2c2d22b542e3cf261072a81deb018cfa7688930f95dddef215caddb469"}, + {file = "curl_cffi-0.14.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:03f21ade2d72978c2bb8670e9b6de5260e2755092b02d94b70b906813662998d"}, + {file = "curl_cffi-0.14.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:58ebf02de64ee5c95613209ddacb014c2d2f86298d7080c0a1c12ed876ee0690"}, + {file = "curl_cffi-0.14.0-cp39-abi3-win_amd64.whl", hash = "sha256:6e503f9a103f6ae7acfb3890c843b53ec030785a22ae7682a22cc43afb94123e"}, + {file = "curl_cffi-0.14.0-cp39-abi3-win_arm64.whl", hash = "sha256:2eed50a969201605c863c4c31269dfc3e0da52916086ac54553cfa353022425c"}, + {file = "curl_cffi-0.14.0.tar.gz", hash = "sha256:5ffbc82e59f05008ec08ea432f0e535418823cda44178ee518906a54f27a5f0f"}, ] [package.dependencies] @@ -780,9 +782,9 @@ cffi = ">=1.12.0" [package.extras] build = ["cibuildwheel", "wheel"] -dev = ["charset_normalizer (>=3.3.2,<4.0)", "coverage (>=6.4.1,<7.0)", "cryptography (>=42.0.5,<43.0)", "httpx (==0.23.1)", "mypy (>=1.9.0,<2.0)", "pytest (>=8.1.1,<9.0)", "pytest-asyncio (>=0.23.6,<1.0)", "pytest-trio (>=0.8.0,<1.0)", "ruff (>=0.3.5,<1.0)", "trio (>=0.25.0,<1.0)", "trustme (>=1.1.0,<2.0)", "typing_extensions", "uvicorn (>=0.29.0,<1.0)", "websockets (>=12.0,<13.0)"] +dev = ["charset_normalizer (>=3.3.2,<4.0)", "coverage (>=6.4.1,<7.0)", "cryptography (>=42.0.5,<43.0)", "httpx (==0.23.1)", "mypy (>=1.9.0,<2.0)", "pytest (>=8.1.1,<9.0)", "pytest-asyncio (>=0.23.6,<1.0)", "pytest-trio (>=0.8.0,<1.0)", "ruff (>=0.3.5,<1.0)", "trio (>=0.25.0,<1.0)", "trustme (>=1.1.0,<2.0)", "typing_extensions", "uvicorn (>=0.29.0,<1.0)", "websockets (>=14.0)"] extra = ["lxml_html_clean", "markdownify (>=1.1.0)", "readability-lxml (>=0.8.1)"] -test = ["charset_normalizer (>=3.3.2,<4.0)", "cryptography (>=42.0.5,<43.0)", "fastapi (==0.110.0)", "httpx (==0.23.1)", "proxy.py (>=2.4.3,<3.0)", "pytest (>=8.1.1,<9.0)", "pytest-asyncio (>=0.23.6,<1.0)", "pytest-trio (>=0.8.0,<1.0)", "python-multipart (>=0.0.9,<1.0)", "trio (>=0.25.0,<1.0)", "trustme (>=1.1.0,<2.0)", "typing_extensions", "uvicorn (>=0.29.0,<1.0)", "websockets (>=12.0,<13.0)"] +test = ["charset_normalizer (>=3.3.2,<4.0)", "cryptography (>=42.0.5,<43.0)", "fastapi (>=0.110.0,<1.0)", "httpx (==0.23.1)", "proxy.py (>=2.4.3,<3.0)", "pytest (>=8.1.1,<9.0)", "pytest-asyncio (>=0.23.6,<1.0)", "pytest-trio (>=0.8.0,<1.0)", "python-multipart (>=0.0.9,<1.0)", "trio (>=0.25.0,<1.0)", "trustme (>=1.1.0,<2.0)", "typing_extensions", "uvicorn (>=0.29.0,<1.0)", "websockets (>=14.0)"] [[package]] name = "dataclasses-json" @@ -3108,30 +3110,30 @@ files = [ [[package]] name = "ruff" -version = "0.9.10" +version = "0.15.2" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" groups = ["dev"] files = [ - {file = "ruff-0.9.10-py3-none-linux_armv6l.whl", hash = "sha256:eb4d25532cfd9fe461acc83498361ec2e2252795b4f40b17e80692814329e42d"}, - {file = "ruff-0.9.10-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:188a6638dab1aa9bb6228a7302387b2c9954e455fb25d6b4470cb0641d16759d"}, - {file = "ruff-0.9.10-py3-none-macosx_11_0_arm64.whl", hash = "sha256:5284dcac6b9dbc2fcb71fdfc26a217b2ca4ede6ccd57476f52a587451ebe450d"}, - {file = "ruff-0.9.10-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:47678f39fa2a3da62724851107f438c8229a3470f533894b5568a39b40029c0c"}, - {file = "ruff-0.9.10-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:99713a6e2766b7a17147b309e8c915b32b07a25c9efd12ada79f217c9c778b3e"}, - {file = "ruff-0.9.10-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:524ee184d92f7c7304aa568e2db20f50c32d1d0caa235d8ddf10497566ea1a12"}, - {file = "ruff-0.9.10-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:df92aeac30af821f9acf819fc01b4afc3dfb829d2782884f8739fb52a8119a16"}, - {file = "ruff-0.9.10-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de42e4edc296f520bb84954eb992a07a0ec5a02fecb834498415908469854a52"}, - {file = "ruff-0.9.10-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d257f95b65806104b6b1ffca0ea53f4ef98454036df65b1eda3693534813ecd1"}, - {file = "ruff-0.9.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b60dec7201c0b10d6d11be00e8f2dbb6f40ef1828ee75ed739923799513db24c"}, - {file = "ruff-0.9.10-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:d838b60007da7a39c046fcdd317293d10b845001f38bcb55ba766c3875b01e43"}, - {file = "ruff-0.9.10-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:ccaf903108b899beb8e09a63ffae5869057ab649c1e9231c05ae354ebc62066c"}, - {file = "ruff-0.9.10-py3-none-musllinux_1_2_i686.whl", hash = "sha256:f9567d135265d46e59d62dc60c0bfad10e9a6822e231f5b24032dba5a55be6b5"}, - {file = "ruff-0.9.10-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5f202f0d93738c28a89f8ed9eaba01b7be339e5d8d642c994347eaa81c6d75b8"}, - {file = "ruff-0.9.10-py3-none-win32.whl", hash = "sha256:bfb834e87c916521ce46b1788fbb8484966e5113c02df216680102e9eb960029"}, - {file = "ruff-0.9.10-py3-none-win_amd64.whl", hash = "sha256:f2160eeef3031bf4b17df74e307d4c5fb689a6f3a26a2de3f7ef4044e3c484f1"}, - {file = "ruff-0.9.10-py3-none-win_arm64.whl", hash = "sha256:5fd804c0327a5e5ea26615550e706942f348b197d5475ff34c19733aee4b2e69"}, - {file = "ruff-0.9.10.tar.gz", hash = "sha256:9bacb735d7bada9cfb0f2c227d3658fc443d90a727b47f206fb33f52f3c0eac7"}, + {file = "ruff-0.15.2-py3-none-linux_armv6l.whl", hash = "sha256:120691a6fdae2f16d65435648160f5b81a9625288f75544dc40637436b5d3c0d"}, + {file = "ruff-0.15.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:a89056d831256099658b6bba4037ac6dd06f49d194199215befe2bb10457ea5e"}, + {file = "ruff-0.15.2-py3-none-macosx_11_0_arm64.whl", hash = "sha256:e36dee3a64be0ebd23c86ffa3aa3fd3ac9a712ff295e192243f814a830b6bd87"}, + {file = "ruff-0.15.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9fb47b6d9764677f8c0a193c0943ce9a05d6763523f132325af8a858eadc2b9"}, + {file = "ruff-0.15.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f376990f9d0d6442ea9014b19621d8f2aaf2b8e39fdbfc79220b7f0c596c9b80"}, + {file = "ruff-0.15.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2dcc987551952d73cbf5c88d9fdee815618d497e4df86cd4c4824cc59d5dd75f"}, + {file = "ruff-0.15.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:42a47fd785cbe8c01b9ff45031af875d101b040ad8f4de7bbb716487c74c9a77"}, + {file = "ruff-0.15.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cbe9f49354866e575b4c6943856989f966421870e85cd2ac94dccb0a9dcb2fea"}, + {file = "ruff-0.15.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7a672c82b5f9887576087d97be5ce439f04bbaf548ee987b92d3a7dede41d3a"}, + {file = "ruff-0.15.2-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:72ecc64f46f7019e2bcc3cdc05d4a7da958b629a5ab7033195e11a438403d956"}, + {file = "ruff-0.15.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:8dcf243b15b561c655c1ef2f2b0050e5d50db37fe90115507f6ff37d865dc8b4"}, + {file = "ruff-0.15.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:dab6941c862c05739774677c6273166d2510d254dac0695c0e3f5efa1b5585de"}, + {file = "ruff-0.15.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:1b9164f57fc36058e9a6806eb92af185b0697c9fe4c7c52caa431c6554521e5c"}, + {file = "ruff-0.15.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:80d24fcae24d42659db7e335b9e1531697a7102c19185b8dc4a028b952865fd8"}, + {file = "ruff-0.15.2-py3-none-win32.whl", hash = "sha256:fd5ff9e5f519a7e1bd99cbe8daa324010a74f5e2ebc97c6242c08f26f3714f6f"}, + {file = "ruff-0.15.2-py3-none-win_amd64.whl", hash = "sha256:d20014e3dfa400f3ff84830dfb5755ece2de45ab62ecea4af6b7262d0fb4f7c5"}, + {file = "ruff-0.15.2-py3-none-win_arm64.whl", hash = "sha256:cabddc5822acdc8f7b5527b36ceac55cc51eec7b1946e60181de8fe83ca8876e"}, + {file = "ruff-0.15.2.tar.gz", hash = "sha256:14b965afee0969e68bb871eba625343b8673375f457af4abe98553e8bbb98342"}, ] [[package]] @@ -4313,4 +4315,4 @@ files = [ [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "e4fd9a44b9541f610e862ba51410092473a587a097c7a5a3e3b197ee6b30e4d4" +content-hash = "fd273f9a20a34e64849a61cb654a7eb75dfa9499375eb8efbdcc953f9a9f6f95" diff --git a/pyproject.toml b/pyproject.toml index 1dc21cb..4b2c58a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [project] name = "auto-archiver" -version = "1.2.1" +version = "1.2.2" description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)." requires-python = ">=3.10,<3.13" @@ -66,7 +66,7 @@ pytest = "^8.3.4" autopep8 = "^2.3.1" pytest-loguru = "^0.4.0" pytest-mock = "^3.14.0" -ruff = "^0.9.10" +ruff = "^0.15.2" pre-commit = "^4.1.0" [tool.poetry.group.docs.dependencies] diff --git a/src/auto_archiver/modules/generic_extractor/facebook.py b/src/auto_archiver/modules/generic_extractor/facebook.py index 5b264c6..8213638 100644 --- a/src/auto_archiver/modules/generic_extractor/facebook.py +++ b/src/auto_archiver/modules/generic_extractor/facebook.py @@ -34,7 +34,7 @@ def _extract_metadata(self, webpage, video_id): ..., "attachments", ..., - lambda k, v: (k == "media" and str(v["id"]) == video_id and v["__typename"] == "Video"), + lambda k, v: k == "media" and str(v["id"]) == video_id and v["__typename"] == "Video", ), expected_type=dict, ) diff --git a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py index 5ba2112..358529b 100644 --- a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py +++ b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py @@ -24,8 +24,7 @@ class WaczExtractorEnricher(Enricher, Extractor): self.use_docker = os.environ.get("WACZ_ENABLE_DOCKER") or not os.environ.get("RUNNING_IN_DOCKER") self.docker_in_docker = os.environ.get("WACZ_ENABLE_DOCKER") and os.environ.get("RUNNING_IN_DOCKER") - self.crawl_id = random_str(8) - self.cwd_dind = f"/crawls/crawls{self.crawl_id}" + self.cwd_dind = f"/crawls/crawls{random_str(8)}" self.browsertrix_home_host = os.environ.get("BROWSERTRIX_HOME_HOST") self.browsertrix_home_container = os.environ.get("BROWSERTRIX_HOME_CONTAINER") or self.browsertrix_home_host # create crawls folder if not exists, so it can be safely removed in cleanup @@ -51,7 +50,8 @@ class WaczExtractorEnricher(Enricher, Extractor): url = to_enrich.get_url() - collection = self.crawl_id + crawl_id = random_str(8) + collection = crawl_id browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir) browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host @@ -83,8 +83,10 @@ class WaczExtractorEnricher(Enricher, Extractor): # "--blockAds" # note: this has been known to cause issues on cloudflare protected sites ] + crawl_cwd_dind = os.path.join(self.cwd_dind, crawl_id) if self.docker_in_docker: - cmd.extend(["--cwd", self.cwd_dind]) + os.makedirs(crawl_cwd_dind, exist_ok=True) + cmd.extend(["--cwd", crawl_cwd_dind]) if self.auth_for_site(url): # there's an auth for this site, but browsertrix only supports username/password auth @@ -109,7 +111,7 @@ class WaczExtractorEnricher(Enricher, Extractor): ] + cmd if self.profile: - profile_file = f"profile-{self.crawl_id}.tar.gz" + profile_file = f"profile-{crawl_id}.tar.gz" profile_fn = os.path.join(browsertrix_home_container, profile_file) logger.debug(f"Copying {self.profile} to {profile_fn}") shutil.copyfile(self.profile, profile_fn) @@ -137,7 +139,7 @@ class WaczExtractorEnricher(Enricher, Extractor): return False if self.docker_in_docker: - wacz_fn = os.path.join(self.cwd_dind, "collections", collection, f"{collection}.wacz") + wacz_fn = os.path.join(crawl_cwd_dind, "collections", collection, f"{collection}.wacz") elif self.use_docker: wacz_fn = os.path.join(browsertrix_home_container, "collections", collection, f"{collection}.wacz") else: @@ -152,7 +154,7 @@ class WaczExtractorEnricher(Enricher, Extractor): self.extract_media_from_wacz(to_enrich, wacz_fn) if self.docker_in_docker: - jsonl_fn = os.path.join(self.cwd_dind, "collections", collection, "pages", "pages.jsonl") + jsonl_fn = os.path.join(crawl_cwd_dind, "collections", collection, "pages", "pages.jsonl") elif self.use_docker: jsonl_fn = os.path.join(browsertrix_home_container, "collections", collection, "pages", "pages.jsonl") else: diff --git a/tests/core/__init__.py b/tests/core/__init__.py new file mode 100644 index 0000000..8605732 --- /dev/null +++ b/tests/core/__init__.py @@ -0,0 +1 @@ +# Core module tests diff --git a/tests/core/test_media.py b/tests/core/test_media.py new file mode 100644 index 0000000..cce2625 --- /dev/null +++ b/tests/core/test_media.py @@ -0,0 +1,198 @@ +""" +Tests for the Media class from auto_archiver.core.media +""" + +import pytest +from unittest.mock import Mock, patch +from auto_archiver.core.media import Media + + +class TestMediaBasics: + """Test basic Media properties and methods.""" + + def test_media_creation_with_filename(self): + media = Media(filename="test.mp4") + assert media.filename == "test.mp4" + assert media.urls == [] + assert media.properties == {} + + def test_media_key_property(self): + media = Media(filename="test.mp4", _key="my_key") + assert media.key == "my_key" + + def test_media_set_get_properties(self): + media = Media(filename="test.mp4") + result = media.set("author", "John Doe") + assert result is media # returns self for chaining + assert media.get("author") == "John Doe" + assert media.get("nonexistent") is None + assert media.get("nonexistent", "default") == "default" + + def test_media_add_url(self): + media = Media(filename="test.mp4") + media.add_url("https://example.com/test.mp4") + assert "https://example.com/test.mp4" in media.urls + media.add_url("https://cdn.example.com/test.mp4") + assert len(media.urls) == 2 + + +class TestMediaMimetype: + """Test mimetype detection and handling.""" + + @pytest.mark.parametrize( + "filename,expected_mimetype", + [ + ("video.mp4", "video/mp4"), + ("image.jpg", "image/jpeg"), + ("image.png", "image/png"), + ("audio.mp3", "audio/mpeg"), + ("document.pdf", "application/pdf"), + ("text.txt", "text/plain"), + ], + ) + def test_mimetype_detection(self, filename, expected_mimetype): + media = Media(filename=filename) + assert media.mimetype == expected_mimetype + + def test_mimetype_setter(self): + media = Media(filename="file.unknown") + media.mimetype = "custom/type" + assert media.mimetype == "custom/type" + + def test_mimetype_empty_filename(self): + media = Media(filename="") + assert media.mimetype == "" + + +class TestMediaTypeChecks: + """Test media type checking methods.""" + + @pytest.mark.parametrize( + "filename,is_video,is_audio,is_image", + [ + ("video.mp4", True, False, False), + ("video.avi", True, False, False), + ("audio.mp3", False, True, False), + ("audio.wav", False, True, False), + ("image.jpg", False, False, True), + ("image.png", False, False, True), + ("document.pdf", False, False, False), + ], + ) + def test_type_checks(self, filename, is_video, is_audio, is_image): + media = Media(filename=filename) + assert media.is_video() == is_video + assert media.is_audio() == is_audio + assert media.is_image() == is_image + + +class TestMediaStore: + """Test media storage functionality.""" + + def test_store_with_no_storages(self, caplog): + media = Media(filename="test.mp4") + metadata = Mock() + media.store(metadata, storages=[]) + assert "No storages found" in caplog.text + + def test_store_with_storage(self): + media = Media(filename="test.mp4") + metadata = Mock() + mock_storage = Mock() + media.store(metadata, url="https://example.com", storages=[mock_storage]) + mock_storage.store.assert_called_once() + + +class TestMediaInnerMedia: + """Test nested media retrieval.""" + + def test_all_inner_media_no_nested(self): + media = Media(filename="test.mp4") + inner = list(media.all_inner_media(include_self=False)) + assert len(inner) == 0 + + inner_with_self = list(media.all_inner_media(include_self=True)) + assert len(inner_with_self) == 1 + assert inner_with_self[0] is media + + def test_all_inner_media_with_nested(self): + parent = Media(filename="parent.mp4") + child = Media(filename="child.jpg") + grandchild = Media(filename="grandchild.png") + + child.set("thumbnail", grandchild) + parent.set("preview", child) + + inner = list(parent.all_inner_media(include_self=False)) + assert len(inner) == 2 + assert child in inner + assert grandchild in inner + + def test_all_inner_media_with_list_property(self): + parent = Media(filename="parent.mp4") + child1 = Media(filename="frame1.jpg") + child2 = Media(filename="frame2.jpg") + + parent.set("frames", [child1, child2]) + + inner = list(parent.all_inner_media(include_self=False)) + assert len(inner) == 2 + assert child1 in inner + assert child2 in inner + + +class TestMediaIsStored: + """Test the is_stored method.""" + + def test_is_stored_no_urls(self): + media = Media(filename="test.mp4") + storage = Mock() + storage.config = {"steps": {"storages": ["s3", "local"]}} + assert media.is_stored(storage) is False + + def test_is_stored_partial_urls(self): + media = Media(filename="test.mp4") + media.add_url("https://s3.example.com/test.mp4") + storage = Mock() + storage.config = {"steps": {"storages": ["s3", "local"]}} + assert media.is_stored(storage) is False + + def test_is_stored_full_urls(self): + media = Media(filename="test.mp4") + media.add_url("https://s3.example.com/test.mp4") + media.add_url("file:///local/test.mp4") + storage = Mock() + storage.config = {"steps": {"storages": ["s3", "local"]}} + assert media.is_stored(storage) is True + + +class TestMediaValidVideo: + """Test video validation functionality.""" + + def test_is_valid_video_with_valid_probe(self): + media = Media(filename="test.mp4") + + mock_streams = {"streams": [{"duration_ts": 1000}]} + + with patch("ffmpeg.probe", return_value=mock_streams): + assert media.is_valid_video() is True + + def test_is_valid_video_with_no_duration(self): + media = Media(filename="test.mp4") + + mock_streams = {"streams": [{"duration_ts": 0}]} + + with patch("ffmpeg.probe", return_value=mock_streams): + assert media.is_valid_video() is False + + def test_is_valid_video_with_ffmpeg_error(self): + media = Media(filename="test.mp4") + + with patch("ffmpeg.probe", side_effect=Exception("ffmpeg error")): + with patch("os.path.getsize", return_value=100): + # Falls back to file size check, small file + assert media.is_valid_video() is False + + with patch("os.path.getsize", return_value=30000): + # Falls back to file size check, larger file + assert media.is_valid_video() is True diff --git a/tests/core/test_validators.py b/tests/core/test_validators.py new file mode 100644 index 0000000..ab8fe0c --- /dev/null +++ b/tests/core/test_validators.py @@ -0,0 +1,98 @@ +""" +Tests for validators module from auto_archiver.core.validators +""" + +import argparse +import json +import pytest + +from auto_archiver.core.validators import positive_number, valid_file, json_loader + + +class TestPositiveNumber: + """Test the positive_number validator.""" + + @pytest.mark.parametrize( + "value,expected", + [ + (0, 0), + (1, 1), + (100, 100), + (0.5, 0.5), + (999999, 999999), + ], + ) + def test_positive_values(self, value, expected): + assert positive_number(value) == expected + + @pytest.mark.parametrize( + "value", + [ + -1, + -100, + -0.5, + -999999, + ], + ) + def test_negative_values_raise_error(self, value): + with pytest.raises(argparse.ArgumentTypeError) as exc_info: + positive_number(value) + assert "not a positive number" in str(exc_info.value) + + +class TestValidFile: + """Test the valid_file validator.""" + + def test_valid_file_exists(self, tmp_path): + test_file = tmp_path / "test.txt" + test_file.write_text("test content") + result = valid_file(str(test_file)) + assert result == str(test_file) + + def test_valid_file_not_exists(self): + with pytest.raises(argparse.ArgumentTypeError) as exc_info: + valid_file("/nonexistent/path/to/file.txt") + assert "does not exist" in str(exc_info.value) + + def test_valid_file_directory_not_file(self, tmp_path): + # A directory is not a file + with pytest.raises(argparse.ArgumentTypeError) as exc_info: + valid_file(str(tmp_path)) + assert "does not exist" in str(exc_info.value) + + +class TestJsonLoader: + """Test the json_loader validator.""" + + @pytest.mark.parametrize( + "json_str,expected", + [ + ('{"key": "value"}', {"key": "value"}), + ('{"number": 123}', {"number": 123}), + ('{"list": [1, 2, 3]}', {"list": [1, 2, 3]}), + ('{"nested": {"inner": "value"}}', {"nested": {"inner": "value"}}), + ("[]", []), + ("[1, 2, 3]", [1, 2, 3]), + ('"string"', "string"), + ("123", 123), + ("true", True), + ("false", False), + ("null", None), + ], + ) + def test_valid_json(self, json_str, expected): + assert json_loader(json_str) == expected + + @pytest.mark.parametrize( + "invalid_json", + [ + "{invalid}", + "{'single': 'quotes'}", + "{missing: quotes}", + '{"unclosed": "brace"', + "", + ], + ) + def test_invalid_json_raises_error(self, invalid_json): + with pytest.raises(json.JSONDecodeError): + json_loader(invalid_json) diff --git a/tests/databases/test_console_db.py b/tests/databases/test_console_db.py new file mode 100644 index 0000000..e6448e8 --- /dev/null +++ b/tests/databases/test_console_db.py @@ -0,0 +1,62 @@ +""" +Tests for the ConsoleDb module +""" + +import pytest + + +@pytest.fixture +def console_db(setup_module): + return setup_module("console_db") + + +class TestConsoleDb: + """Test the ConsoleDb functionality.""" + + def test_started_logs_info(self, console_db, make_item, caplog): + """Test that started() logs an info message.""" + item = make_item("https://example.com/test") + + with caplog.at_level("INFO"): + console_db.started(item) + + assert "STARTED" in caplog.text + assert "example.com" in caplog.text + + def test_failed_logs_error(self, console_db, make_item, caplog): + """Test that failed() logs an error message with reason.""" + item = make_item("https://example.com/test") + reason = "Connection timeout" + + with caplog.at_level("ERROR"): + console_db.failed(item, reason) + + assert "FAILED" in caplog.text + assert "Connection timeout" in caplog.text + + def test_aborted_logs_warning(self, console_db, make_item, caplog): + """Test that aborted() logs a warning message.""" + item = make_item("https://example.com/test") + + with caplog.at_level("WARNING"): + console_db.aborted(item) + + assert "ABORTED" in caplog.text + + def test_done_logs_success(self, console_db, make_item, caplog): + """Test that done() logs a success message.""" + item = make_item("https://example.com/test") + + with caplog.at_level("INFO"): + console_db.done(item) + + assert "DONE" in caplog.text + + def test_done_cached(self, console_db, make_item, caplog): + """Test done() with cached=True (should behave the same).""" + item = make_item("https://example.com/test") + + with caplog.at_level("INFO"): + console_db.done(item, cached=True) + + assert "DONE" in caplog.text diff --git a/tests/enrichers/test_json_enricher.py b/tests/enrichers/test_json_enricher.py new file mode 100644 index 0000000..2f9e811 --- /dev/null +++ b/tests/enrichers/test_json_enricher.py @@ -0,0 +1,72 @@ +""" +Tests for the JsonEnricher module +""" + +import json +import os +import pytest + + +@pytest.fixture +def json_enricher(setup_module): + return setup_module("json_enricher") + + +class TestJsonEnricher: + """Test the JsonEnricher functionality.""" + + def test_enrich_creates_json_file(self, json_enricher, make_item): + """Test that enrich creates a metadata.json file.""" + item = make_item("https://example.com/test") + item.set("title", "Test Title") + item.set("description", "Test description") + + json_enricher.enrich(item) + + # Check that a media with id 'metadata_json' was added + json_media = item.get_media_by_id("metadata_json") + assert json_media is not None + assert json_media.filename.endswith("metadata.json") + assert os.path.exists(json_media.filename) + + def test_enrich_json_content(self, json_enricher, make_item): + """Test that the JSON content is correct.""" + item = make_item("https://example.com/test") + item.set("title", "Test Title") + item.set("custom_field", "custom_value") + + json_enricher.enrich(item) + + json_media = item.get_media_by_id("metadata_json") + with open(json_media.filename, "r", encoding="utf-8") as f: + content = json.load(f) + + # The to_dict() returns nested structure: {status, metadata: {...}, media: [...]} + assert content["metadata"]["title"] == "Test Title" + assert content["metadata"]["custom_field"] == "custom_value" + assert content["metadata"]["url"] == "https://example.com/test" + + def test_enrich_handles_special_characters(self, json_enricher, make_item): + """Test that special characters are handled correctly.""" + item = make_item("https://example.com/test") + item.set("title", "Test with émojis 🎉 and üñíçödé") + + json_enricher.enrich(item) + + json_media = item.get_media_by_id("metadata_json") + with open(json_media.filename, "r", encoding="utf-8") as f: + content = json.load(f) + + # Access the nested metadata structure + assert "émojis 🎉" in content["metadata"]["title"] + assert "üñíçödé" in content["metadata"]["title"] + + def test_enrich_empty_metadata(self, json_enricher, make_item): + """Test enriching metadata with minimal content.""" + item = make_item("https://example.com/minimal") + + json_enricher.enrich(item) + + json_media = item.get_media_by_id("metadata_json") + assert json_media is not None + assert os.path.exists(json_media.filename) diff --git a/tests/feeders/test_cli_feeder.py b/tests/feeders/test_cli_feeder.py new file mode 100644 index 0000000..2996f2f --- /dev/null +++ b/tests/feeders/test_cli_feeder.py @@ -0,0 +1,70 @@ +""" +Tests for the CLIFeeder module +""" + +import pytest + +from auto_archiver.modules.cli_feeder.cli_feeder import CLIFeeder +from auto_archiver.core.consts import SetupError +from auto_archiver.core.metadata import Metadata + + +@pytest.fixture +def cli_feeder_instance(): + """Create a CLIFeeder instance with mocked config.""" + + def _create(urls): + feeder = CLIFeeder() + # Mock the config structure that cli_feeder expects + feeder.config = {"urls": urls} + feeder.name = "cli_feeder" + feeder.tmp_dir = "/tmp" + return feeder + + return _create + + +class TestCLIFeeder: + """Test the CLIFeeder functionality.""" + + def test_iter_yields_metadata_for_urls(self, cli_feeder_instance): + """Test that iteration yields Metadata objects for each URL.""" + urls = ["https://example.com/1", "https://example.com/2", "https://example.com/3"] + feeder = cli_feeder_instance(urls) + feeder.setup() + + items = list(feeder) + + assert len(items) == 3 + assert all(isinstance(item, Metadata) for item in items) + assert items[0].get_url() == "https://example.com/1" + assert items[1].get_url() == "https://example.com/2" + assert items[2].get_url() == "https://example.com/3" + + def test_iter_single_url(self, cli_feeder_instance): + """Test iteration with a single URL.""" + feeder = cli_feeder_instance(["https://example.com/single"]) + feeder.setup() + + items = list(feeder) + + assert len(items) == 1 + assert items[0].get_url() == "https://example.com/single" + + def test_setup_raises_without_urls(self, cli_feeder_instance): + """Test that setup raises SetupError when no URLs provided.""" + feeder = cli_feeder_instance([]) + + with pytest.raises(SetupError) as exc_info: + feeder.setup() + + assert "No URLs provided" in str(exc_info.value) + + def test_setup_raises_with_none_urls(self, cli_feeder_instance): + """Test that setup raises SetupError when urls is None.""" + feeder = cli_feeder_instance(None) + + with pytest.raises(SetupError) as exc_info: + feeder.setup() + + assert "No URLs provided" in str(exc_info.value) diff --git a/tests/formatters/test_mute_formatter.py b/tests/formatters/test_mute_formatter.py new file mode 100644 index 0000000..b9e79db --- /dev/null +++ b/tests/formatters/test_mute_formatter.py @@ -0,0 +1,43 @@ +""" +Tests for the MuteFormatter module +""" + +import pytest +from auto_archiver.core.metadata import Metadata + + +@pytest.fixture +def mute_formatter(setup_module): + return setup_module("mute_formatter") + + +class TestMuteFormatter: + """Test the MuteFormatter functionality.""" + + def test_format_returns_none(self, mute_formatter, make_item): + """Test that format always returns None (mutes output).""" + item = make_item("https://example.com/test") + item.set("title", "Test Title") + + result = mute_formatter.format(item) + + assert result is None + + def test_format_with_empty_metadata(self, mute_formatter): + """Test format with empty metadata.""" + item = Metadata().set_url("https://example.com/empty") + + result = mute_formatter.format(item) + + assert result is None + + def test_format_with_media(self, mute_formatter, make_item): + """Test that format still returns None even with media attached.""" + from auto_archiver.core.media import Media + + item = make_item("https://example.com/with-media") + item.add_media(Media(filename="test.mp4")) + + result = mute_formatter.format(item) + + assert result is None