From d9d936c2cae621a3576968268ccf665bc3d31a8d Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Wed, 12 Feb 2025 12:22:27 +0000 Subject: [PATCH 01/15] Thumbnail enricher fix seconds to minutes. --- poetry.lock | 113 ++++++------- .../thumbnail_enricher/__manifest__.py | 8 +- .../thumbnail_enricher/thumbnail_enricher.py | 2 +- tests/enrichers/test_thumbnail_enricher.py | 155 ++++++++++++++++++ 4 files changed, 219 insertions(+), 59 deletions(-) create mode 100644 tests/enrichers/test_thumbnail_enricher.py diff --git a/poetry.lock b/poetry.lock index 8fb48ec..6bfa62c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -84,14 +84,14 @@ tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] [[package]] name = "authlib" -version = "1.4.0" +version = "1.4.1" description = "The ultimate Python library in building OAuth and OpenID Connect servers and clients." optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "Authlib-1.4.0-py2.py3-none-any.whl", hash = "sha256:4bb20b978c8b636222b549317c1815e1fe62234fc1c5efe8855d84aebf3a74e3"}, - {file = "authlib-1.4.0.tar.gz", hash = "sha256:1c1e6608b5ed3624aeeee136ca7f8c120d6f51f731aa152b153d54741840e1f2"}, + {file = "Authlib-1.4.1-py2.py3-none-any.whl", hash = "sha256:edc29c3f6a3e72cd9e9f45fff67fc663a2c364022eb0371c003f22d5405915c1"}, + {file = "authlib-1.4.1.tar.gz", hash = "sha256:30ead9ea4993cdbab821dc6e01e818362f92da290c04c7f6a1940f86507a790d"}, ] [package.dependencies] @@ -115,33 +115,34 @@ tomli = {version = "*", markers = "python_version < \"3.11\""} [[package]] name = "babel" -version = "2.16.0" +version = "2.17.0" description = "Internationalization utilities" optional = false python-versions = ">=3.8" groups = ["docs"] files = [ - {file = "babel-2.16.0-py3-none-any.whl", hash = "sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b"}, - {file = "babel-2.16.0.tar.gz", hash = "sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316"}, + {file = "babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2"}, + {file = "babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d"}, ] [package.extras] -dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"] +dev = ["backports.zoneinfo", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata"] [[package]] name = "beautifulsoup4" -version = "4.12.3" +version = "4.13.3" description = "Screen-scraping library" optional = false -python-versions = ">=3.6.0" +python-versions = ">=3.7.0" groups = ["main", "docs"] files = [ - {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, - {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, + {file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"}, + {file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"}, ] [package.dependencies] soupsieve = ">1.2" +typing-extensions = ">=4.0.0" [package.extras] cchardet = ["cchardet"] @@ -152,18 +153,18 @@ lxml = ["lxml"] [[package]] name = "boto3" -version = "1.36.6" +version = "1.36.17" description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "boto3-1.36.6-py3-none-any.whl", hash = "sha256:6d473f0f340d02b4e9ad5b8e68786a09728101a8b950231b89ebdaf72b6dca21"}, - {file = "boto3-1.36.6.tar.gz", hash = "sha256:b36feae061dc0793cf311468956a0a9e99215ce38bc99a1a4e55a5b105f16297"}, + {file = "boto3-1.36.17-py3-none-any.whl", hash = "sha256:59bcf0c4b04d9cc36f8b418ad17ab3c4a99a21a175d2fad7096aa21cbe84630b"}, + {file = "boto3-1.36.17.tar.gz", hash = "sha256:5ecae20e780a3ce9afb3add532b61c466a8cb8960618e4fa565b3883064c1346"}, ] [package.dependencies] -botocore = ">=1.36.6,<1.37.0" +botocore = ">=1.36.17,<1.37.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.11.0,<0.12.0" @@ -172,14 +173,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.36.6" +version = "1.36.17" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "botocore-1.36.6-py3-none-any.whl", hash = "sha256:f77bbbb03fb420e260174650fb5c0cc142ec20a96967734eed2b0ef24334ef34"}, - {file = "botocore-1.36.6.tar.gz", hash = "sha256:4864c53d638da191a34daf3ede3ff1371a3719d952cc0c6bd24ce2836a38dd77"}, + {file = "botocore-1.36.17-py3-none-any.whl", hash = "sha256:069858b2fd693548035d7fd53a774e37e4260fea64e0ac9b8a3aee904f9321df"}, + {file = "botocore-1.36.17.tar.gz", hash = "sha256:cec13e0a7ce78e71aad0b397581b4e81824c7981ef4c261d2e296d200c399b09"}, ] [package.dependencies] @@ -188,7 +189,7 @@ python-dateutil = ">=2.1,<3.0.0" urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""} [package.extras] -crt = ["awscrt (==0.23.4)"] +crt = ["awscrt (==0.23.8)"] [[package]] name = "brotli" @@ -355,14 +356,14 @@ files = [ [[package]] name = "certifi" -version = "2024.12.14" +version = "2025.1.31" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" groups = ["main", "docs"] files = [ - {file = "certifi-2024.12.14-py3-none-any.whl", hash = "sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56"}, - {file = "certifi-2024.12.14.tar.gz", hash = "sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db"}, + {file = "certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe"}, + {file = "certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651"}, ] [[package]] @@ -655,26 +656,26 @@ typing-inspect = ">=0.4.0,<1" [[package]] name = "dateparser" -version = "1.2.0" +version = "1.2.1" description = "Date parsing library designed to parse dates from HTML pages" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" groups = ["main"] files = [ - {file = "dateparser-1.2.0-py2.py3-none-any.whl", hash = "sha256:0b21ad96534e562920a0083e97fd45fa959882d4162acc358705144520a35830"}, - {file = "dateparser-1.2.0.tar.gz", hash = "sha256:7975b43a4222283e0ae15be7b4999d08c9a70e2d378ac87385b1ccf2cffbbb30"}, + {file = "dateparser-1.2.1-py3-none-any.whl", hash = "sha256:bdcac262a467e6260030040748ad7c10d6bacd4f3b9cdb4cfd2251939174508c"}, + {file = "dateparser-1.2.1.tar.gz", hash = "sha256:7e4919aeb48481dbfc01ac9683c8e20bfe95bb715a38c1e9f6af889f4f30ccc3"}, ] [package.dependencies] -python-dateutil = "*" -pytz = "*" -regex = "<2019.02.19 || >2019.02.19,<2021.8.27 || >2021.8.27" -tzlocal = "*" +python-dateutil = ">=2.7.0" +pytz = ">=2024.2" +regex = ">=2015.06.24,<2019.02.19 || >2019.02.19,<2021.8.27 || >2021.8.27" +tzlocal = ">=0.2" [package.extras] -calendars = ["convertdate", "hijri-converter"] -fasttext = ["fasttext"] -langdetect = ["langdetect"] +calendars = ["convertdate (>=2.2.1)", "hijridate"] +fasttext = ["fasttext (>=0.9.1)", "numpy (>=1.19.3,<2)"] +langdetect = ["langdetect (>=1.0.0)"] [[package]] name = "docutils" @@ -754,14 +755,14 @@ files = [ [[package]] name = "google-api-core" -version = "2.24.0" +version = "2.24.1" description = "Google API client core library" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "google_api_core-2.24.0-py3-none-any.whl", hash = "sha256:10d82ac0fca69c82a25b3efdeefccf6f28e02ebb97925a8cce8edbfe379929d9"}, - {file = "google_api_core-2.24.0.tar.gz", hash = "sha256:e255640547a597a4da010876d333208ddac417d60add22b6851a0c66a831fcaf"}, + {file = "google_api_core-2.24.1-py3-none-any.whl", hash = "sha256:bc78d608f5a5bf853b80bd70a795f703294de656c096c0968320830a4bc280f1"}, + {file = "google_api_core-2.24.1.tar.gz", hash = "sha256:f8b36f5456ab0dd99a1b693a40a31d1e7757beea380ad1b38faaf8941eae9d8a"}, ] [package.dependencies] @@ -779,14 +780,14 @@ grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] [[package]] name = "google-api-python-client" -version = "2.159.0" +version = "2.160.0" description = "Google API Client Library for Python" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "google_api_python_client-2.159.0-py2.py3-none-any.whl", hash = "sha256:baef0bb631a60a0bd7c0bf12a5499e3a40cd4388484de7ee55c1950bf820a0cf"}, - {file = "google_api_python_client-2.159.0.tar.gz", hash = "sha256:55197f430f25c907394b44fa078545ffef89d33fd4dca501b7db9f0d8e224bd6"}, + {file = "google_api_python_client-2.160.0-py2.py3-none-any.whl", hash = "sha256:63d61fb3e4cf3fb31a70a87f45567c22f6dfe87bbfa27252317e3e2c42900db4"}, + {file = "google_api_python_client-2.160.0.tar.gz", hash = "sha256:a8ccafaecfa42d15d5b5c3134ced8de08380019717fc9fb1ed510ca58eca3b7e"}, ] [package.dependencies] @@ -1136,14 +1137,14 @@ files = [ [[package]] name = "marshmallow" -version = "3.26.0" +version = "3.26.1" description = "A lightweight library for converting complex datatypes to and from native Python datatypes." optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "marshmallow-3.26.0-py3-none-any.whl", hash = "sha256:1287bca04e6a5f4094822ac153c03da5e214a0a60bcd557b140f3e66991b8ca1"}, - {file = "marshmallow-3.26.0.tar.gz", hash = "sha256:eb36762a1cc76d7abf831e18a3a1b26d3d481bbc74581b8e532a3d3a8115e1cb"}, + {file = "marshmallow-3.26.1-py3-none-any.whl", hash = "sha256:3350409f20a70a7e4e11a27661187b77cdcaeb20abca41c1454fe33636bea09c"}, + {file = "marshmallow-3.26.1.tar.gz", hash = "sha256:e6d8affb6cb61d39d26402096dc0aee12d5a26d490a121f118d2e81dc0719dc6"}, ] [package.dependencies] @@ -1512,14 +1513,14 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "proto-plus" -version = "1.25.0" -description = "Beautiful, Pythonic protocol buffers." +version = "1.26.0" +description = "Beautiful, Pythonic protocol buffers" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "proto_plus-1.25.0-py3-none-any.whl", hash = "sha256:c91fc4a65074ade8e458e95ef8bac34d4008daa7cce4a12d6707066fca648961"}, - {file = "proto_plus-1.25.0.tar.gz", hash = "sha256:fbb17f57f7bd05a68b7707e745e26528b0b3c34e378db91eef93912c54982d91"}, + {file = "proto_plus-1.26.0-py3-none-any.whl", hash = "sha256:bf2dfaa3da281fc3187d12d224c707cb57214fb2c22ba854eb0c105a3fb2d4d7"}, + {file = "proto_plus-1.26.0.tar.gz", hash = "sha256:6e93d5f5ca267b54300880fff156b6a3386b3fa3f43b1da62e680fc0c586ef22"}, ] [package.dependencies] @@ -1820,14 +1821,14 @@ requests = ">=2.28" [[package]] name = "pytz" -version = "2024.2" +version = "2025.1" description = "World timezone definitions, modern and historical" optional = false python-versions = "*" groups = ["main"] files = [ - {file = "pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725"}, - {file = "pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a"}, + {file = "pytz-2025.1-py2.py3-none-any.whl", hash = "sha256:89dd22dca55b46eac6eda23b2d72721bf1bdfef212645d81513ef5d03038de57"}, + {file = "pytz-2025.1.tar.gz", hash = "sha256:c2db42be2a2518b28e65f9207c4d05e6ff547d1efa4086469ef855e4ab70178e"}, ] [[package]] @@ -2076,14 +2077,14 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] [[package]] name = "rich-argparse" -version = "1.6.0" +version = "1.7.0" description = "Rich help formatters for argparse and optparse" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "rich_argparse-1.6.0-py3-none-any.whl", hash = "sha256:fbe70a1d821b3f2fa8958cddf0cae131870a6e9faa04ab52b409cb1eda809bd7"}, - {file = "rich_argparse-1.6.0.tar.gz", hash = "sha256:092083c30da186f25bcdff8b1d47fdfb571288510fb051e0488a72cc3128de13"}, + {file = "rich_argparse-1.7.0-py3-none-any.whl", hash = "sha256:b8ec8943588e9731967f4f97b735b03dc127c416f480a083060433a97baf2fd3"}, + {file = "rich_argparse-1.7.0.tar.gz", hash = "sha256:f31d809c465ee43f367d599ccaf88b73bc2c4d75d74ed43f2d538838c53544ba"}, ] [package.dependencies] @@ -2316,20 +2317,20 @@ test = ["cython (>=3.0)", "defusedxml (>=0.7.1)", "pytest (>=8.0)", "setuptools [[package]] name = "sphinx-autoapi" -version = "3.4.0" +version = "3.5.0" description = "Sphinx API documentation generator" optional = false python-versions = ">=3.8" groups = ["docs"] files = [ - {file = "sphinx_autoapi-3.4.0-py3-none-any.whl", hash = "sha256:4027fef2875a22c5f2a57107c71641d82f6166bf55beb407a47aaf3ef14e7b92"}, - {file = "sphinx_autoapi-3.4.0.tar.gz", hash = "sha256:e6d5371f9411bbb9fca358c00a9e57aef3ac94cbfc5df4bab285946462f69e0c"}, + {file = "sphinx_autoapi-3.5.0-py3-none-any.whl", hash = "sha256:8676db32dded669dc6be9100696652640dc1e883e45b74710d74eb547a310114"}, + {file = "sphinx_autoapi-3.5.0.tar.gz", hash = "sha256:10dcdf86e078ae1fb144f653341794459e86f5b23cf3e786a735def71f564089"}, ] [package.dependencies] astroid = [ {version = ">=2.7", markers = "python_version < \"3.12\""}, - {version = ">=3.0.0a1", markers = "python_version >= \"3.12\""}, + {version = ">=3", markers = "python_version >= \"3.12\""}, ] Jinja2 = "*" PyYAML = "*" diff --git a/src/auto_archiver/modules/thumbnail_enricher/__manifest__.py b/src/auto_archiver/modules/thumbnail_enricher/__manifest__.py index e47397f..1bd23b5 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/__manifest__.py +++ b/src/auto_archiver/modules/thumbnail_enricher/__manifest__.py @@ -7,8 +7,12 @@ "bin": ["ffmpeg"] }, "configs": { - "thumbnails_per_minute": {"default": 60, "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"}, - "max_thumbnails": {"default": 16, "help": "limit the number of thumbnails to generate per video, 0 means no limit"}, + "thumbnails_per_minute": {"default": 60, + "type": "int", + "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"}, + "max_thumbnails": {"default": 16, + "type": "int", + "help": "limit the number of thumbnails to generate per video, 0 means no limit"}, }, "description": """ Generates thumbnails for video files to provide visual previews. diff --git a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py index e0ac937..8178cd8 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py +++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py @@ -42,7 +42,7 @@ class ThumbnailEnricher(Enricher): logger.error(f"error getting duration of video {m.filename}: {e}") return - num_thumbs = int(min(max(1, duration * self.thumbnails_per_minute), self.max_thumbnails)) + num_thumbs = int(min(max(1, (duration / 60) * self.thumbnails_per_minute), self.max_thumbnails)) timestamps = [duration / (num_thumbs + 1) * i for i in range(1, num_thumbs + 1)] thumbnails_media = [] diff --git a/tests/enrichers/test_thumbnail_enricher.py b/tests/enrichers/test_thumbnail_enricher.py new file mode 100644 index 0000000..eb27b99 --- /dev/null +++ b/tests/enrichers/test_thumbnail_enricher.py @@ -0,0 +1,155 @@ +import pytest +from unittest.mock import patch, MagicMock +from auto_archiver.core import Metadata, Media +from auto_archiver.modules.thumbnail_enricher import ThumbnailEnricher + + +@pytest.fixture +def thumbnail_enricher(setup_module) -> ThumbnailEnricher: + configs: dict = { + "thumbnails_per_minute": 60, + "max_thumbnails": 4, + } + return setup_module("thumbnail_enricher", configs) + + +@pytest.fixture +def metadata_with_video(): + m = Metadata() + m.set_url("https://example.com") + m.add_media(Media(filename="video.mp4").set("id", "video1")) + return m + + +@pytest.fixture +def mock_ffmpeg_environment(): + # Mocking all the ffmpeg calls in one place + with ( + patch("ffmpeg.input") as mock_ffmpeg_input, + patch("os.makedirs") as mock_makedirs, + patch.object(Media, "is_video", return_value=True), + patch( + "ffmpeg.probe", + return_value={ + "streams": [ + {"codec_type": "video", "duration": "120"} + ] # Default 2-minute duration, but can override in tests + }, + ) as mock_probe, + ): + mock_output = MagicMock() + mock_ffmpeg_input.return_value.filter.return_value.output.return_value = ( + mock_output + ) + + yield { + "mock_ffmpeg_input": mock_ffmpeg_input, + "mock_makedirs": mock_makedirs, + "mock_output": mock_output, + "mock_probe": mock_probe, + } + + +@pytest.mark.parametrize("thumbnails_per_minute, max_thumbnails, expected_count", [ + (10, 5, 5), # Capped at max_thumbnails + (1, 10, 2), # Less than max_thumbnails + (60, 7, 7), # Matches exactly +]) +def test_enrich_thumbnail_limits( + thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, + thumbnails_per_minute, max_thumbnails, expected_count +): + thumbnail_enricher.thumbnails_per_minute = thumbnails_per_minute + thumbnail_enricher.max_thumbnails = max_thumbnails + + thumbnail_enricher.enrich(metadata_with_video) + + assert mock_ffmpeg_environment["mock_output"].run.call_count == expected_count + thumbnails = metadata_with_video.media[0].get("thumbnails") + assert len(thumbnails) == expected_count + +def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video): + with ( + patch("ffmpeg.probe", side_effect=Exception("Probe error")), + patch("os.makedirs"), + patch("loguru.logger.error") as mock_logger, + patch.object(Media, "is_video", return_value=True), + ): + thumbnail_enricher.enrich(metadata_with_video) + # Ensure error was logged + mock_logger.assert_called_with( + f"error getting duration of video video.mp4: Probe error" + ) + # Ensure no thumbnails were created + thumbnails = metadata_with_video.media[0].get("thumbnails") + assert thumbnails is None + + +def test_enrich_skips_non_video_files(thumbnail_enricher, metadata_with_video): + with ( + patch.object(Media, "is_video", return_value=False), + patch("ffmpeg.input") as mock_ffmpeg, + ): + thumbnail_enricher.enrich(metadata_with_video) + mock_ffmpeg.assert_not_called() + + +@pytest.mark.parametrize("thumbnails_per_minute,max_thumbnails,expected_count", [ + (60, 5, 5), # caught by max + (60, 20, 10), # caught by t/min + (0, 20, 1), # test min caught (1) + (11, 20, 1), # test min caught (1) + (12, 20, 2), # test caught by t/min +]) +def test_enrich_handles_short_video( + thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, thumbnails_per_minute, max_thumbnails, expected_count +): + # override mock duration + fake_duration = 10 + with patch( + "ffmpeg.probe", + return_value={ "streams": [{"codec_type": "video", "duration": str(fake_duration)}]}, + ): + thumbnail_enricher.thumbnails_per_minute = thumbnails_per_minute + thumbnail_enricher.max_thumbnails = max_thumbnails + + thumbnail_enricher.enrich(metadata_with_video) + assert mock_ffmpeg_environment["mock_output"].run.call_count == expected_count + thumbnails = metadata_with_video.media[0].get("thumbnails") + assert len(thumbnails) == expected_count + + +def test_uses_existing_duration( + thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment +): + metadata_with_video.media[0].set("duration", 60) + thumbnail_enricher.enrich(metadata_with_video) + mock_ffmpeg_environment["mock_probe"].assert_not_called() + assert mock_ffmpeg_environment["mock_output"].run.call_count == 4 + + +def test_enrich_metadata_structure(thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment): + fake_duration = 120 + with patch("ffmpeg.probe", return_value={ + 'streams': [{'codec_type': 'video', 'duration': str(fake_duration)}] + }): + thumbnail_enricher.thumbnails_per_minute = 2 + thumbnail_enricher.max_thumbnails = 4 + + thumbnail_enricher.enrich(metadata_with_video) + + media_item = metadata_with_video.media[0] + thumbnails = media_item.get("thumbnails") + + # Assert normal metadata + assert media_item.get("id") == "video1" + assert media_item.get("duration") == fake_duration + # Evenly spaced timestamps + expected_timestamps = ["24.000s", "48.000s", "72.000s", "96.000s"] + assert thumbnails is not None + assert len(thumbnails) == 4 + + for index, thumbnail in enumerate(thumbnails): + assert thumbnail.filename is not None + assert thumbnail.properties.get("id") == f"thumbnail_{index}" + assert thumbnail.properties.get("timestamp") == expected_timestamps[index] From cbe98c729d9a4928889b5ece924eb62db1f41017 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Wed, 12 Feb 2025 19:32:40 +0000 Subject: [PATCH 02/15] Enricher tests --- .../screenshot_enricher.py | 8 +- tests/conftest.py | 20 +- tests/enrichers/test_metadata_enricher.py | 76 +++++++ tests/enrichers/test_pdq_hash_enricher.py | 84 +++++++ tests/enrichers/test_screenshot_enricher.py | 205 ++++++++++++++++++ tests/enrichers/test_ssl_enricher.py | 54 +++++ tests/enrichers/test_whisper_enricher.py | 93 ++++++++ 7 files changed, 538 insertions(+), 2 deletions(-) create mode 100644 tests/enrichers/test_metadata_enricher.py create mode 100644 tests/enrichers/test_pdq_hash_enricher.py create mode 100644 tests/enrichers/test_screenshot_enricher.py create mode 100644 tests/enrichers/test_ssl_enricher.py create mode 100644 tests/enrichers/test_whisper_enricher.py diff --git a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py index e1da99d..832d0f8 100644 --- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py +++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py @@ -11,6 +11,10 @@ from auto_archiver.core import Media, Metadata class ScreenshotEnricher(Enricher): + def __init__(self, webdriver_factory=None): + super().__init__() + self.webdriver_factory = webdriver_factory or Webdriver + def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() @@ -20,7 +24,8 @@ class ScreenshotEnricher(Enricher): logger.debug(f"Enriching screenshot for {url=}") auth = self.auth_for_site(url) - with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url, + with self.webdriver_factory( + self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url, http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver: try: driver.get(url) @@ -38,3 +43,4 @@ class ScreenshotEnricher(Enricher): logger.info("TimeoutException loading page for screenshot") except Exception as e: logger.error(f"Got error while loading webdriver for screenshot enricher: {e}") + diff --git a/tests/conftest.py b/tests/conftest.py index 8675fbc..d7f484f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,6 +6,8 @@ import pickle from tempfile import TemporaryDirectory from typing import Dict, Tuple import hashlib +from unittest.mock import patch + import pytest from auto_archiver.core.metadata import Metadata from auto_archiver.core.module import get_module, _LAZY_LOADED_MODULES @@ -128,4 +130,20 @@ def unpickle(): test_data_dir = os.path.join(os.path.dirname(__file__), "data", "test_files") with open(os.path.join(test_data_dir, path), "rb") as f: return pickle.load(f) - return _unpickle \ No newline at end of file + return _unpickle + + +@pytest.fixture +def mock_python_dependencies(): + with patch("auto_archiver.core.module") as mock_check_python_dep: + # Mock all Python dependencies as available + mock_check_python_dep.return_value = True + yield mock_check_python_dep + + +@pytest.fixture +def mock_binary_dependencies(): + with patch("shutil.which") as mock_shutil_which: + # Mock all binary dependencies as available + mock_shutil_which.return_value = "/usr/bin/fake_binary" + yield mock_shutil_which diff --git a/tests/enrichers/test_metadata_enricher.py b/tests/enrichers/test_metadata_enricher.py new file mode 100644 index 0000000..314fca7 --- /dev/null +++ b/tests/enrichers/test_metadata_enricher.py @@ -0,0 +1,76 @@ +from unittest.mock import MagicMock, patch, Mock + +import pytest + +from auto_archiver.core import Metadata, Media + + +@pytest.fixture +def mock_media(): + """Creates a mock Media object.""" + mock: Media = MagicMock(spec=Media) + mock.filename = "mock_file.txt" + return mock + + +@pytest.fixture +def enricher(setup_module): + return setup_module("metadata_enricher", {}) + + +@pytest.mark.parametrize( + "output,expected", + [ + ("Key1: Value1\nKey2: Value2", {"Key1": "Value1", "Key2": "Value2"}), + ("InvalidLine", {}), + ("", {}), + ], +) +@patch("subprocess.run") +def test_get_metadata(mock_run, enricher, output, expected): + mock_run.return_value.stdout = output + mock_run.return_value.stderr = "" + mock_run.return_value.returncode = 0 + + result = enricher.get_metadata("test.jpg") + assert result == expected + mock_run.assert_called_once_with( + ["exiftool", "test.jpg"], capture_output=True, text=True + ) + + +@patch("subprocess.run") +def test_get_metadata_exiftool_not_found(mock_run, enricher): + mock_run.side_effect = FileNotFoundError + result = enricher.get_metadata("test.jpg") + assert result == {} + + +def test_enrich_sets_metadata(enricher): + media1 = Mock(filename="img1.jpg") + media2 = Mock(filename="img2.jpg") + metadata = Mock() + metadata.media = [media1, media2] + enricher.get_metadata = lambda f: {"key": "value"} if f == "img1.jpg" else {} + + enricher.enrich(metadata) + + media1.set.assert_called_once_with("metadata", {"key": "value"}) + media2.set.assert_not_called() + assert metadata.media == [media1, media2] + + +def test_enrich_empty_media(enricher): + metadata = Mock() + metadata.media = [] + # Should not raise errors + enricher.enrich(metadata) + + +@patch("loguru.logger.error") +@patch("subprocess.run") +def test_get_metadata_error_handling(mock_run, mock_logger_error, enricher): + mock_run.side_effect = Exception("Test error") + result = enricher.get_metadata("test.jpg") + assert result == {} + mock_logger_error.assert_called_once() diff --git a/tests/enrichers/test_pdq_hash_enricher.py b/tests/enrichers/test_pdq_hash_enricher.py new file mode 100644 index 0000000..e90cd22 --- /dev/null +++ b/tests/enrichers/test_pdq_hash_enricher.py @@ -0,0 +1,84 @@ +from unittest.mock import patch + +import pytest +from PIL import UnidentifiedImageError + +from auto_archiver.core import Metadata, Media +from auto_archiver.modules.pdq_hash_enricher import PdqHashEnricher + + +@pytest.fixture +def enricher(setup_module): + return setup_module("pdq_hash_enricher", {}) + + +@pytest.fixture +def metadata_with_images(): + m = Metadata() + m.set_url("https://example.com") + m.add_media(Media(filename="image1.jpg", key="image1")) + m.add_media(Media(filename="image2.jpg", key="image2")) + return m + + +def test_successful_enrich(metadata_with_images): + with ( + patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100)), + patch("PIL.Image.open"), + patch.object(Media, "is_image", return_value=True) as mock_is_image, + ): + enricher = PdqHashEnricher() + enricher.enrich(metadata_with_images) + + # Ensure the hash is set for image media + for media in metadata_with_images.media: + assert media.get("pdq_hash") is not None + + +def test_enrich_skip_non_image(metadata_with_images): + with ( + patch.object(Media, "is_image", return_value=False), + patch("pdqhash.compute") as mock_pdq, + ): + enricher = PdqHashEnricher() + enricher.enrich(metadata_with_images) + mock_pdq.assert_not_called() + + +def test_enrich_handles_corrupted_image(metadata_with_images): + with ( + patch("PIL.Image.open", side_effect=UnidentifiedImageError("Corrupted image")), + patch("pdqhash.compute") as mock_pdq, + patch("loguru.logger.error") as mock_logger, + ): + enricher = PdqHashEnricher() + enricher.enrich(metadata_with_images) + + assert mock_logger.call_count == len(metadata_with_images.media) + mock_pdq.assert_not_called() + + +@pytest.mark.parametrize( + "media_id, should_have_hash", + [ + ("screenshot", False), + ("warc-file-123", False), + ("regular-image", True), + ] +) +def test_enrich_excludes_by_filetype(media_id, should_have_hash): + metadata = Metadata() + metadata.set_url("https://example.com") + metadata.add_media(Media(filename="image.jpg").set("id", media_id)) + + with ( + patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100)), + patch("PIL.Image.open"), + patch.object(Media, "is_image", return_value=True), + ): + enricher = PdqHashEnricher() + enricher.enrich(metadata) + + media_item = metadata.media[0] + assert (media_item.get("pdq_hash") is not None) == should_have_hash + diff --git a/tests/enrichers/test_screenshot_enricher.py b/tests/enrichers/test_screenshot_enricher.py new file mode 100644 index 0000000..3998deb --- /dev/null +++ b/tests/enrichers/test_screenshot_enricher.py @@ -0,0 +1,205 @@ +import base64 +from unittest.mock import patch, MagicMock + +import pytest +from selenium.common.exceptions import TimeoutException + +from auto_archiver.core import Metadata, Media +from auto_archiver.modules.screenshot_enricher import ScreenshotEnricher + + +@pytest.fixture +def mock_selenium_env(): + # Patches Selenium calls and driver checks in one place. + with ( + patch("shutil.which") as mock_which, + patch("auto_archiver.utils.webdriver.CookieSettingDriver") as mock_driver_class, + patch( + "selenium.webdriver.common.selenium_manager.SeleniumManager.binary_paths" + ) as mock_binary_paths, + patch("pathlib.Path.is_file", return_value=True), + patch("subprocess.Popen") as mock_popen, + patch( + "selenium.webdriver.common.service.Service.is_connectable", + return_value=True, + ), + patch("selenium.webdriver.FirefoxOptions") as mock_firefox_options, + ): + # Mock driver existence + def mock_which_side_effect(dep): + return "/mock/geckodriver" if dep == "geckodriver" else None + + mock_which.side_effect = mock_which_side_effect + # Mock binary paths + mock_binary_paths.return_value = { + "driver_path": "/mock/driver", + "browser_path": "/mock/browser", + } + # Popen + mock_proc = MagicMock() + mock_proc.poll.return_value = None + mock_popen.return_value = mock_proc + # CookieSettingDriver -> returns a mock driver + mock_driver = MagicMock() + mock_driver_class.return_value = mock_driver + # FirefoxOptions + mock_options_instance = MagicMock() + mock_firefox_options.return_value = mock_options_instance + yield mock_driver, mock_driver_class, mock_options_instance + + +@pytest.fixture +def common_patches(tmp_path): + with ( + patch("auto_archiver.utils.url.is_auth_wall", return_value=False), + patch("os.path.join", return_value=str(tmp_path / "test.png")), + patch("time.sleep"), + ): + yield + + +@pytest.fixture +def screenshot_enricher(setup_module, mock_binary_dependencies) -> ScreenshotEnricher: + configs: dict = { + "width": 1280, + "height": 720, + "timeout": 60, + "sleep_before_screenshot": 4, + "http_proxy": "", + "save_to_pdf": "False", + "print_options": {}, + } + return setup_module("screenshot_enricher", configs) + + +@pytest.fixture +def metadata_with_video(): + m = Metadata() + m.set_url("https://example.com") + m.add_media(Media(filename="video.mp4").set("id", "video1")) + return m + + +def test_enrich_adds_screenshot( + screenshot_enricher, + metadata_with_video, + mock_selenium_env, + common_patches, + tmp_path, +): + mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env + screenshot_enricher.enrich(metadata_with_video) + mock_driver_class.assert_called_once_with( + cookies=None, + cookiejar=None, + facebook_accept_cookies=False, + options=mock_options_instance, + ) + # Verify the actual calls on the returned mock_driver + mock_driver.get.assert_called_once_with("https://example.com") + mock_driver.save_screenshot.assert_called_once_with(str(tmp_path / "test.png")) + # Check that the media was added (2 = original video + screenshot) + assert len(metadata_with_video.media) == 2 + assert metadata_with_video.media[1].properties.get("id") == "screenshot" + + +@pytest.mark.parametrize( + "url,is_auth", + [ + ("https://example.com", False), + ("https://private.com", True), + ], +) +def test_enrich_auth_wall( + screenshot_enricher, + metadata_with_video, + mock_selenium_env, + common_patches, + url, + is_auth, +): + # Testing with and without is_auth_wall + mock_driver, mock_driver_class, _ = mock_selenium_env + with patch("auto_archiver.utils.url.is_auth_wall", return_value=is_auth): + metadata_with_video.set_url(url) + screenshot_enricher.enrich(metadata_with_video) + + if is_auth: + mock_driver.get.assert_not_called() + assert len(metadata_with_video.media) == 1 + assert metadata_with_video.media[0].properties.get("id") == "video1" + else: + mock_driver.get.assert_called_once_with(url) + assert len(metadata_with_video.media) == 2 + assert metadata_with_video.media[1].properties.get("id") == "screenshot" + + +def test_handle_timeout_exception( + screenshot_enricher, metadata_with_video, mock_selenium_env +): + mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env + + mock_driver.get.side_effect = TimeoutException + with patch("loguru.logger.info") as mock_log: + screenshot_enricher.enrich(metadata_with_video) + mock_log.assert_called_once_with("TimeoutException loading page for screenshot") + assert len(metadata_with_video.media) == 1 + + +def test_handle_general_exception( + screenshot_enricher, metadata_with_video, mock_selenium_env +): + """Test proper handling of unexpected general exceptions""" + mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env + # Simulate a generic exception when save_screenshot is called + mock_driver.get.return_value = None + mock_driver.save_screenshot.side_effect = Exception("Unexpected Error") + + with patch("loguru.logger.error") as mock_log: + screenshot_enricher.enrich(metadata_with_video) + # Verify that the exception was logged with the log + mock_log.assert_called_once_with( + "Got error while loading webdriver for screenshot enricher: Unexpected Error" + ) + # And no new media was added due to the error + assert len(metadata_with_video.media) == 1 + + +def test_pdf_creation(screenshot_enricher, metadata_with_video, mock_selenium_env): + """Test PDF creation when save_to_pdf is enabled""" + mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env + + # Override the save_to_pdf option + screenshot_enricher.save_to_pdf = True + # Mock the print_page method to return base64-encoded content + mock_driver.print_page.return_value = base64.b64encode(b"fake_pdf_content").decode( + "utf-8" + ) + with ( + patch("os.path.join", side_effect=lambda *args: f"{args[-1]}"), + patch( + "auto_archiver.modules.screenshot_enricher.screenshot_enricher.random_str", + return_value="fixed123", + ), + patch("builtins.open", new_callable=MagicMock()) as mock_open, + patch("loguru.logger.error") as mock_log, + ): + screenshot_enricher.enrich(metadata_with_video) + + # Verify screenshot and PDF creation + mock_driver.save_screenshot.assert_called_once() + mock_driver.print_page.assert_called_once_with(mock_driver.print_options) + + # Check that PDF file was opened and written + mock_open.assert_any_call("pdf_fixed123.pdf", "wb") + # Ensure both screenshot and PDF were added as media + assert len(metadata_with_video.media) == 3 # Original video + screenshot + PDF + assert metadata_with_video.media[1].properties.get("id") == "screenshot" + assert metadata_with_video.media[2].properties.get("id") == "pdf" + + +@pytest.fixture(autouse=True) +def cleanup_files(tmp_path): + yield + for file in tmp_path.iterdir(): + file.unlink() diff --git a/tests/enrichers/test_ssl_enricher.py b/tests/enrichers/test_ssl_enricher.py new file mode 100644 index 0000000..c4d2dc5 --- /dev/null +++ b/tests/enrichers/test_ssl_enricher.py @@ -0,0 +1,54 @@ +import ssl +from unittest.mock import patch, mock_open + +import pytest + +from auto_archiver.core import Metadata, Media + + +@pytest.fixture +def enricher(setup_module): + configs: dict = { + "skip_when_nothing_archived": "True", + } + return setup_module("ssl_enricher", configs) + + +@pytest.fixture +def metadata(): + m = Metadata() + m.set_url("https://example.com") + m.add_media(Media("tests/data/testfile_1.txt")) + m.add_media(Media("tests/data/testfile_2.txt")) + return m + + +def test_http_raises(metadata, enricher): + metadata.set_url("http://example.com") + with pytest.raises(AssertionError) as exc_info: + enricher.enrich(metadata) + assert "Invalid URL scheme" in str(exc_info.value) + + +def test_empty_metadata(metadata, enricher): + metadata.media = [] + assert enricher.enrich(metadata) is None + + +def test_ssl_enrich(metadata, enricher): + with patch("ssl.get_server_certificate", return_value="TEST_CERT"), \ + patch("builtins.open", mock_open()) as mock_file: + enricher.enrich(metadata) + + ssl.get_server_certificate.assert_called_once_with(("example.com", 443)) + mock_file.assert_called_once_with(f"{enricher.tmp_dir}/example-com.pem", "w") + mock_file().write.assert_called_once_with("TEST_CERT") + # Ensure the certificate is added to metadata + assert any(media.filename.endswith("example-com.pem") for media in metadata.media) + + +def test_ssl_error_handling(enricher, metadata): + with patch("ssl.get_server_certificate", side_effect=ssl.SSLError("SSL error")): + with pytest.raises(ssl.SSLError, match="SSL error"): + enricher.enrich(metadata) + diff --git a/tests/enrichers/test_whisper_enricher.py b/tests/enrichers/test_whisper_enricher.py new file mode 100644 index 0000000..8a73ed7 --- /dev/null +++ b/tests/enrichers/test_whisper_enricher.py @@ -0,0 +1,93 @@ +import shutil +import sys +import pytest +from unittest.mock import MagicMock, patch +from auto_archiver.core import Metadata, Media +from auto_archiver.modules.s3_storage import S3Storage + +from auto_archiver.modules.whisper_enricher import WhisperEnricher + + +@pytest.fixture +def enricher(): + """Fixture with mocked S3 and API dependencies""" + config = { + "api_endpoint": "http://testapi", + "api_key": "whisper-key", + "include_srt": False, + "timeout": 5, + "action": "translate", + "steps": {"storages": ["s3_storage"]} + } + mock_s3 = MagicMock(spec=S3Storage) + mock_s3.get_cdn_url.return_value = "http://s3.example.com/media.mp3" + instance = WhisperEnricher() + instance.name = "whisper_enricher" + instance.display_name = "Whisper Enricher" + instance.config_setup({instance.name: config}) + # bypassing the setup method and mocking S3 setup + instance.stores = config['steps']['storages'] + instance.s3 = mock_s3 + yield instance, mock_s3 + + +@pytest.fixture +def metadata(): + metadata = Metadata() + metadata.set_url("http://test.url") + metadata.set_title("test title") + return metadata + + +@pytest.fixture +def mock_requests(): + with patch("auto_archiver.modules.whisper_enricher.whisper_enricher.requests") as mock_requests: + mock_response = MagicMock() + mock_response.status_code = 201 + mock_response.json.return_value = {"id": "job123"} + mock_requests.post.return_value = mock_response + yield mock_requests + + +def test_successful_job_submission(enricher, metadata, mock_requests): + """Test successful media processing with S3 configured""" + whisper, mock_s3 = enricher + # Configure mock S3 URL to match test expectation + mock_s3.get_cdn_url.return_value = "http://cdn.example.com/test.mp4" + + # Create test media with matching CDN URL + m = Media("test.mp4") + m.mimetype = "video/mp4" + m.add_url(mock_s3.get_cdn_url.return_value) + metadata.media = [m] + + # Mock the complete API interaction chain + mock_status_response = MagicMock() + mock_status_response.status_code = 200 + mock_status_response.json.return_value = { + "status": "success", + "meta": {} + } + mock_artifacts_response = MagicMock() + mock_artifacts_response.status_code = 200 + mock_artifacts_response.json.return_value = [{ + "data": [{"start": 0, "end": 5, "text": "test transcript"}] + }] + # Set up mock response sequence + mock_requests.get.side_effect = [ + mock_status_response, # First call: status check + mock_artifacts_response # Second call: artifacts check + ] + # Run enrichment (without opening file) + whisper.enrich(metadata) + # Check API interactions + mock_requests.post.assert_called_once_with( + "http://testapi/jobs", + json={"url": "http://cdn.example.com/test.mp4", "type": "translate"}, + headers={"Authorization": "Bearer whisper-key"} + ) + # Verify job status checks + assert mock_requests.get.call_count == 2 + assert "artifact_0_text" in metadata.media[0].get("whisper_model") + assert "test transcript" in metadata.metadata.get("content") + From 319c1e8f92531db0d89f72fa72b3d06c7172c06c Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 14 Feb 2025 09:48:37 +0000 Subject: [PATCH 03/15] Add more tests. --- poetry.lock | 30 +++++------ .../whisper_enricher/whisper_enricher.py | 3 +- tests/conftest.py | 8 --- .../metadata_enricher_ytshort_expected.pickle | Bin 0 -> 12524 bytes .../metadata_enricher_ytshort_input.pickle | Bin 0 -> 10840 bytes tests/enrichers/test_metadata_enricher.py | 15 +++++- tests/enrichers/test_ssl_enricher.py | 2 + tests/enrichers/test_thumbnail_enricher.py | 6 +-- tests/enrichers/test_whisper_enricher.py | 48 ++++++++++++++++-- tests/test_metadata.py | 23 ++++++++- 10 files changed, 102 insertions(+), 33 deletions(-) create mode 100644 tests/data/metadata/metadata_enricher_ytshort_expected.pickle create mode 100644 tests/data/metadata/metadata_enricher_ytshort_input.pickle diff --git a/poetry.lock b/poetry.lock index decadca..d61b908 100644 --- a/poetry.lock +++ b/poetry.lock @@ -172,18 +172,18 @@ lxml = ["lxml"] [[package]] name = "boto3" -version = "1.36.17" +version = "1.36.19" description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "boto3-1.36.17-py3-none-any.whl", hash = "sha256:59bcf0c4b04d9cc36f8b418ad17ab3c4a99a21a175d2fad7096aa21cbe84630b"}, - {file = "boto3-1.36.17.tar.gz", hash = "sha256:5ecae20e780a3ce9afb3add532b61c466a8cb8960618e4fa565b3883064c1346"}, + {file = "boto3-1.36.19-py3-none-any.whl", hash = "sha256:7784590369a9d545bb07b2de56b6ce4d5a5e232883a957f704c3f842caeba155"}, + {file = "boto3-1.36.19.tar.gz", hash = "sha256:8c2c2a4ccdfe35dd2611ee1b7473dd2383948415c777e42dc4e7f1ebe371fe8c"}, ] [package.dependencies] -botocore = ">=1.36.17,<1.37.0" +botocore = ">=1.36.19,<1.37.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.11.0,<0.12.0" @@ -192,14 +192,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.36.17" +version = "1.36.19" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "botocore-1.36.17-py3-none-any.whl", hash = "sha256:069858b2fd693548035d7fd53a774e37e4260fea64e0ac9b8a3aee904f9321df"}, - {file = "botocore-1.36.17.tar.gz", hash = "sha256:cec13e0a7ce78e71aad0b397581b4e81824c7981ef4c261d2e296d200c399b09"}, + {file = "botocore-1.36.19-py3-none-any.whl", hash = "sha256:98882c106fec4c08678ea028199f7f5119550fab95d682b30846f7aae04b7bec"}, + {file = "botocore-1.36.19.tar.gz", hash = "sha256:cdf6729f601f82b1acdb9004b1f88b57cfb470f576394cdb3bbf5150f7fafb5b"}, ] [package.dependencies] @@ -860,14 +860,14 @@ tool = ["click (>=6.0.0)"] [[package]] name = "googleapis-common-protos" -version = "1.66.0" +version = "1.67.0" description = "Common protobufs used in Google APIs" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "googleapis_common_protos-1.66.0-py2.py3-none-any.whl", hash = "sha256:d7abcd75fabb2e0ec9f74466401f6c119a0b498e27370e9be4c94cb7e382b8ed"}, - {file = "googleapis_common_protos-1.66.0.tar.gz", hash = "sha256:c3e7b33d15fdca5374cc0a7346dd92ffa847425cc4ea941d970f13680052ec8c"}, + {file = "googleapis_common_protos-1.67.0-py2.py3-none-any.whl", hash = "sha256:579de760800d13616f51cf8be00c876f00a9f146d3e6510e19d1f4111758b741"}, + {file = "googleapis_common_protos-1.67.0.tar.gz", hash = "sha256:21398025365f138be356d5923e9168737d94d46a72aefee4a6110a1f23463c86"}, ] [package.dependencies] @@ -1235,14 +1235,14 @@ files = [ [[package]] name = "myst-parser" -version = "4.0.0" +version = "4.0.1" description = "An extended [CommonMark](https://spec.commonmark.org/) compliant parser," optional = false python-versions = ">=3.10" groups = ["docs"] files = [ - {file = "myst_parser-4.0.0-py3-none-any.whl", hash = "sha256:b9317997552424448c6096c2558872fdb6f81d3ecb3a40ce84a7518798f3f28d"}, - {file = "myst_parser-4.0.0.tar.gz", hash = "sha256:851c9dfb44e36e56d15d05e72f02b80da21a9e0d07cba96baf5e2d476bb91531"}, + {file = "myst_parser-4.0.1-py3-none-any.whl", hash = "sha256:9134e88959ec3b5780aedf8a99680ea242869d012e8821db3126d427edc9c95d"}, + {file = "myst_parser-4.0.1.tar.gz", hash = "sha256:5cfea715e4f3574138aecbf7d54132296bfd72bb614d31168f48c477a830a7c4"}, ] [package.dependencies] @@ -1254,10 +1254,10 @@ pyyaml = "*" sphinx = ">=7,<9" [package.extras] -code-style = ["pre-commit (>=3.0,<4.0)"] +code-style = ["pre-commit (>=4.0,<5.0)"] linkify = ["linkify-it-py (>=2.0,<3.0)"] rtd = ["ipython", "sphinx (>=7)", "sphinx-autodoc2 (>=0.5.0,<0.6.0)", "sphinx-book-theme (>=1.1,<2.0)", "sphinx-copybutton", "sphinx-design", "sphinx-pyscript", "sphinx-tippy (>=0.4.3)", "sphinx-togglebutton", "sphinxext-opengraph (>=0.9.0,<0.10.0)", "sphinxext-rediraffe (>=0.2.7,<0.3.0)"] -testing = ["beautifulsoup4", "coverage[toml]", "defusedxml", "pytest (>=8,<9)", "pytest-cov", "pytest-param-files (>=0.6.0,<0.7.0)", "pytest-regressions", "sphinx-pytest"] +testing = ["beautifulsoup4", "coverage[toml]", "defusedxml", "pygments (<2.19)", "pytest (>=8,<9)", "pytest-cov", "pytest-param-files (>=0.6.0,<0.7.0)", "pytest-regressions", "sphinx-pytest"] testing-docutils = ["pygments", "pytest (>=8,<9)", "pytest-param-files (>=0.6.0,<0.7.0)"] [[package]] diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index 89579f9..917ab85 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -29,8 +29,7 @@ class WhisperEnricher(Enricher): job_results = {} for i, m in enumerate(to_enrich.media): if m.is_video() or m.is_audio(): - # TODO: this used to pass all storage items to store now - # Now only passing S3, the rest will get added later in the usual order (?) + # Only storing S3, the rest will get added later in the usual order (?) m.store(url=url, metadata=to_enrich, storages=[self.s3]) try: job_id = self.submit_job(m) diff --git a/tests/conftest.py b/tests/conftest.py index d7f484f..f7ed4b7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -133,14 +133,6 @@ def unpickle(): return _unpickle -@pytest.fixture -def mock_python_dependencies(): - with patch("auto_archiver.core.module") as mock_check_python_dep: - # Mock all Python dependencies as available - mock_check_python_dep.return_value = True - yield mock_check_python_dep - - @pytest.fixture def mock_binary_dependencies(): with patch("shutil.which") as mock_shutil_which: diff --git a/tests/data/metadata/metadata_enricher_ytshort_expected.pickle b/tests/data/metadata/metadata_enricher_ytshort_expected.pickle new file mode 100644 index 0000000000000000000000000000000000000000..23ce5f6101dbd8b63b5aa4062a97d753d748ebc8 GIT binary patch literal 12524 zcmbVSTW=iKk(Mmcq9`emlXViqi|oNV3p=sZocj!k&4r>ap=FM>vSe9{F^Ha-Gt+JM zOi%j4Aq6au07(`L^qYGhgZ!8LfXzdGLEe&ok+169dMG;?t|eNWnmVVdPTlLAUtay+ zKi4ktf5qE=p2bIgG77>InRuf(k={&Z{@Bm_ve-N{zkXT%@K@!3mG2d=rFCX^GVj~N)2weQ@^T5x<-hoKuG>`l& zjOSt$=h0Y1aeOSI@EAx;AEM6|WpER`{^iEoH(TGr8J5L2NT56mBQJZ2zqIkoJR5;e z92qlrBVRBAnwZh?~h#Fpr~n z8p`wo;m^k+h?gP_q>%HIFa_0AWcW0JeiKPFSI)>2`+*!C3))|yMUhzea~TO7AW%V6 z?rwdZMk3hTcI`hOMcvvXznxBhnub4a>^wX;Ieqp*cBid7Cl4oLYwI1QIq?qUdS|F) z9zk92SnkhoZll1TgYXd>LGSBGYOL*N`wyZ!4_`idA@`p*!{=Yj|KnbM`f_iN3^GsH z0Ln+77h5NxTpqy^=2?08f0v6JOF3NlQ>m!D|8-QBel`kzdh+8HU2cJAIgJw>b@_Q& z+`2!{WCC5yXA-;n#Vr^q{i?}5dac+ZyE7l(Dz0hf%VJAw9y^xBcZ}qdIEvz>Obox9 zzb-bF)ybp`dWAMYAdsPb`)BSbd{d{)rz({xFQf3@Ukjkq@JA zgd>go`83C|mBnjvUKZb+gpo|cm-5J8ATXSkANJq<hQJ6M-sdj|$kSF|dhn}iTgGC6lRy&hmhU4#?jOZ|>0F_Sugz?~1&ste%_ z`YY@;JQiv%G^88eP91_H_83T0q4=o-u$bfx4%F$=D)+fZM>Fyh18FI-mv%|ZYY<8n zg!5w~QphpSPdUO1pVo^d;^w+I=Y(r{uQ1y3I()K0g^w<2I-H-t*%4t4!)>og2N7E% zX?r`Q{}UO6BlwHK>v(MfQ?OQ%oz8{BuicUYwOMu{#6cfn+!5-w!O+q_bC4dLdWK|a zKwYm^12g?T}mKGlR6_)kt%pRnU&w2PJ}^jP1G|uf+mk{XqR<8&OS?o)K_Go?FDbh z_7i4s_jS<*ee0O@p4TF7P3*^K4v0}~x#)Ra3X$j>jLGeIb@F(V1nc-rL3k1;Q)!pM zmC19Lh_``T$RXV&7A#e%CZ3_vJtL3y(IsR=4GB{1sP^D=f~xK!c!NGVj3l#m*Qn{Z zWH@Tnl~R{sRspfRRRe2i&yJ{$G8zPs#kOh&fGx!OKL8w~=MrszVnpPDAY$4U)rJD9 zL;zSHZP34Ky}bd~R!KvjG61Zep&X%Lb3oR!;|(dDlx8iV(ej$)Scpid#musn*JEQg zA+4c3H=Hfd0Q&O`ebXEX2Bt{VNDSyu?Li+dj7L5f+9Pf&@|9u*3~*{uq292mee;#D z-RLOett8n7lB~)x=tYT^Dpz#sa68~Ts3oJ0buBDZFXM4YHHdFdud7l7wc3W$o;r5q z@;Fb>#XC6r`V59|B2HsRRX|QfJ02Gi%$cJmXn$%))sBMqAT?YBZ_tksV@xU0tf}ba zJQWbO1m%IKa+{9%NIn-e__ zf+~5Uyy7IP>w4OPHljrrLL2l~qG%JYOOz`5LAE4gLFR~z+ce>{=Za;3*rs^0WZ3Si zBBFXK$g@-@4<+K13Aa!X@2)hWW>YNO`dCOU*wN8tmTi~S?3q#!qdW9J_FPduvmRa1 z4^(a;k-p7tCI`C~4M?0PW2-h4)VcvnQEEUoWdAhQ)RbIQ5O6ya2LK?@t%(Qkv8?Da z-P+WSV%8b8;TegXltx}*o0sN}^1W&7PM}ruYAcou03S)3g_B4Mu<2KvPH!l93oh|5 zf>tSXT6I;}!2LAKfN*xM6w?6lF(VQhB%S#yg}M7ixBd>mr4#(k9AB%U)s|2=CeP=B z8rz#Pm3j|qFlmv9*F~fFW>2|=ct*ala>`BR95!A6iOcIzY+E^iI9|*!qBbI+_$RBX zW}y~B4+nURXJ&}$(V6Netwv}|tUyDc6VJsFEAblSxGi?>&}u7RGB$|M%+OF~F2exG zp{*)kM}_s&KdG3XF2!}a*JZPh@IaekObuDo0@FkTcDb!PIu&+=+;6V zLLC6QYY?c06ZodTk&%+#p>FqX$j2nf5KEl%5YWzq-iGvI`2qnKoB z0{f%H#Ex2AbPD8KYv2v~OVq4d1)#!mRWdn1ESDP58$4a;SNiETI(sUXZJ7Z8rx}z- zwlo5a6R>qc0Q5K!%^Lik>oa{Dz)d!6jW?h`hY8_2NT`?1wSzICHoRDk$*ldg^#ISb zZ1~LO!#H-$HekA2Q5I>WM`KQfklPC#IGF>d6h~Z1b0Bz2Yu41-TnG`X)&*Q+2|6-Vy0JFu0kQ>aSNKe4x}@tqw~PmbLmMVA)T4|7 zHRI${)1KEc%-4|~^_k+4hOzJizC0YFIIt_&^Vh&Z1z$- zR7Zdl918v60D((Mgp2j@P`P57%F1CfS{Dw$);P8`8<1M6*wpcv0emKvV{55~&dQAG zJ+&HC{Hi+`Ud3BiYD55acBh(%_@ap})28VG356h=6N6EfEY zhC&8};6ce+m%LyI4G95ock57EWD;XF1Yr~ousSLM0H3CPozbAECxEW$fuS8VMc-5z ziV!1I^3p}|IHq6&{cpfX5hU{l4^ zcQFdr9fpql460LXw26>G4cf~d1(xyLZyHls>*IDy(PnCR(tt9N)1DeCOH{4v;!)$M zN4`v#9Y__0_>78y_z3d(IMMSyhH{PW?j{(<7C%X1bvwZzo*$d3Iv~jJly^*7#1J+D zF5tPV@(nS48$lZinwQ%;e?2tRB=TKdJTe{DL{(PA29ixTf56ez{Y<3_2Jm!m8tVke zAYFtSSH~)z9=X9Lu(W6E#+VtgJ>%>Q+DWN*G^6tiq0uDInGWIu`H(5=nKWA=Ub(2G z%^=o;RjQs%OygK36QCem&{LZ~Ga*>YK%QC}lD=t_*m*&UU1U(}6SEWHqLRc(~BE6~>vZ!;itcQkMzgVWGO2=WE zpyjd-K%6CQIR*eHkBOAVVF2QAUDY1~86{Uc1L4it(0GVMjiIKkvJOfo+EJ;9R>A5p zY`OrR9Z-OjeuUuaK|t8@kxyMv!aW6>iIYuVa8i+D;scOWRMPJ?usix2sy-^i4FFN2 zQ1B#OKo+Y16fOG1`*G;fGmN#d27Pq&$(gy>104h>BJKhyAZ$cr9=b(hG>)zmR*d=S z5N66htYg6fy!>$}d`i_-2x>$RWsl*61yR&cW*}_g*&Fsyzpsxs=%Z;fgfVj^Dwuvc z2%DLHjR1-C(8UElKv)3PotMEidU^TX`={aLDK$@@%7otY30$NO@gA($Kyt0u@T&0v zX4az552bEb9<1r7QP=N;MF*H{N%5SY?12h$BsNySpR31v zfZ`qt(5H!yN21#Bg45!<0z5moR~ENN^=?g0>asQRyQ7W|`M#mP{>VNWydvjQydvC( zSR+A;Uf(_I3bh}pz>Ub{c1(}cgAJG5I*{i*o-a%|o+FG-7Uk&V}PaqHG%s!VuGotE;S z>vV#r>cF-R@jhq~o)YuL6(q$n&x`9vGpIIWgIqDNYey;EQ;y63$%ny*>WOx6Sp3V1 zvu@A)Qw%P~a^mMv=6$i4mchsU;PK~W@PuACzE^CF=v~(TD{Q>+>gL7TaiiH4$7{un zbDrEbp8WUX#NNh<`6094#LzXX8C?zt|t{C#~JaZcV(`5RD(gyQ>beSqI0b%;1WDY~X=R dqX7>Ie%B9v4~@;r@oql!{sx!S6FmR` literal 0 HcmV?d00001 diff --git a/tests/data/metadata/metadata_enricher_ytshort_input.pickle b/tests/data/metadata/metadata_enricher_ytshort_input.pickle new file mode 100644 index 0000000000000000000000000000000000000000..5f1a4eb4317bf69f2ccf59306fb645985a70c648 GIT binary patch literal 10840 zcmb7KdygB{6<<<9mQB)1MWrC5cGOTMFBpY^)&pnTO-e>&v^Zz(~QDSz>&@~iSr@j{x3EKkeg$Ghz9S+qKPl;l}H zk$1f`pH5|pPYx{sI&kxBwNB^*IXe>>J_Tu+g-coe^+|dE=-U6T+LEZzvRY?aJ@*o5!C#lKn;J?ahO^cXV`1S3mE&QqA{mX&^odz%I# z2IDh!g2A`3)L6%#-2YAV_JfO)NAmus?eNntVj&fmH@}U`R%FxQz0Es&wmgE&a*?cY)#Vpuar4)4CfBf4yp*6W zikom!`c<<#xl$aF-LT;Wr-^8xclg5=LPLlf1CXCs8=Xl|~|7WkGp5c)oZckl`Z0oqv5TFRmi$LxDTHQ@rwx zsfs==uAVZMO;U%%Bo>roe&Qcz%hi`>>w#QuCe;k!!=6iIj&r#yN5%8-9k$I8q*}o^ zWGdvZz=tc`faMm5Fy&#vtHq1+Jc?BQ2wn@`2;K}{FAiIcR;Sr)^>LGFKH*Q(GwA$T z^!mX;IR9m%ahL!4%~s=G59z4_84lm|G7+6SP@9ml!~r@T0G84Lx;+3mz#XhluzL>; zps9FOp4)_w%rhA~oWTGv;_iiWl1u%LEU}b2fgqe8s#BN38;|zHYx%6yz0k03`NJj@ zNA59@wn7o91F)JD4Gz@n(<%2npsSgRwSjb$+FN_177D&)#I1pb36%YCWFQ(4)Dirt(W7NVLk5(eJA`X;1%|^p;9r#_k>K%%q zm3rV0DJ=4(K^poEvRr5lG*ooZB8c_Oo*VidHn5G-q2FtRcq-;H+o2k!nFrR28OVbm zbs0p0>hq=0#-o|s$S7G+ozQYN8%i11xtI?S_^eTCEZS`P9Of?8nk{9%)JP8i$0ucg z3a@Hkl&8oDaK1F*JW&g4JUWk)tr4u{bMS5hn}EQUeZ2~5tNZ`MzWX=ZJ|{ucIjdG% z-S|9Qex6>vt)T zCTn367|yce|5m^a^cBh(dHBv#^gN=V&Up2)IDoByJ-j zb3#RrkQrBR_SMOX`}9?I3+=JK&(&v(l=_S=MBZb9>_1_KaIcFt9@)Tb4*U*DYjQuH zIUz=gm15xcDMey(FfKRrn-uY;2sZIdNqC;D7t$U>C{yHak#7TckW;!VEI6u4O|nF% zyQ7E}m=ZGLh61T5DjHl)(A8ZAZ#=?;vCgdDwHi7v8Hrj=Wz=n$RY0t7)xlaivLouF zj0FK?wXK-}U=OkJ4*=)rSYixNf_Qo05i{+Jil7HmA_3G#8;{<%(cVI6tD>QA82~oV zL{8DLIUw8F$%c|nMzfyK>iBI6EMz3~V&+)KAFwlrW}ueht&SDF&PHU(-AZMbX z&rJkN=D2x8pTYl4A`rd27QBrzNFI`#_yK?o zgu&mDLu3$=?a$gRl^w(^Ss^?<=3R+!(5FaY8Z`Z05^p@RNfey87`i;$emY>zfQN@NGzfXyKi_o>^>L}R(JIDIND2Xs9ew+kV@kD#YMO63oM1ly>p-Z8SM|)Yc zjnE}d75kuClCz+4L?&$72->HbWq{bHc(Y{0?yDxEdMl{2)F)3Q@{~!pFc4E$7E!w? zR<1r4N((4DzAUnxvf2aF3SxDK`N!TX+Gn<-Yx;r8BP7bVp=N5Z8_|HobuzJLLqn|x zuoSHZbVKz|6U|NOMGXOuGf4md5?xI^M2{6kx9QfWhDuoH)JA3$axxl4g&khnLlt|A z#N9xr;Wzd?834YLGz;gE^dP3F1f3@6F$r$*FN0QTbUIDd*dYA0$bfKlt~ApC@iik8 z8YEr$D~-APM34Rsz^xPF&0JrnrOmdcbWD-Y4K>J{HkD=%8ZhaQm()e0{AO>tgM3D@ zv3JXD6&yBS0Eye{X<|n?fH+?)v7$CAp!p}Ss#c*6QV%D1tY?CsNjlhz_c5+^Va z=)!Y(#7e#fJ06Q&ICQ!ymW&VLnFX52%vBfwIki>w>#4M!icQ7+bStjQy>6QY(gP90 zni{&O17?`~Zj|w8j%FMyU2}_Q4fNXF8z>W29hvuG?UED1AVlZps3Zwq(HG%18+RqqG#1602kJ) zQpo{gz0``{;OR!cGEa9g*;BJ@+YA7>%%DB8tr1{cfNc;0px22Q))4pHpXs*&JY>Vy zm;fa@YzXThpZcl~MA9InK`Uu!l z2I{M9Q1LoINHJ=ArOl>Yv=C%j;jtXFz+_$?Q#b9MK2sM9J&&EI!GISSb~OnA>zGV7 zVUnkjHKQFBL{tT`HRR(dW)0g{bO+kxNfN2~dG1+yD5pfhGiPY1sfh*e02;qw#tXuUO%qt^QAL57ar3De&zo50>r9XSOvy;gSi}LpJe;68 zuqQb3Yv7;*u&IX`uy{Ib0nlqb;}w9Q;FE*tp^E{m;v`p2$q-gZDb3FuIF)BkaFu6H zZ#MPt5w##yodN*5i0fRz08YcuTBSPV21rSL1SG+w&>v0^cqxf=Q6CSTE4Ha@9A=|! z;Q;K7GuyKPsiT@rou3)NcT%~wjvDB!&e-15s6oxIdI!U&m~^E_1W;FZ>WN4$+UPQG z+8&U?c(2MY5Y#D(*1CA;D$JI`D2H@H?z+NK$bgVMs95Wk7Xo1*ApqXpI+PB%#7r$g z7=;t8&Po6vrird|8Z`9;&^|yVQDs~3{n-?bTQ4lScU5eLuY;lH7PgRLdc*N(Xv;8W4!lU zB-GXlyxmf~nH#>+fHslKo?0qPbggyqXmHdkU*^jJq?$rJqhlaBf;^tB^}dgx+@rg9 z6AWXIpRW`3c7j2?KQ>!+Kv3VQ?wGcSA?yU)!1Gik3^V;Uf(UxFFSmVuJv8(r^0sas znGai{sw;8>>86K2;OOptsY(R{c)PbqbOB_LZbD6{;}ma?++q`0Iikk@w8?X>gZw}-WZHUW&0dIaxu~MeAhv@ws@_d3l0+2~pdek)T~JO?Lp8&A zAaL1;UM7(FQoW959KH@XB(3#mG$zJyP$I6#scupt&(=cX<#Ig3yC)rPO}+WY*4jbX zm?*ynqPLH3*#_C3&K>}8nzEB5a4;@#oK>zA{Q>u&Pz9S<5`-4!9+>HU8gdEGm0MDo z=Koq~WH;+G`t^W0X|&Mq1oj*kD+-d(jiD?xN2#r9*!QyDZ3Jkfmo6^x0m2HX>AcK~7tfY> z%aZL@65nmQc9v#Xc+JYc=ab+=^+nF$wD{@%SEH8VF~*yjoQpil{4ZCFGWci|{O*e~ m`1q?bxKmu6(w8^NG ThumbnailEnricher: - configs: dict = { +def thumbnail_enricher(setup_module, mock_binary_dependencies) -> ThumbnailEnricher: + config: dict = { "thumbnails_per_minute": 60, "max_thumbnails": 4, } - return setup_module("thumbnail_enricher", configs) + return setup_module("thumbnail_enricher", config) @pytest.fixture diff --git a/tests/enrichers/test_whisper_enricher.py b/tests/enrichers/test_whisper_enricher.py index 8a73ed7..873198f 100644 --- a/tests/enrichers/test_whisper_enricher.py +++ b/tests/enrichers/test_whisper_enricher.py @@ -8,6 +8,9 @@ from auto_archiver.modules.s3_storage import S3Storage from auto_archiver.modules.whisper_enricher import WhisperEnricher +TEST_S3_URL = "http://cdn.example.com/test.mp4" + + @pytest.fixture def enricher(): """Fixture with mocked S3 and API dependencies""" @@ -20,7 +23,7 @@ def enricher(): "steps": {"storages": ["s3_storage"]} } mock_s3 = MagicMock(spec=S3Storage) - mock_s3.get_cdn_url.return_value = "http://s3.example.com/media.mp3" + mock_s3.get_cdn_url.return_value = TEST_S3_URL instance = WhisperEnricher() instance.name = "whisper_enricher" instance.display_name = "Whisper Enricher" @@ -53,7 +56,7 @@ def test_successful_job_submission(enricher, metadata, mock_requests): """Test successful media processing with S3 configured""" whisper, mock_s3 = enricher # Configure mock S3 URL to match test expectation - mock_s3.get_cdn_url.return_value = "http://cdn.example.com/test.mp4" + mock_s3.get_cdn_url.return_value = TEST_S3_URL # Create test media with matching CDN URL m = Media("test.mp4") @@ -78,6 +81,7 @@ def test_successful_job_submission(enricher, metadata, mock_requests): mock_status_response, # First call: status check mock_artifacts_response # Second call: artifacts check ] + # Run enrichment (without opening file) whisper.enrich(metadata) # Check API interactions @@ -89,5 +93,43 @@ def test_successful_job_submission(enricher, metadata, mock_requests): # Verify job status checks assert mock_requests.get.call_count == 2 assert "artifact_0_text" in metadata.media[0].get("whisper_model") - assert "test transcript" in metadata.metadata.get("content") + assert metadata.media[0].get("whisper_model") == {'artifact_0_text': 'test transcript', 'job_artifacts_check': 'http://testapi/jobs/job123/artifacts', 'job_id': 'job123', 'job_status_check': 'http://testapi/jobs/job123'} + + + +def test_submit_job(enricher): + """Test job submission method""" + whisper, _ = enricher + m = Media("test.mp4") + m.add_url(TEST_S3_URL) + with patch("auto_archiver.modules.whisper_enricher.whisper_enricher.requests") as mock_requests: + mock_response = MagicMock() + mock_response.status_code = 201 + mock_response.json.return_value = {"id": "job123"} + mock_requests.post.return_value = mock_response + job_id = whisper.submit_job(m) + assert job_id == "job123" + +def test_submit_raises_status(enricher): + whisper, _ = enricher + m = Media("test.mp4") + m.add_url(TEST_S3_URL) + with patch("auto_archiver.modules.whisper_enricher.whisper_enricher.requests") as mock_requests: + mock_response = MagicMock() + mock_response.status_code = 400 + mock_response.json.return_value = {"id": "job123"} + mock_requests.post.return_value = mock_response + with pytest.raises(AssertionError) as exc_info: + whisper.submit_job(m) + assert str(exc_info.value) == "calling the whisper api http://testapi returned a non-success code: 400" + +# @pytest.mark.parametrize("test_url, status", ["http://cdn.example.com/test.mp4",]) +def test_submit_job_fails(enricher): + """Test assertion fails with non-S3 URL""" + whisper, mock_s3 = enricher + m = Media("test.mp4") + m.add_url("http://cdn.wrongurl.com/test.mp4") + with pytest.raises(AssertionError): + whisper.submit_job(m) + diff --git a/tests/test_metadata.py b/tests/test_metadata.py index b07e107..a753936 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -162,4 +162,25 @@ def test_get_context(): def test_choose_most_complete(): - pass \ No newline at end of file + m_more = Metadata() + m_more.set_title("Title 1") + m_more.set_content("Content 1") + m_more.set_url("https://example.com") + + m_less = Metadata() + m_less.set_title("Title 2") + m_less.set_content("Content 2") + m_less.set_url("https://example.com") + m_less.set_context("key", "value") + + res = Metadata.choose_most_complete([m_more, m_less]) + assert res.metadata.get("title") == "Title 1" + +def test_choose_most_complete_from_pickles(unpickle): + # test most complete from pickles before and after an enricher has run + # Only compares length of media, not the actual media + m_before_enriching = unpickle("/Users/erinclark/PycharmProjects/auto-archiver/tests/data/metadata/metadata_enricher_ytshort_input.pickle") + m_after_enriching = unpickle("/Users/erinclark/PycharmProjects/auto-archiver/tests/data/metadata/metadata_enricher_ytshort_expected.pickle") + # Iterates `for r in results[1:]:` + res = Metadata.choose_most_complete([Metadata(), m_after_enriching, m_before_enriching]) + assert res.media == m_after_enriching.media From b0756a6a3405ef6b860edc070d91c0ce70f4b7b1 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 14 Feb 2025 09:57:44 +0000 Subject: [PATCH 04/15] Remove accidental full path. --- tests/test_metadata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index a753936..c6f3593 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -179,8 +179,8 @@ def test_choose_most_complete(): def test_choose_most_complete_from_pickles(unpickle): # test most complete from pickles before and after an enricher has run # Only compares length of media, not the actual media - m_before_enriching = unpickle("/Users/erinclark/PycharmProjects/auto-archiver/tests/data/metadata/metadata_enricher_ytshort_input.pickle") - m_after_enriching = unpickle("/Users/erinclark/PycharmProjects/auto-archiver/tests/data/metadata/metadata_enricher_ytshort_expected.pickle") + m_before_enriching = unpickle("tests/data/metadata/metadata_enricher_ytshort_input.pickle") + m_after_enriching = unpickle("tests/data/metadata/metadata_enricher_ytshort_expected.pickle") # Iterates `for r in results[1:]:` res = Metadata.choose_most_complete([Metadata(), m_after_enriching, m_before_enriching]) assert res.media == m_after_enriching.media From 71b41dd9018ff09126943602c69a10bc19fe4a62 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 14 Feb 2025 10:05:32 +0000 Subject: [PATCH 05/15] Remove accidental path, yet again. --- tests/conftest.py | 5 ++--- .../metadata_enricher_ytshort_expected.pickle | Bin .../metadata_enricher_ytshort_input.pickle | Bin tests/enrichers/test_metadata_enricher.py | 4 ++-- tests/test_metadata.py | 4 ++-- 5 files changed, 6 insertions(+), 7 deletions(-) rename tests/data/{metadata => }/metadata_enricher_ytshort_expected.pickle (100%) rename tests/data/{metadata => }/metadata_enricher_ytshort_input.pickle (100%) diff --git a/tests/conftest.py b/tests/conftest.py index f7ed4b7..19e1f6b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -124,11 +124,10 @@ def pytest_runtest_setup(item): def unpickle(): """ Returns a helper function that unpickles a file - ** gets the file from the test_files directory: tests/data/test_files ** + ** gets the file from the test_files directory: tests/data/ ** """ def _unpickle(path): - test_data_dir = os.path.join(os.path.dirname(__file__), "data", "test_files") - with open(os.path.join(test_data_dir, path), "rb") as f: + with open(os.path.join("tests/data", path), "rb") as f: return pickle.load(f) return _unpickle diff --git a/tests/data/metadata/metadata_enricher_ytshort_expected.pickle b/tests/data/metadata_enricher_ytshort_expected.pickle similarity index 100% rename from tests/data/metadata/metadata_enricher_ytshort_expected.pickle rename to tests/data/metadata_enricher_ytshort_expected.pickle diff --git a/tests/data/metadata/metadata_enricher_ytshort_input.pickle b/tests/data/metadata_enricher_ytshort_input.pickle similarity index 100% rename from tests/data/metadata/metadata_enricher_ytshort_input.pickle rename to tests/data/metadata_enricher_ytshort_input.pickle diff --git a/tests/enrichers/test_metadata_enricher.py b/tests/enrichers/test_metadata_enricher.py index 9dc410b..c6190ed 100644 --- a/tests/enrichers/test_metadata_enricher.py +++ b/tests/enrichers/test_metadata_enricher.py @@ -79,8 +79,8 @@ def test_get_metadata_error_handling(mock_run, mock_logger_error, enricher): @pytest.mark.skip(reason="Requires ExifTool to be installed. TODO mock") def test_metadata_pickle(enricher, unpickle): # Uses a pickle of a YouTube short - metadata = unpickle("tests/data/metadata/metadata_enricher_ytshort_input.pickle") - expected = unpickle("tests/data/metadata/metadata_enricher_ytshort_expected.pickle") + metadata = unpickle("metadata_enricher_ytshort_input.pickle") + expected = unpickle("metadata_enricher_ytshort_expected.pickle") enricher.enrich(metadata) expected_media = expected.media actual_media = metadata.media diff --git a/tests/test_metadata.py b/tests/test_metadata.py index c6f3593..e1f7797 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -179,8 +179,8 @@ def test_choose_most_complete(): def test_choose_most_complete_from_pickles(unpickle): # test most complete from pickles before and after an enricher has run # Only compares length of media, not the actual media - m_before_enriching = unpickle("tests/data/metadata/metadata_enricher_ytshort_input.pickle") - m_after_enriching = unpickle("tests/data/metadata/metadata_enricher_ytshort_expected.pickle") + m_before_enriching = unpickle("metadata_enricher_ytshort_input.pickle") + m_after_enriching = unpickle("metadata_enricher_ytshort_expected.pickle") # Iterates `for r in results[1:]:` res = Metadata.choose_most_complete([Metadata(), m_after_enriching, m_before_enriching]) assert res.media == m_after_enriching.media From ce5a200d1f42572061cd2cedf3db0d847c8c7e34 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 18 Feb 2025 12:59:10 +0000 Subject: [PATCH 06/15] Added tests, updated instagram_tbot_extractor.py raise failure. --- poetry.lock | 193 +++++++++--------- .../instagram_tbot_extractor.py | 9 +- src/auto_archiver/utils/misc.py | 2 +- tests/conftest.py | 7 + tests/databases/test_api_db.py | 69 +++++++ .../test_instagram_tbot_extractor.py | 131 +++++++----- tests/feeders/test_gworksheet.py | 1 + tests/utils/test_misc.py | 146 +++++++++++++ 8 files changed, 401 insertions(+), 157 deletions(-) create mode 100644 tests/databases/test_api_db.py create mode 100644 tests/utils/test_misc.py diff --git a/poetry.lock b/poetry.lock index c6bb954..c8524b4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -172,18 +172,18 @@ lxml = ["lxml"] [[package]] name = "boto3" -version = "1.36.19" +version = "1.36.22" description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "boto3-1.36.19-py3-none-any.whl", hash = "sha256:7784590369a9d545bb07b2de56b6ce4d5a5e232883a957f704c3f842caeba155"}, - {file = "boto3-1.36.19.tar.gz", hash = "sha256:8c2c2a4ccdfe35dd2611ee1b7473dd2383948415c777e42dc4e7f1ebe371fe8c"}, + {file = "boto3-1.36.22-py3-none-any.whl", hash = "sha256:39957eabdce009353d72d131046489fbbfa15891865d5f069f1e8bfa414e6b81"}, + {file = "boto3-1.36.22.tar.gz", hash = "sha256:768c8a4d4a6227fe2258105efa086f1424cba5ca915a5eb2305b2cd979306ad1"}, ] [package.dependencies] -botocore = ">=1.36.19,<1.37.0" +botocore = ">=1.36.22,<1.37.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.11.0,<0.12.0" @@ -192,14 +192,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.36.19" +version = "1.36.22" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "botocore-1.36.19-py3-none-any.whl", hash = "sha256:98882c106fec4c08678ea028199f7f5119550fab95d682b30846f7aae04b7bec"}, - {file = "botocore-1.36.19.tar.gz", hash = "sha256:cdf6729f601f82b1acdb9004b1f88b57cfb470f576394cdb3bbf5150f7fafb5b"}, + {file = "botocore-1.36.22-py3-none-any.whl", hash = "sha256:75d6b34acb0686ee4d54ff6eb285e78ccfe318407428769d1e3e13351714d890"}, + {file = "botocore-1.36.22.tar.gz", hash = "sha256:59520247d5a479731724f97c995d5a1c2aae3b303b324f39d99efcfad1d3019e"}, ] [package.dependencies] @@ -781,14 +781,14 @@ grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] [[package]] name = "google-api-python-client" -version = "2.160.0" +version = "2.161.0" description = "Google API Client Library for Python" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "google_api_python_client-2.160.0-py2.py3-none-any.whl", hash = "sha256:63d61fb3e4cf3fb31a70a87f45567c22f6dfe87bbfa27252317e3e2c42900db4"}, - {file = "google_api_python_client-2.160.0.tar.gz", hash = "sha256:a8ccafaecfa42d15d5b5c3134ced8de08380019717fc9fb1ed510ca58eca3b7e"}, + {file = "google_api_python_client-2.161.0-py2.py3-none-any.whl", hash = "sha256:9476a5a4f200bae368140453df40f9cda36be53fa7d0e9a9aac4cdb859a26448"}, + {file = "google_api_python_client-2.161.0.tar.gz", hash = "sha256:324c0cce73e9ea0a0d2afd5937e01b7c2d6a4d7e2579cdb6c384f9699d6c9f37"}, ] [package.dependencies] @@ -2363,14 +2363,14 @@ test = ["cython (>=3.0)", "defusedxml (>=0.7.1)", "pytest (>=8.0)", "setuptools [[package]] name = "sphinx-autoapi" -version = "3.5.0" +version = "3.6.0" description = "Sphinx API documentation generator" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["docs"] files = [ - {file = "sphinx_autoapi-3.5.0-py3-none-any.whl", hash = "sha256:8676db32dded669dc6be9100696652640dc1e883e45b74710d74eb547a310114"}, - {file = "sphinx_autoapi-3.5.0.tar.gz", hash = "sha256:10dcdf86e078ae1fb144f653341794459e86f5b23cf3e786a735def71f564089"}, + {file = "sphinx_autoapi-3.6.0-py3-none-any.whl", hash = "sha256:f3b66714493cab140b0e896d33ce7137654a16ac1edb6563edcbd47bf975f711"}, + {file = "sphinx_autoapi-3.6.0.tar.gz", hash = "sha256:c685f274e41d0842ae7e199460c322c4bd7fec816ccc2da8d806094b4f64af06"}, ] [package.dependencies] @@ -2380,7 +2380,7 @@ astroid = [ ] Jinja2 = "*" PyYAML = "*" -sphinx = ">=6.1.0" +sphinx = ">=7.4.0" [[package]] name = "sphinx-autobuild" @@ -2680,14 +2680,14 @@ telegram = ["requests"] [[package]] name = "trio" -version = "0.28.0" +version = "0.29.0" description = "A friendly Python library for async concurrency and I/O" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "trio-0.28.0-py3-none-any.whl", hash = "sha256:56d58977acc1635735a96581ec70513cc781b8b6decd299c487d3be2a721cd94"}, - {file = "trio-0.28.0.tar.gz", hash = "sha256:4e547896fe9e8a5658e54e4c7c5fa1db748cbbbaa7c965e7d40505b928c73c05"}, + {file = "trio-0.29.0-py3-none-any.whl", hash = "sha256:d8c463f1a9cc776ff63e331aba44c125f423a5a13c684307e828d930e625ba66"}, + {file = "trio-0.29.0.tar.gz", hash = "sha256:ea0d3967159fc130acb6939a0be0e558e364fee26b5deeecc893a6b08c361bdf"}, ] [package.dependencies] @@ -2701,18 +2701,19 @@ sortedcontainers = "*" [[package]] name = "trio-websocket" -version = "0.11.1" +version = "0.12.1" description = "WebSocket library for Trio" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" groups = ["main"] files = [ - {file = "trio-websocket-0.11.1.tar.gz", hash = "sha256:18c11793647703c158b1f6e62de638acada927344d534e3c7628eedcb746839f"}, - {file = "trio_websocket-0.11.1-py3-none-any.whl", hash = "sha256:520d046b0d030cf970b8b2b2e00c4c2245b3807853ecd44214acd33d74581638"}, + {file = "trio_websocket-0.12.1-py3-none-any.whl", hash = "sha256:608ec746bb287e5d5a66baf483e41194193c5cf05ffaad6240e7d1fcd80d1e6f"}, + {file = "trio_websocket-0.12.1.tar.gz", hash = "sha256:d55ccd4d3eae27c494f3fdae14823317839bdcb8214d1173eacc4d42c69fc91b"}, ] [package.dependencies] exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} +outcome = ">=1.2.0" trio = ">=0.11" wsproto = ">=0.14" @@ -2779,14 +2780,14 @@ files = [ [[package]] name = "tzlocal" -version = "5.2" +version = "5.3" description = "tzinfo object for the local timezone" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["main"] files = [ - {file = "tzlocal-5.2-py3-none-any.whl", hash = "sha256:49816ef2fe65ea8ac19d19aa7a1ae0551c834303d5014c6d5a62e4cbda8047b8"}, - {file = "tzlocal-5.2.tar.gz", hash = "sha256:8d399205578f1a9342816409cc1e46a93ebd5755e39ea2d85334bea911bf0e6e"}, + {file = "tzlocal-5.3-py3-none-any.whl", hash = "sha256:3814135a1bb29763c6e4f08fd6e41dbb435c7a60bfbb03270211bcc537187d8c"}, + {file = "tzlocal-5.3.tar.gz", hash = "sha256:2fafbfc07e9d8b49ade18f898d6bcd37ae88ce3ad6486842a2e4f03af68323d2"}, ] [package.dependencies] @@ -3032,81 +3033,81 @@ test = ["websockets"] [[package]] name = "websockets" -version = "14.2" +version = "15.0" description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" optional = false python-versions = ">=3.9" groups = ["main", "docs"] files = [ - {file = "websockets-14.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e8179f95323b9ab1c11723e5d91a89403903f7b001828161b480a7810b334885"}, - {file = "websockets-14.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0d8c3e2cdb38f31d8bd7d9d28908005f6fa9def3324edb9bf336d7e4266fd397"}, - {file = "websockets-14.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:714a9b682deb4339d39ffa674f7b674230227d981a37d5d174a4a83e3978a610"}, - {file = "websockets-14.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2e53c72052f2596fb792a7acd9704cbc549bf70fcde8a99e899311455974ca3"}, - {file = "websockets-14.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3fbd68850c837e57373d95c8fe352203a512b6e49eaae4c2f4088ef8cf21980"}, - {file = "websockets-14.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b27ece32f63150c268593d5fdb82819584831a83a3f5809b7521df0685cd5d8"}, - {file = "websockets-14.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4daa0faea5424d8713142b33825fff03c736f781690d90652d2c8b053345b0e7"}, - {file = "websockets-14.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:bc63cee8596a6ec84d9753fd0fcfa0452ee12f317afe4beae6b157f0070c6c7f"}, - {file = "websockets-14.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7a570862c325af2111343cc9b0257b7119b904823c675b22d4ac547163088d0d"}, - {file = "websockets-14.2-cp310-cp310-win32.whl", hash = "sha256:75862126b3d2d505e895893e3deac0a9339ce750bd27b4ba515f008b5acf832d"}, - {file = "websockets-14.2-cp310-cp310-win_amd64.whl", hash = "sha256:cc45afb9c9b2dc0852d5c8b5321759cf825f82a31bfaf506b65bf4668c96f8b2"}, - {file = "websockets-14.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3bdc8c692c866ce5fefcaf07d2b55c91d6922ac397e031ef9b774e5b9ea42166"}, - {file = "websockets-14.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c93215fac5dadc63e51bcc6dceca72e72267c11def401d6668622b47675b097f"}, - {file = "websockets-14.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1c9b6535c0e2cf8a6bf938064fb754aaceb1e6a4a51a80d884cd5db569886910"}, - {file = "websockets-14.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a52a6d7cf6938e04e9dceb949d35fbdf58ac14deea26e685ab6368e73744e4c"}, - {file = "websockets-14.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9f05702e93203a6ff5226e21d9b40c037761b2cfb637187c9802c10f58e40473"}, - {file = "websockets-14.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22441c81a6748a53bfcb98951d58d1af0661ab47a536af08920d129b4d1c3473"}, - {file = "websockets-14.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:efd9b868d78b194790e6236d9cbc46d68aba4b75b22497eb4ab64fa640c3af56"}, - {file = "websockets-14.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1a5a20d5843886d34ff8c57424cc65a1deda4375729cbca4cb6b3353f3ce4142"}, - {file = "websockets-14.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:34277a29f5303d54ec6468fb525d99c99938607bc96b8d72d675dee2b9f5bf1d"}, - {file = "websockets-14.2-cp311-cp311-win32.whl", hash = "sha256:02687db35dbc7d25fd541a602b5f8e451a238ffa033030b172ff86a93cb5dc2a"}, - {file = "websockets-14.2-cp311-cp311-win_amd64.whl", hash = "sha256:862e9967b46c07d4dcd2532e9e8e3c2825e004ffbf91a5ef9dde519ee2effb0b"}, - {file = "websockets-14.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1f20522e624d7ffbdbe259c6b6a65d73c895045f76a93719aa10cd93b3de100c"}, - {file = "websockets-14.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:647b573f7d3ada919fd60e64d533409a79dcf1ea21daeb4542d1d996519ca967"}, - {file = "websockets-14.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6af99a38e49f66be5a64b1e890208ad026cda49355661549c507152113049990"}, - {file = "websockets-14.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:091ab63dfc8cea748cc22c1db2814eadb77ccbf82829bac6b2fbe3401d548eda"}, - {file = "websockets-14.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b374e8953ad477d17e4851cdc66d83fdc2db88d9e73abf755c94510ebddceb95"}, - {file = "websockets-14.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a39d7eceeea35db85b85e1169011bb4321c32e673920ae9c1b6e0978590012a3"}, - {file = "websockets-14.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0a6f3efd47ffd0d12080594f434faf1cd2549b31e54870b8470b28cc1d3817d9"}, - {file = "websockets-14.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:065ce275e7c4ffb42cb738dd6b20726ac26ac9ad0a2a48e33ca632351a737267"}, - {file = "websockets-14.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e9d0e53530ba7b8b5e389c02282f9d2aa47581514bd6049d3a7cffe1385cf5fe"}, - {file = "websockets-14.2-cp312-cp312-win32.whl", hash = "sha256:20e6dd0984d7ca3037afcb4494e48c74ffb51e8013cac71cf607fffe11df7205"}, - {file = "websockets-14.2-cp312-cp312-win_amd64.whl", hash = "sha256:44bba1a956c2c9d268bdcdf234d5e5ff4c9b6dc3e300545cbe99af59dda9dcce"}, - {file = "websockets-14.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6f1372e511c7409a542291bce92d6c83320e02c9cf392223272287ce55bc224e"}, - {file = "websockets-14.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4da98b72009836179bb596a92297b1a61bb5a830c0e483a7d0766d45070a08ad"}, - {file = "websockets-14.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8a86a269759026d2bde227652b87be79f8a734e582debf64c9d302faa1e9f03"}, - {file = "websockets-14.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86cf1aaeca909bf6815ea714d5c5736c8d6dd3a13770e885aafe062ecbd04f1f"}, - {file = "websockets-14.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9b0f6c3ba3b1240f602ebb3971d45b02cc12bd1845466dd783496b3b05783a5"}, - {file = "websockets-14.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:669c3e101c246aa85bc8534e495952e2ca208bd87994650b90a23d745902db9a"}, - {file = "websockets-14.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eabdb28b972f3729348e632ab08f2a7b616c7e53d5414c12108c29972e655b20"}, - {file = "websockets-14.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2066dc4cbcc19f32c12a5a0e8cc1b7ac734e5b64ac0a325ff8353451c4b15ef2"}, - {file = "websockets-14.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ab95d357cd471df61873dadf66dd05dd4709cae001dd6342edafc8dc6382f307"}, - {file = "websockets-14.2-cp313-cp313-win32.whl", hash = "sha256:a9e72fb63e5f3feacdcf5b4ff53199ec8c18d66e325c34ee4c551ca748623bbc"}, - {file = "websockets-14.2-cp313-cp313-win_amd64.whl", hash = "sha256:b439ea828c4ba99bb3176dc8d9b933392a2413c0f6b149fdcba48393f573377f"}, - {file = "websockets-14.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7cd5706caec1686c5d233bc76243ff64b1c0dc445339bd538f30547e787c11fe"}, - {file = "websockets-14.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ec607328ce95a2f12b595f7ae4c5d71bf502212bddcea528290b35c286932b12"}, - {file = "websockets-14.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:da85651270c6bfb630136423037dd4975199e5d4114cae6d3066641adcc9d1c7"}, - {file = "websockets-14.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3ecadc7ce90accf39903815697917643f5b7cfb73c96702318a096c00aa71f5"}, - {file = "websockets-14.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1979bee04af6a78608024bad6dfcc0cc930ce819f9e10342a29a05b5320355d0"}, - {file = "websockets-14.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dddacad58e2614a24938a50b85969d56f88e620e3f897b7d80ac0d8a5800258"}, - {file = "websockets-14.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:89a71173caaf75fa71a09a5f614f450ba3ec84ad9fca47cb2422a860676716f0"}, - {file = "websockets-14.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:6af6a4b26eea4fc06c6818a6b962a952441e0e39548b44773502761ded8cc1d4"}, - {file = "websockets-14.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:80c8efa38957f20bba0117b48737993643204645e9ec45512579132508477cfc"}, - {file = "websockets-14.2-cp39-cp39-win32.whl", hash = "sha256:2e20c5f517e2163d76e2729104abc42639c41cf91f7b1839295be43302713661"}, - {file = "websockets-14.2-cp39-cp39-win_amd64.whl", hash = "sha256:b4c8cef610e8d7c70dea92e62b6814a8cd24fbd01d7103cc89308d2bfe1659ef"}, - {file = "websockets-14.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:d7d9cafbccba46e768be8a8ad4635fa3eae1ffac4c6e7cb4eb276ba41297ed29"}, - {file = "websockets-14.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:c76193c1c044bd1e9b3316dcc34b174bbf9664598791e6fb606d8d29000e070c"}, - {file = "websockets-14.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd475a974d5352390baf865309fe37dec6831aafc3014ffac1eea99e84e83fc2"}, - {file = "websockets-14.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2c6c0097a41968b2e2b54ed3424739aab0b762ca92af2379f152c1aef0187e1c"}, - {file = "websockets-14.2-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d7ff794c8b36bc402f2e07c0b2ceb4a2424147ed4785ff03e2a7af03711d60a"}, - {file = "websockets-14.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:dec254fcabc7bd488dab64846f588fc5b6fe0d78f641180030f8ea27b76d72c3"}, - {file = "websockets-14.2-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:bbe03eb853e17fd5b15448328b4ec7fb2407d45fb0245036d06a3af251f8e48f"}, - {file = "websockets-14.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a3c4aa3428b904d5404a0ed85f3644d37e2cb25996b7f096d77caeb0e96a3b42"}, - {file = "websockets-14.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:577a4cebf1ceaf0b65ffc42c54856214165fb8ceeba3935852fc33f6b0c55e7f"}, - {file = "websockets-14.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ad1c1d02357b7665e700eca43a31d52814ad9ad9b89b58118bdabc365454b574"}, - {file = "websockets-14.2-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f390024a47d904613577df83ba700bd189eedc09c57af0a904e5c39624621270"}, - {file = "websockets-14.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3c1426c021c38cf92b453cdf371228d3430acd775edee6bac5a4d577efc72365"}, - {file = "websockets-14.2-py3-none-any.whl", hash = "sha256:7a6ceec4ea84469f15cf15807a747e9efe57e369c384fa86e022b3bea679b79b"}, - {file = "websockets-14.2.tar.gz", hash = "sha256:5059ed9c54945efb321f097084b4c7e52c246f2c869815876a69d1efc4ad6eb5"}, + {file = "websockets-15.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5e6ee18a53dd5743e6155b8ff7e8e477c25b29b440f87f65be8165275c87fef0"}, + {file = "websockets-15.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ee06405ea2e67366a661ed313e14cf2a86e84142a3462852eb96348f7219cee3"}, + {file = "websockets-15.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8711682a629bbcaf492f5e0af72d378e976ea1d127a2d47584fa1c2c080b436b"}, + {file = "websockets-15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94c4a9b01eede952442c088d415861b0cf2053cbd696b863f6d5022d4e4e2453"}, + {file = "websockets-15.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:45535fead66e873f411c1d3cf0d3e175e66f4dd83c4f59d707d5b3e4c56541c4"}, + {file = "websockets-15.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e389efe46ccb25a1f93d08c7a74e8123a2517f7b7458f043bd7529d1a63ffeb"}, + {file = "websockets-15.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:67a04754d121ea5ca39ddedc3f77071651fb5b0bc6b973c71c515415b44ed9c5"}, + {file = "websockets-15.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:bd66b4865c8b853b8cca7379afb692fc7f52cf898786537dfb5e5e2d64f0a47f"}, + {file = "websockets-15.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a4cc73a6ae0a6751b76e69cece9d0311f054da9b22df6a12f2c53111735657c8"}, + {file = "websockets-15.0-cp310-cp310-win32.whl", hash = "sha256:89da58e4005e153b03fe8b8794330e3f6a9774ee9e1c3bd5bc52eb098c3b0c4f"}, + {file = "websockets-15.0-cp310-cp310-win_amd64.whl", hash = "sha256:4ff380aabd7a74a42a760ee76c68826a8f417ceb6ea415bd574a035a111fd133"}, + {file = "websockets-15.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:dd24c4d256558429aeeb8d6c24ebad4e982ac52c50bc3670ae8646c181263965"}, + {file = "websockets-15.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f83eca8cbfd168e424dfa3b3b5c955d6c281e8fc09feb9d870886ff8d03683c7"}, + {file = "websockets-15.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4095a1f2093002c2208becf6f9a178b336b7572512ee0a1179731acb7788e8ad"}, + {file = "websockets-15.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb915101dfbf318486364ce85662bb7b020840f68138014972c08331458d41f3"}, + {file = "websockets-15.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:45d464622314973d78f364689d5dbb9144e559f93dca11b11af3f2480b5034e1"}, + {file = "websockets-15.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ace960769d60037ca9625b4c578a6f28a14301bd2a1ff13bb00e824ac9f73e55"}, + {file = "websockets-15.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c7cd4b1015d2f60dfe539ee6c95bc968d5d5fad92ab01bb5501a77393da4f596"}, + {file = "websockets-15.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4f7290295794b5dec470867c7baa4a14182b9732603fd0caf2a5bf1dc3ccabf3"}, + {file = "websockets-15.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3abd670ca7ce230d5a624fd3d55e055215d8d9b723adee0a348352f5d8d12ff4"}, + {file = "websockets-15.0-cp311-cp311-win32.whl", hash = "sha256:110a847085246ab8d4d119632145224d6b49e406c64f1bbeed45c6f05097b680"}, + {file = "websockets-15.0-cp311-cp311-win_amd64.whl", hash = "sha256:8d7bbbe2cd6ed80aceef2a14e9f1c1b61683194c216472ed5ff33b700e784e37"}, + {file = "websockets-15.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:cccc18077acd34c8072578394ec79563664b1c205f7a86a62e94fafc7b59001f"}, + {file = "websockets-15.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d4c22992e24f12de340ca5f824121a5b3e1a37ad4360b4e1aaf15e9d1c42582d"}, + {file = "websockets-15.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1206432cc6c644f6fc03374b264c5ff805d980311563202ed7fef91a38906276"}, + {file = "websockets-15.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d3cc75ef3e17490042c47e0523aee1bcc4eacd2482796107fd59dd1100a44bc"}, + {file = "websockets-15.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b89504227a5311610e4be16071465885a0a3d6b0e82e305ef46d9b064ce5fb72"}, + {file = "websockets-15.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56e3efe356416bc67a8e093607315951d76910f03d2b3ad49c4ade9207bf710d"}, + {file = "websockets-15.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0f2205cdb444a42a7919690238fb5979a05439b9dbb73dd47c863d39640d85ab"}, + {file = "websockets-15.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:aea01f40995fa0945c020228ab919b8dfc93fc8a9f2d3d705ab5b793f32d9e99"}, + {file = "websockets-15.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a9f8e33747b1332db11cf7fcf4a9512bef9748cb5eb4d3f7fbc8c30d75dc6ffc"}, + {file = "websockets-15.0-cp312-cp312-win32.whl", hash = "sha256:32e02a2d83f4954aa8c17e03fe8ec6962432c39aca4be7e8ee346b05a3476904"}, + {file = "websockets-15.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffc02b159b65c05f2ed9ec176b715b66918a674bd4daed48a9a7a590dd4be1aa"}, + {file = "websockets-15.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d2244d8ab24374bed366f9ff206e2619345f9cd7fe79aad5225f53faac28b6b1"}, + {file = "websockets-15.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3a302241fbe825a3e4fe07666a2ab513edfdc6d43ce24b79691b45115273b5e7"}, + {file = "websockets-15.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:10552fed076757a70ba2c18edcbc601c7637b30cdfe8c24b65171e824c7d6081"}, + {file = "websockets-15.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c53f97032b87a406044a1c33d1e9290cc38b117a8062e8a8b285175d7e2f99c9"}, + {file = "websockets-15.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1caf951110ca757b8ad9c4974f5cac7b8413004d2f29707e4d03a65d54cedf2b"}, + {file = "websockets-15.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8bf1ab71f9f23b0a1d52ec1682a3907e0c208c12fef9c3e99d2b80166b17905f"}, + {file = "websockets-15.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bfcd3acc1a81f106abac6afd42327d2cf1e77ec905ae11dc1d9142a006a496b6"}, + {file = "websockets-15.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:c8c5c8e1bac05ef3c23722e591ef4f688f528235e2480f157a9cfe0a19081375"}, + {file = "websockets-15.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:86bfb52a9cfbcc09aba2b71388b0a20ea5c52b6517c0b2e316222435a8cdab72"}, + {file = "websockets-15.0-cp313-cp313-win32.whl", hash = "sha256:26ba70fed190708551c19a360f9d7eca8e8c0f615d19a574292b7229e0ae324c"}, + {file = "websockets-15.0-cp313-cp313-win_amd64.whl", hash = "sha256:ae721bcc8e69846af00b7a77a220614d9b2ec57d25017a6bbde3a99473e41ce8"}, + {file = "websockets-15.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c348abc5924caa02a62896300e32ea80a81521f91d6db2e853e6b1994017c9f6"}, + {file = "websockets-15.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5294fcb410ed0a45d5d1cdedc4e51a60aab5b2b3193999028ea94afc2f554b05"}, + {file = "websockets-15.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c24ba103ecf45861e2e1f933d40b2d93f5d52d8228870c3e7bf1299cd1cb8ff1"}, + {file = "websockets-15.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc8821a03bcfb36e4e4705316f6b66af28450357af8a575dc8f4b09bf02a3dee"}, + {file = "websockets-15.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffc5ae23ada6515f31604f700009e2df90b091b67d463a8401c1d8a37f76c1d7"}, + {file = "websockets-15.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ac67b542505186b3bbdaffbc303292e1ee9c8729e5d5df243c1f20f4bb9057e"}, + {file = "websockets-15.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:c86dc2068f1c5ca2065aca34f257bbf4f78caf566eb230f692ad347da191f0a1"}, + {file = "websockets-15.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:30cff3ef329682b6182c01c568f551481774c476722020b8f7d0daacbed07a17"}, + {file = "websockets-15.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:98dcf978d4c6048965d1762abd534c9d53bae981a035bfe486690ba11f49bbbb"}, + {file = "websockets-15.0-cp39-cp39-win32.whl", hash = "sha256:37d66646f929ae7c22c79bc73ec4074d6db45e6384500ee3e0d476daf55482a9"}, + {file = "websockets-15.0-cp39-cp39-win_amd64.whl", hash = "sha256:24d5333a9b2343330f0f4eb88546e2c32a7f5c280f8dd7d3cc079beb0901781b"}, + {file = "websockets-15.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:b499caef4bca9cbd0bd23cd3386f5113ee7378094a3cb613a2fa543260fe9506"}, + {file = "websockets-15.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:17f2854c6bd9ee008c4b270f7010fe2da6c16eac5724a175e75010aacd905b31"}, + {file = "websockets-15.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89f72524033abbfde880ad338fd3c2c16e31ae232323ebdfbc745cbb1b3dcc03"}, + {file = "websockets-15.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1657a9eecb29d7838e3b415458cc494e6d1b194f7ac73a34aa55c6fb6c72d1f3"}, + {file = "websockets-15.0-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e413352a921f5ad5d66f9e2869b977e88d5103fc528b6deb8423028a2befd842"}, + {file = "websockets-15.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:8561c48b0090993e3b2a54db480cab1d23eb2c5735067213bb90f402806339f5"}, + {file = "websockets-15.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:190bc6ef8690cd88232a038d1b15714c258f79653abad62f7048249b09438af3"}, + {file = "websockets-15.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:327adab7671f3726b0ba69be9e865bba23b37a605b585e65895c428f6e47e766"}, + {file = "websockets-15.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd8ef197c87afe0a9009f7a28b5dc613bfc585d329f80b7af404e766aa9e8c7"}, + {file = "websockets-15.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:789c43bf4a10cd067c24c321238e800b8b2716c863ddb2294d2fed886fa5a689"}, + {file = "websockets-15.0-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7394c0b7d460569c9285fa089a429f58465db930012566c03046f9e3ab0ed181"}, + {file = "websockets-15.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2ea4f210422b912ebe58ef0ad33088bc8e5c5ff9655a8822500690abc3b1232d"}, + {file = "websockets-15.0-py3-none-any.whl", hash = "sha256:51ffd53c53c4442415b613497a34ba0aa7b99ac07f1e4a62db5dcd640ae6c3c3"}, + {file = "websockets-15.0.tar.gz", hash = "sha256:ca36151289a15b39d8d683fd8b7abbe26fc50be311066c5f8dcf3cb8cee107ab"}, ] [[package]] diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py index d4b7a8e..4404d07 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py @@ -77,13 +77,14 @@ class InstagramTbotExtractor(Extractor): chat, since_id = self._send_url_to_bot(url) message = self._process_messages(chat, since_id, tmp_dir, result) + # This may be outdated and replaced by the below message, but keeping until confirmed if "You must enter a URL to a post" in message: logger.debug(f"invalid link {url=} for {self.name}: {message}") return False - # # TODO: It currently returns this as a success - is that intentional? - # if "Media not found or unavailable" in message: - # logger.debug(f"invalid link {url=} for {self.name}: {message}") - # return False + + if "Media not found or unavailable" in message: + logger.debug(f"No media found for link {url=} for {self.name}: {message}") + return False if message: result.set_content(message).set_title(message[:128]) diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py index cd03b49..108deae 100644 --- a/src/auto_archiver/utils/misc.py +++ b/src/auto_archiver/utils/misc.py @@ -46,7 +46,7 @@ def dump_payload(p): def update_nested_dict(dictionary, update_dict): - # takes 2 dicts and overwrites the first with the second only on the changed balues + # takes 2 dicts and overwrites the first with the second only on the changed values for key, value in update_dict.items(): if key in dictionary and isinstance(value, dict) and isinstance(dictionary[key], dict): update_nested_dict(dictionary[key], value) diff --git a/tests/conftest.py b/tests/conftest.py index 19e1f6b..8deebff 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,7 @@ pytest conftest file, for shared fixtures and configuration """ import os import pickle +from datetime import datetime, timezone from tempfile import TemporaryDirectory from typing import Dict, Tuple import hashlib @@ -138,3 +139,9 @@ def mock_binary_dependencies(): # Mock all binary dependencies as available mock_shutil_which.return_value = "/usr/bin/fake_binary" yield mock_shutil_which + + +@pytest.fixture +def sample_datetime(): + return datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc) + diff --git a/tests/databases/test_api_db.py b/tests/databases/test_api_db.py new file mode 100644 index 0000000..d07cb1a --- /dev/null +++ b/tests/databases/test_api_db.py @@ -0,0 +1,69 @@ +from unittest.mock import patch + +import pytest + +from auto_archiver.core import Metadata +from auto_archiver.modules.api_db import AAApiDb + + +@pytest.fixture +def api_db(setup_module): + configs: dict = { + "api_endpoint": "https://api.example.com", + "api_token": "test-token", + "public": False, + "author_id": "Someone", + "group_id": "123", + "use_api_cache": True, + "store_results": True, + "tags": "[]", + } + return setup_module(AAApiDb, configs) + + +@pytest.fixture +def metadata(): + metadata = Metadata() + metadata.set("_processed_at", "2021-01-01T00:00:00") + metadata.set_url("https://example.com") + return metadata + + +def test_fetch_no_cache(api_db, metadata): + # Test fetch + api_db.use_api_cache = False + assert api_db.fetch(metadata) is None + + +def test_fetch_fail_status(api_db, metadata): + # Test response fail in fetch method + with patch("auto_archiver.modules.api_db.api_db.requests.get") as mock_get: + mock_get.return_value.status_code = 400 + mock_get.return_value.json.return_value = {} + with patch("loguru.logger.error") as mock_error: + assert api_db.fetch(metadata) is False + mock_error.assert_called_once_with("AA API FAIL (400): {}") + + +def test_fetch(api_db, metadata): + # Test successful fetch method + with patch("auto_archiver.modules.api_db.api_db.requests.get") as mock_get,\ + patch("auto_archiver.core.metadata.datetime.datetime") as mock_datetime: + mock_datetime.now.return_value = "2021-01-01T00:00:00" + mock_get.return_value.status_code = 200 + mock_get.return_value.json.return_value = [{"result": {}}, {"result": + {'media': [], 'metadata': {'_processed_at': '2021-01-01T00:00:00', 'url': 'https://example.com'}, + 'status': 'no archiver'}}] + assert api_db.fetch(metadata) == metadata + + +def test_done_success(api_db, metadata): + with patch("auto_archiver.modules.api_db.api_db.requests.post") as mock_post: + mock_post.return_value.status_code = 201 + api_db.done(metadata) + mock_post.assert_called_once() + mock_post.assert_called_once_with("https://api.example.com/interop/submit-archive", + json={'author_id': 'Someone', 'url': 'https://example.com', + 'public': False, 'group_id': '123', 'tags': ['[', ']'], 'result': '{"status": "no archiver", "metadata": {"_processed_at": "2021-01-01T00:00:00", "url": "https://example.com"}, "media": []}'}, + headers={'Authorization': 'Bearer test-token'}) + diff --git a/tests/extractors/test_instagram_tbot_extractor.py b/tests/extractors/test_instagram_tbot_extractor.py index d7a1e53..9df9983 100644 --- a/tests/extractors/test_instagram_tbot_extractor.py +++ b/tests/extractors/test_instagram_tbot_extractor.py @@ -1,11 +1,9 @@ import os -from typing import Type from unittest.mock import patch, MagicMock import pytest from auto_archiver.core import Metadata -from auto_archiver.core.extractor import Extractor from auto_archiver.modules.instagram_tbot_extractor import InstagramTbotExtractor from tests.extractors.test_extractor_base import TestExtractorBase @@ -13,82 +11,103 @@ TESTFILES = os.path.join(os.path.dirname(__file__), "testfiles") @pytest.fixture -def session_file(tmpdir): - """Fixture to create a test session file.""" - session_file = os.path.join(tmpdir, "test_session.session") - with open(session_file, "w") as f: - f.write("mock_session_data") - return session_file.replace(".session", "") - - -@pytest.fixture(autouse=True) def patch_extractor_methods(request, setup_module): with patch.object(InstagramTbotExtractor, '_prepare_session_file', return_value=None), \ patch.object(InstagramTbotExtractor, '_initialize_telegram_client', return_value=None): - if hasattr(request, 'cls') and hasattr(request.cls, 'config'): - request.cls.extractor = setup_module("instagram_tbot_extractor", request.cls.config) - yield +@pytest.fixture(autouse=True) +def mock_sleep(): + """Globally mock time.sleep to avoid delays.""" + with patch("time.sleep") as mock_sleep: + yield mock_sleep + + @pytest.fixture def metadata_sample(): m = Metadata() m.set_title("Test Title") - m.set_timestamp("2021-01-01T00:00:00Z") + m.set_timestamp("2021-01-01T00:00:00") m.set_url("https://www.instagram.com/p/1234567890") return m -class TestInstagramTbotExtractor: +@pytest.fixture +def mock_telegram_client(): + """Fixture to mock TelegramClient interactions.""" + with patch("auto_archiver.modules.instagram_tbot_extractor.client") as mock_client: + instance = MagicMock() + mock_client.return_value = instance + yield instance + +@pytest.fixture +def extractor(setup_module, patch_extractor_methods): extractor_module = "instagram_tbot_extractor" - extractor: InstagramTbotExtractor config = { "api_id": 12345, "api_hash": "test_api_hash", "session_file": "test_session", + "timeout": 4 + } + extractor = setup_module(extractor_module, config) + extractor.client = MagicMock() + extractor.session_file = "test_session" + return extractor + + +def test_non_instagram_url(extractor, metadata_sample): + metadata_sample.set_url("https://www.youtube.com") + assert extractor.download(metadata_sample) is False + +def test_download_success(extractor, metadata_sample): + with patch.object(extractor, "_send_url_to_bot", return_value=(MagicMock(), 101)), \ + patch.object(extractor, "_process_messages", return_value="Sample Instagram post caption"): + result = extractor.download(metadata_sample) + assert result.is_success() + assert result.status == "insta-via-bot: success" + assert result.metadata.get("title") == "Sample Instagram post caption" + + +def test_download_invalid(extractor, metadata_sample): + with patch.object(extractor, "_send_url_to_bot", return_value=(MagicMock(), 101)), \ + patch.object(extractor, "_process_messages", return_value="You must enter a URL to a post"): + assert extractor.download(metadata_sample) is False + + + +@pytest.mark.skip(reason="Requires authentication.") +class TestInstagramTbotExtractorReal(TestExtractorBase): + # To run these tests set the TELEGRAM_API_ID and TELEGRAM_API_HASH environment variables, and ensure the session file exists. + # Note these are true at this point in time, but changes to source media could be reason for failure. + extractor_module = "instagram_tbot_extractor" + extractor: InstagramTbotExtractor + config = { + "api_id": os.environ.get("TELEGRAM_API_ID"), + "api_hash": os.environ.get("TELEGRAM_API_HASH"), + "session_file": "secrets/anon-insta", } - @pytest.fixture - def mock_telegram_client(self): - """Fixture to mock TelegramClient interactions.""" - with patch("auto_archiver.modules.instagram_tbot_extractor._initialize_telegram_client") as mock_client: - instance = MagicMock() - mock_client.return_value = instance - yield instance - - def test_extractor_is_initialized(self): - assert self.extractor is not None - - - @patch("time.sleep") - @pytest.mark.parametrize("url, expected_status, bot_responses", [ - ("https://www.instagram.com/p/C4QgLbrIKXG", "insta-via-bot: success", [MagicMock(id=101, media=None, message="Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou")]), - ("https://www.instagram.com/reel/DEVLK8qoIbg/", "insta-via-bot: success", [MagicMock(id=101, media=None, message="Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol")]), - # todo tbot not working for stories :( - ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, [MagicMock(id=101, media=None, message="Media not found or unavailable")]), - ("https://www.youtube.com/watch?v=ymCMy8OffHM", False, []), - ("https://www.instagram.com/p/INVALID", False, [MagicMock(id=101, media=None, message="You must enter a URL to a post")]), + @pytest.mark.parametrize("url, expected_status, message, len_media", [ + ("https://www.instagram.com/p/C4QgLbrIKXG", "insta-via-bot: success", "Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou", 6), + ("https://www.instagram.com/reel/DEVLK8qoIbg/", "insta-via-bot: success", "Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol", 3), + # instagram tbot not working (potentially intermittently?) for stories - replace with a live story to retest + # ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, "Media not found or unavailable"), + # Seems to be working intermittently for highlights + # ("https://www.instagram.com/stories/highlights/17868810693068139/", "insta-via-bot: success", None, 50), + # Marking invalid url as success + ("https://www.instagram.com/p/INVALID", "insta-via-bot: success", "Media not found or unavailable", 0), + ("https://www.youtube.com/watch?v=ymCMy8OffHM", False, None, 0), ]) - def test_download(self, mock_sleep, url, expected_status, bot_responses, metadata_sample): + def test_download(self, url, expected_status, message, len_media, metadata_sample): """Test the `download()` method with various Instagram URLs.""" metadata_sample.set_url(url) - self.extractor.client = MagicMock() + result = self.extractor.download(metadata_sample) - pass - # TODO fully mock or use as authenticated test - # if expected_status: - # assert result.is_success() - # assert result.status == expected_status - # assert result.metadata.get("title") in [msg.message[:128] for msg in bot_responses if msg.message] - # else: - # assert result is False - - - - - # Test story -# Test expired story -# Test requires login/ access (?) -# Test post -# Test multiple images? \ No newline at end of file + if expected_status: + assert result.is_success() + assert result.status == expected_status + assert result.metadata.get("title") == message + assert len(result.media) == len_media + else: + assert result is False diff --git a/tests/feeders/test_gworksheet.py b/tests/feeders/test_gworksheet.py index e6f5cc6..016cfb2 100644 --- a/tests/feeders/test_gworksheet.py +++ b/tests/feeders/test_gworksheet.py @@ -1,3 +1,4 @@ +# Note this isn't a feeder, but contained as utility of the gsheet feeder module import pytest from unittest.mock import MagicMock diff --git a/tests/utils/test_misc.py b/tests/utils/test_misc.py new file mode 100644 index 0000000..e45c1c1 --- /dev/null +++ b/tests/utils/test_misc.py @@ -0,0 +1,146 @@ +import hashlib +import json +from datetime import datetime, timezone +from unittest.mock import Mock, patch + +import pytest + +from auto_archiver.utils.misc import ( + mkdir_if_not_exists, + expand_url, + getattr_or, + DateTimeEncoder, + dump_payload, + get_datetime_from_str, + update_nested_dict, + calculate_file_hash, + random_str, + get_timestamp +) + + +@pytest.fixture +def sample_file(tmp_path): + file_path = tmp_path / "test.txt" + file_path.write_text("test content") + return file_path + + +class TestDirectoryUtils: + def test_mkdir_creates_new_directory(self, tmp_path): + new_dir = tmp_path / "new_folder" + mkdir_if_not_exists(new_dir) + assert new_dir.exists() + assert new_dir.is_dir() + + def test_mkdir_exists_quietly(self, tmp_path): + existing_dir = tmp_path / "existing" + existing_dir.mkdir() + mkdir_if_not_exists(existing_dir) + assert existing_dir.exists() + +class TestURLExpansion: + @pytest.mark.parametrize("input_url,expected", [ + ("https://example.com", "https://example.com"), + ("https://t.co/test", "https://expanded.url") + ]) + def test_expand_url(self, input_url, expected): + mock_response = Mock() + mock_response.url = "https://expanded.url" + with patch('requests.get', return_value=mock_response): + + result = expand_url(input_url) + assert result == expected + + def test_expand_url_handles_errors(self, caplog): + with patch('requests.get', side_effect=Exception("Connection error")): + url = "https://t.co/error" + result = expand_url(url) + assert result == url + assert f"Failed to expand url {url}" in caplog.text + +class TestAttributeHandling: + class Sample: + exists = "value" + none = None + + @pytest.mark.parametrize("obj,attr,default,expected", [ + (Sample(), "exists", "default", "value"), + (Sample(), "none", "default", "default"), + (Sample(), "missing", "default", "default"), + (None, "anything", "fallback", "fallback"), + ]) + def test_getattr_or(self, obj, attr, default, expected): + # Test gets attribute or returns a default value + assert getattr_or(obj, attr, default) == expected + +class TestDateTimeHandling: + def test_datetime_encoder(self, sample_datetime): + result = json.dumps({"dt": sample_datetime}, cls=DateTimeEncoder) + loaded = json.loads(result) + assert loaded["dt"] == str(sample_datetime) + + def test_dump_payload(self, sample_datetime): + payload = {"timestamp": sample_datetime} + result = dump_payload(payload) + assert str(sample_datetime) in result + + @pytest.mark.parametrize("dt_str,fmt,expected", [ + ("2023-01-01 12:00:00+00:00", None, datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)), + ("20230101 120000", "%Y%m%d %H%M%S", datetime(2023, 1, 1, 12, 0)), + ("invalid", None, None), + ]) + def test_datetime_from_string(self, dt_str, fmt, expected): + result = get_datetime_from_str(dt_str, fmt) + if expected is None: + assert result is None + else: + assert result == expected.replace(tzinfo=result.tzinfo) + +class TestDictUtils: + @pytest.mark.parametrize("original,update,expected", [ + ({"a": 1}, {"b": 2}, {"a": 1, "b": 2}), + ({"nested": {"a": 1}}, {"nested": {"b": 2}}, {"nested": {"a": 1, "b": 2}}), + ({"a": {"b": {"c": 1}}}, {"a": {"b": {"c": 2}}}, {"a": {"b": {"c": 2}}}), + ]) + def test_update_nested_dict(self, original, update, expected): + update_nested_dict(original, update) + assert original == expected + +class TestHashingUtils: + def test_file_hashing(self, sample_file): + expected = hashlib.sha256(b"test content").hexdigest() + assert calculate_file_hash(str(sample_file)) == expected + + def test_large_file_hashing(self, tmp_path): + file_path = tmp_path / "large.bin" + content = b"0" * 16_000_000 * 2 # 32MB + file_path.write_bytes(content) + + expected = hashlib.sha256(content).hexdigest() + assert calculate_file_hash(str(file_path)) == expected + +class TestMiscUtils: + def test_random_str_length(self): + for length in [8, 16, 32]: + assert len(random_str(length)) == length + + def test_random_str_raises_too_long(self): + with pytest.raises(AssertionError) as exc_info: + random_str(64) + assert "length must be less than 32 as UUID4 is used" == str(exc_info.value) + + def test_random_str_uniqueness(self): + assert random_str() != random_str() + + @pytest.mark.parametrize("ts_input,utc,iso,expected_type", [ + (datetime.now(), True, True, str), + ("2023-01-01T12:00:00+00:00", False, False, datetime), + (1672574400, True, True, str), + ]) + def test_timestamp_parsing(self, ts_input, utc, iso, expected_type): + result = get_timestamp(ts_input, utc=utc, iso=iso) + assert isinstance(result, expected_type) + + def test_invalid_timestamp_returns_none(self): + assert get_timestamp("invalid-date") is None \ No newline at end of file From 7b88df72cb4d945321855039e133634f00e4fc3d Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 18 Feb 2025 19:46:57 +0000 Subject: [PATCH 07/15] Update test_metadata_enricher.py --- tests/data/metadata_enricher_exif.pickle | Bin 0 -> 1175 bytes .../metadata_enricher_ytshort_expected.pickle | Bin 12524 -> 12233 bytes .../metadata_enricher_ytshort_input.pickle | Bin 10840 -> 10840 bytes tests/enrichers/test_metadata_enricher.py | 10 +++++----- 4 files changed, 5 insertions(+), 5 deletions(-) create mode 100644 tests/data/metadata_enricher_exif.pickle diff --git a/tests/data/metadata_enricher_exif.pickle b/tests/data/metadata_enricher_exif.pickle new file mode 100644 index 0000000000000000000000000000000000000000..5607a9b92cb87837806cf436f42a98db5f651c72 GIT binary patch literal 1175 zcma)6O>fgc5RF8H(4P1Q%&`?D**L9|?v1z!O;MtV251im+402LVtcLKwVEO!!QD5O z-wW$FLBS!Ek)`#=n>TOX?)Yo{M_#Y3BPS~JQz_=Cl*ymEB2FXJ7)&>>zMJE3=8HMX z>z)ji`C#v!;EFG_5Ru8BHCsAP>Bp{GEyKH3?5!lK02f{s>E>-38qA4AoylY_0yM8{ z`D3L6k!f@N^HHrfutYOU1n*EPE)tkzv5ykA!Jyq~wka9%2w~#IxcyLB8SGUi=Un1k z3wd9`ZXX%^p}nke#Xlde(dojceu*24fG_ym(}n1BPh)GwH|iKT)H!9;VeKBY&)A8> zPU&+>8Qt{pa$a~{$tqPnm^(_Wjo>c5B*Y5CgSk_bF;|7nYU{yvWzdj!4cREJvccJX zdf14|>EP84Y_QUpthaB=SF8-|{*&=&Jb-_a@z@Kob(>;HE{CJx@1wBGwYj&)p-J21 z4Jr|3YwO=8T<-xHOhkg@8g_ttph94q-24Y-R0xFkJkZO(u&jBU9wjC00(rRHmt+$e zbR9|t#USt`gw1?a85#6Fg%MBe_PVndaFHz*v6n!ZMfzMTup0?iUNNWDY+Iy9ToDNg z#UNW35{s_{gO2Un^})3?nO{;Bl6Hc@jqAJeggK1Dkps^0roY1jV(qjxtcoVb7SRS0`N?t2ZHzga zpEC0?au~#$7guDYmv8>fBCR^vS5FNtGPzpMj>|&9(9+7p$ja1Ua<`U>fT@*%k(D8m z;C?LwWHpBRYejn)4X5<*L@ETO7U$=bmSpDVP01*o{89fAZx3gDZf0(3No7Im6u-%* zbmb6seb8-2aLBTFolq$@Tl2&->Usp6^KOs7D5yZbh!JB``>@IK$~`+mNk&->&{{WnZLOYHyP zUM2SjhsUNa4uAgs;cYN5^fiWu9-~GZ>L{2@TnQe;U&K-ezv3gY)Wf=4M%HfQ;7?*= zYBbl9B^6fI?(W?x7w8QuxH)rS;<$(4^rG29VBOq91SfeBsDU#xl@k@7R|FaGf+Cd^ zNe*!SYCrfX&t-~$pH+&oB8b6ZUg~XC=T(BoqM@}XZ3L=a57`~lu?bFu^psIuQmY^= z5}esJo5;D&3R!x08Ehh#wa0jZM*^(9;ulY(wrt$ufY~wCcy)%Lw8dBu4(IFuH7a$y9{@?OMF2t@Ek|*(3Wd@|0?D- z%^p<)7F}o|um(Lkb3l;jttE@T6WB7Fo)OY;WjZR$z}v{w4KGAzg%GW|PAFRq*Xv7J@NW9@$uAOon|^+JxIc5_G|m!@eE#%U3eJRGqG5kOMx&U$ oN0ZfYT7I;-u@ii}_VbeQdc}D2(r+M~s%wtzp{_@s62DRS3xGgQB>(^b diff --git a/tests/data/metadata_enricher_ytshort_input.pickle b/tests/data/metadata_enricher_ytshort_input.pickle index 5f1a4eb4317bf69f2ccf59306fb645985a70c648..2495c4696c8e0b08c6a29ec038797165741e00c9 100644 GIT binary patch delta 72 zcmcZ+awBBIEOsGbEhA2wH5(UBX5zle*u!0(npBXOo;q2PMSgM|a~osM=BLbjj2s5> V=EW5m>E)Y$vk0rgd6UJooB*Uq7@Gh9 delta 72 zcmcZ+awBBIEOuUDRSDihmp3k)%*1_;v4^`nHK`ymJ$14oi~Qs`<~Bx-%}<&67&)@y Wi!4%e%aS(#W)W6}^CpXFIROB@a2W{z diff --git a/tests/enrichers/test_metadata_enricher.py b/tests/enrichers/test_metadata_enricher.py index c6190ed..aedcf54 100644 --- a/tests/enrichers/test_metadata_enricher.py +++ b/tests/enrichers/test_metadata_enricher.py @@ -76,14 +76,14 @@ def test_get_metadata_error_handling(mock_run, mock_logger_error, enricher): mock_logger_error.assert_called_once() -@pytest.mark.skip(reason="Requires ExifTool to be installed. TODO mock") -def test_metadata_pickle(enricher, unpickle): - # Uses a pickle of a YouTube short +@patch("subprocess.run") +def test_metadata_pickle(mock_run, enricher, unpickle): + # Uses pickled values + mock_run.return_value = unpickle("metadata_enricher_exif.pickle") metadata = unpickle("metadata_enricher_ytshort_input.pickle") expected = unpickle("metadata_enricher_ytshort_expected.pickle") enricher.enrich(metadata) expected_media = expected.media actual_media = metadata.media assert len(expected_media) == len(actual_media) - assert actual_media[0].properties.get("metadata") == expected_media[0].properties.get("metadata") - assert metadata == expected \ No newline at end of file + assert actual_media[0].properties.get("metadata") == expected_media[0].properties.get("metadata") \ No newline at end of file From f0fd9bf44596485fb53ae4bd4347319d3a42806f Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 18 Feb 2025 23:32:03 +0000 Subject: [PATCH 08/15] Updates tests to use pytest-mock. --- poetry.lock | 20 +- pyproject.toml | 1 + tests/conftest.py | 26 ++- tests/databases/test_api_db.py | 52 +++--- tests/databases/test_gsheet_db.py | 39 ++-- tests/enrichers/test_meta_enricher.py | 19 +- tests/enrichers/test_metadata_enricher.py | 43 +++-- tests/enrichers/test_pdq_hash_enricher.py | 62 +++---- tests/enrichers/test_screenshot_enricher.py | 174 +++++++++--------- tests/enrichers/test_ssl_enricher.py | 36 ++-- tests/enrichers/test_thumbnail_enricher.py | 141 +++++++------- tests/enrichers/test_whisper_enricher.py | 72 ++++---- .../test_instagram_api_extractor.py | 123 ++++++------- .../test_instagram_tbot_extractor.py | 55 +++--- tests/feeders/test_gsheet_feeder.py | 114 ++++++------ tests/feeders/test_gworksheet.py | 9 +- tests/storages/test_S3_storage.py | 113 +++++------- tests/storages/test_gdrive_storage.py | 61 +++--- tests/utils/test_misc.py | 24 ++- 19 files changed, 584 insertions(+), 600 deletions(-) diff --git a/poetry.lock b/poetry.lock index c8524b4..83b2860 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1815,6 +1815,24 @@ loguru = "*" [package.extras] test = ["pytest", "pytest-cov"] +[[package]] +name = "pytest-mock" +version = "3.14.0" +description = "Thin-wrapper around the mock package for easier use with pytest" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "pytest-mock-3.14.0.tar.gz", hash = "sha256:2719255a1efeceadbc056d6bf3df3d1c5015530fb40cf347c0f9afac88410bd0"}, + {file = "pytest_mock-3.14.0-py3-none-any.whl", hash = "sha256:0b72c38033392a5f4621342fe11e9219ac11ec9d375f8e2a0c164539e0d70f6f"}, +] + +[package.dependencies] +pytest = ">=6.2.5" + +[package.extras] +dev = ["pre-commit", "pytest-asyncio", "tox"] + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -3166,4 +3184,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "b3a6142d6495bc4c8741e9411d29352af219851e4b84b263f991e1bb6db1614e" +content-hash = "2d0a953383901fe12e97f6f56a76a9d8008788695425792eedbf739a18585188" diff --git a/pyproject.toml b/pyproject.toml index 9823833..29de7e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,6 +63,7 @@ dependencies = [ pytest = "^8.3.4" autopep8 = "^2.3.1" pytest-loguru = "^0.4.0" +pytest-mock = "^3.14.0" [tool.poetry.group.docs.dependencies] sphinx = "^8.1.3" diff --git a/tests/conftest.py b/tests/conftest.py index 8deebff..a32ca48 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,7 +7,6 @@ from datetime import datetime, timezone from tempfile import TemporaryDirectory from typing import Dict, Tuple import hashlib -from unittest.mock import patch import pytest from auto_archiver.core.metadata import Metadata @@ -134,14 +133,29 @@ def unpickle(): @pytest.fixture -def mock_binary_dependencies(): - with patch("shutil.which") as mock_shutil_which: - # Mock all binary dependencies as available - mock_shutil_which.return_value = "/usr/bin/fake_binary" - yield mock_shutil_which +def mock_binary_dependencies(mocker): + mock_shutil_which = mocker.patch("shutil.which") + # Mock all binary dependencies as available + mock_shutil_which.return_value = "/usr/bin/fake_binary" + return mock_shutil_which @pytest.fixture def sample_datetime(): return datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc) + +@pytest.fixture(autouse=True) +def mock_sleep(mocker): + """Globally mock time.sleep to avoid delays.""" + return mocker.patch("time.sleep") + + +@pytest.fixture +def metadata(): + metadata = Metadata() + metadata.set("_processed_at", "2021-01-01T00:00:00") + metadata.set_title("Example Title") + metadata.set_content("Example Content") + metadata.set_url("https://example.com") + return metadata \ No newline at end of file diff --git a/tests/databases/test_api_db.py b/tests/databases/test_api_db.py index d07cb1a..6d7a2bc 100644 --- a/tests/databases/test_api_db.py +++ b/tests/databases/test_api_db.py @@ -1,5 +1,3 @@ -from unittest.mock import patch - import pytest from auto_archiver.core import Metadata @@ -35,35 +33,35 @@ def test_fetch_no_cache(api_db, metadata): assert api_db.fetch(metadata) is None -def test_fetch_fail_status(api_db, metadata): +def test_fetch_fail_status(api_db, metadata, mocker): # Test response fail in fetch method - with patch("auto_archiver.modules.api_db.api_db.requests.get") as mock_get: - mock_get.return_value.status_code = 400 - mock_get.return_value.json.return_value = {} - with patch("loguru.logger.error") as mock_error: - assert api_db.fetch(metadata) is False - mock_error.assert_called_once_with("AA API FAIL (400): {}") + mock_get = mocker.patch("auto_archiver.modules.api_db.api_db.requests.get") + mock_get.return_value.status_code = 400 + mock_get.return_value.json.return_value = {} + mock_error = mocker.patch("loguru.logger.error") + assert api_db.fetch(metadata) is False + mock_error.assert_called_once_with("AA API FAIL (400): {}") -def test_fetch(api_db, metadata): +def test_fetch(api_db, metadata, mocker): # Test successful fetch method - with patch("auto_archiver.modules.api_db.api_db.requests.get") as mock_get,\ - patch("auto_archiver.core.metadata.datetime.datetime") as mock_datetime: - mock_datetime.now.return_value = "2021-01-01T00:00:00" - mock_get.return_value.status_code = 200 - mock_get.return_value.json.return_value = [{"result": {}}, {"result": - {'media': [], 'metadata': {'_processed_at': '2021-01-01T00:00:00', 'url': 'https://example.com'}, - 'status': 'no archiver'}}] - assert api_db.fetch(metadata) == metadata + mock_get = mocker.patch("auto_archiver.modules.api_db.api_db.requests.get") + mock_datetime = mocker.patch("auto_archiver.core.metadata.datetime.datetime") + mock_datetime.now.return_value = "2021-01-01T00:00:00" + mock_get.return_value.status_code = 200 + mock_get.return_value.json.return_value = [{"result": {}}, {"result": + {'media': [], 'metadata': {'_processed_at': '2021-01-01T00:00:00', 'url': 'https://example.com'}, + 'status': 'no archiver'}}] + assert api_db.fetch(metadata) == metadata -def test_done_success(api_db, metadata): - with patch("auto_archiver.modules.api_db.api_db.requests.post") as mock_post: - mock_post.return_value.status_code = 201 - api_db.done(metadata) - mock_post.assert_called_once() - mock_post.assert_called_once_with("https://api.example.com/interop/submit-archive", - json={'author_id': 'Someone', 'url': 'https://example.com', - 'public': False, 'group_id': '123', 'tags': ['[', ']'], 'result': '{"status": "no archiver", "metadata": {"_processed_at": "2021-01-01T00:00:00", "url": "https://example.com"}, "media": []}'}, - headers={'Authorization': 'Bearer test-token'}) +def test_done_success(api_db, metadata, mocker): + mock_post = mocker.patch("auto_archiver.modules.api_db.api_db.requests.post") + mock_post.return_value.status_code = 201 + api_db.done(metadata) + mock_post.assert_called_once() + mock_post.assert_called_once_with("https://api.example.com/interop/submit-archive", + json={'author_id': 'Someone', 'url': 'https://example.com', + 'public': False, 'group_id': '123', 'tags': ['[', ']'], 'result': '{"status": "no archiver", "metadata": {"_processed_at": "2021-01-01T00:00:00", "url": "https://example.com"}, "media": []}'}, + headers={'Authorization': 'Bearer test-token'}) diff --git a/tests/databases/test_gsheet_db.py b/tests/databases/test_gsheet_db.py index 18a22f1..42a21b2 100644 --- a/tests/databases/test_gsheet_db.py +++ b/tests/databases/test_gsheet_db.py @@ -1,6 +1,4 @@ from datetime import datetime, timezone -from unittest.mock import MagicMock, patch - import pytest from auto_archiver.core import Metadata, Media @@ -9,8 +7,8 @@ from auto_archiver.modules.gsheet_feeder import GWorksheet @pytest.fixture -def mock_gworksheet(): - mock_gworksheet = MagicMock(spec=GWorksheet) +def mock_gworksheet(mocker): + mock_gworksheet = mocker.MagicMock(spec=GWorksheet) mock_gworksheet.col_exists.return_value = True mock_gworksheet.get_cell.return_value = "" mock_gworksheet.get_row.return_value = {} @@ -18,14 +16,14 @@ def mock_gworksheet(): @pytest.fixture -def mock_metadata(): - metadata: Metadata = MagicMock(spec=Metadata) +def mock_metadata(mocker): + metadata: Metadata = mocker.MagicMock(spec=Metadata) metadata.get_url.return_value = "http://example.com" metadata.status = "done" metadata.get_title.return_value = "Example Title" metadata.get.return_value = "Example Content" metadata.get_timestamp.return_value = "2025-01-01T00:00:00" - metadata.get_final_media.return_value = MagicMock(spec=Media) + metadata.get_final_media.return_value = mocker.MagicMock(spec=Media) metadata.get_all_media.return_value = [] metadata.get_media_by_id.return_value = None metadata.get_first_image.return_value = None @@ -47,21 +45,21 @@ def metadata(): @pytest.fixture -def mock_media(): +def mock_media(mocker): """Fixture for a mock Media object.""" - mock_media = MagicMock(spec=Media) + mock_media = mocker.MagicMock(spec=Media) mock_media.urls = ["http://example.com/media"] mock_media.get.return_value = "not-calculated" return mock_media @pytest.fixture -def gsheets_db(mock_gworksheet, setup_module): +def gsheets_db(mock_gworksheet, setup_module, mocker): db = setup_module("gsheet_db", { "allow_worksheets": "set()", "block_worksheets": "set()", "use_sheet_names_in_stored_paths": "True", }) - db._retrieve_gsheet = MagicMock(return_value=(mock_gworksheet, 1)) + db._retrieve_gsheet = mocker.MagicMock(return_value=(mock_gworksheet, 1)) return db @@ -109,27 +107,26 @@ def test_aborted(gsheets_db, mock_metadata, mock_gworksheet): mock_gworksheet.set_cell.assert_called_once_with(1, 'status', '') -def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls): - with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00'): - gsheets_db.done(metadata) +def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls, mocker): + mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') + gsheets_db.done(metadata) mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls) -def test_done_cached(gsheets_db, metadata, mock_gworksheet): - with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00'): - gsheets_db.done(metadata, cached=True) +def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker): + mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') + gsheets_db.done(metadata, cached=True) # Verify the status message includes "[cached]" call_args = mock_gworksheet.batch_set_cell.call_args[0][0] assert any(call[2].startswith("[cached]") for call in call_args) -def test_done_missing_media(gsheets_db, metadata, mock_gworksheet): +def test_done_missing_media(gsheets_db, metadata, mock_gworksheet, mocker): # clear media from metadata metadata.media = [] - with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", - return_value='2025-02-01T00:00:00+00:00'): - gsheets_db.done(metadata) + mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00') + gsheets_db.done(metadata) # Verify nothing media-related gets updated call_args = mock_gworksheet.batch_set_cell.call_args[0][0] media_fields = {'archive', 'screenshot', 'thumbnail', 'wacz', 'replaywebpage'} diff --git a/tests/enrichers/test_meta_enricher.py b/tests/enrichers/test_meta_enricher.py index a09aaa9..cc283c0 100644 --- a/tests/enrichers/test_meta_enricher.py +++ b/tests/enrichers/test_meta_enricher.py @@ -1,6 +1,5 @@ import datetime from datetime import datetime, timedelta, timezone -from unittest.mock import MagicMock, patch import pytest @@ -9,18 +8,18 @@ from auto_archiver.modules.meta_enricher import MetaEnricher @pytest.fixture -def mock_metadata(): +def mock_metadata(mocker): """Creates a mock Metadata object.""" - mock: Metadata = MagicMock(spec=Metadata) + mock: Metadata = mocker.MagicMock(spec=Metadata) mock.get_url.return_value = "https://example.com" mock.is_empty.return_value = False # Default to not empty mock.get_all_media.return_value = [] return mock @pytest.fixture -def mock_media(): +def mock_media(mocker): """Creates a mock Media object.""" - mock: Media = MagicMock(spec=Media) + mock: Media = mocker.MagicMock(spec=Media) mock.filename = "mock_file.txt" return mock @@ -90,14 +89,14 @@ def test_enrich_file_sizes_no_media(meta_enricher, metadata): assert metadata.get("total_size") == "0.0 bytes" -def test_enrich_archive_duration(meta_enricher, metadata): +def test_enrich_archive_duration(meta_enricher, metadata, mocker): # Set fixed "processed at" time in the past processed_at = datetime.now(timezone.utc) - timedelta(minutes=10, seconds=30) metadata.set("_processed_at", processed_at) # patch datetime - with patch("datetime.datetime") as mock_datetime: - mock_now = datetime.now(timezone.utc) - mock_datetime.now.return_value = mock_now - meta_enricher.enrich_archive_duration(metadata) + mock_datetime = mocker.patch("datetime.datetime") + mock_now = datetime.now(timezone.utc) + mock_datetime.now.return_value = mock_now + meta_enricher.enrich_archive_duration(metadata) assert metadata.get("archive_duration_seconds") == 630 \ No newline at end of file diff --git a/tests/enrichers/test_metadata_enricher.py b/tests/enrichers/test_metadata_enricher.py index aedcf54..888837d 100644 --- a/tests/enrichers/test_metadata_enricher.py +++ b/tests/enrichers/test_metadata_enricher.py @@ -1,14 +1,13 @@ -from unittest.mock import MagicMock, patch, Mock import pytest -from auto_archiver.core import Metadata, Media +from auto_archiver.core import Media @pytest.fixture -def mock_media(): +def mock_media(mocker): """Creates a mock Media object.""" - mock: Media = MagicMock(spec=Media) + mock: Media = mocker.MagicMock(spec=Media) mock.filename = "mock_file.txt" return mock @@ -26,8 +25,8 @@ def enricher(setup_module, mock_binary_dependencies): ("", {}), ], ) -@patch("subprocess.run") -def test_get_metadata(mock_run, enricher, output, expected): +def test_get_metadata(enricher, output, expected, mocker): + mock_run = mocker.patch("subprocess.run") mock_run.return_value.stdout = output mock_run.return_value.stderr = "" mock_run.return_value.returncode = 0 @@ -39,17 +38,17 @@ def test_get_metadata(mock_run, enricher, output, expected): ) -@patch("subprocess.run") -def test_get_metadata_exiftool_not_found(mock_run, enricher): +def test_get_metadata_exiftool_not_found(enricher, mocker): + mock_run = mocker.patch("subprocess.run") mock_run.side_effect = FileNotFoundError result = enricher.get_metadata("test.jpg") assert result == {} -def test_enrich_sets_metadata(enricher): - media1 = Mock(filename="img1.jpg") - media2 = Mock(filename="img2.jpg") - metadata = Mock() +def test_enrich_sets_metadata(enricher, mocker): + media1 = mocker.Mock(filename="img1.jpg") + media2 = mocker.Mock(filename="img2.jpg") + metadata = mocker.Mock() metadata.media = [media1, media2] enricher.get_metadata = lambda f: {"key": "value"} if f == "img1.jpg" else {} @@ -60,24 +59,23 @@ def test_enrich_sets_metadata(enricher): assert metadata.media == [media1, media2] -def test_enrich_empty_media(enricher): - metadata = Mock() +def test_enrich_empty_media(enricher, mocker): + metadata = mocker.Mock() metadata.media = [] # Should not raise errors enricher.enrich(metadata) -@patch("loguru.logger.error") -@patch("subprocess.run") -def test_get_metadata_error_handling(mock_run, mock_logger_error, enricher): - mock_run.side_effect = Exception("Test error") +def test_get_metadata_error_handling(enricher, mocker): + mocker.patch("subprocess.run", side_effect=Exception("Test error")) + mock_log = mocker.patch("loguru.logger.error") result = enricher.get_metadata("test.jpg") assert result == {} - mock_logger_error.assert_called_once() + assert "Error occurred: " in mock_log.call_args[0][0] -@patch("subprocess.run") -def test_metadata_pickle(mock_run, enricher, unpickle): +def test_metadata_pickle(enricher, unpickle, mocker): + mock_run = mocker.patch("subprocess.run") # Uses pickled values mock_run.return_value = unpickle("metadata_enricher_exif.pickle") metadata = unpickle("metadata_enricher_ytshort_input.pickle") @@ -86,4 +84,5 @@ def test_metadata_pickle(mock_run, enricher, unpickle): expected_media = expected.media actual_media = metadata.media assert len(expected_media) == len(actual_media) - assert actual_media[0].properties.get("metadata") == expected_media[0].properties.get("metadata") \ No newline at end of file + assert actual_media[0].properties.get("metadata") == expected_media[0].properties.get("metadata") + diff --git a/tests/enrichers/test_pdq_hash_enricher.py b/tests/enrichers/test_pdq_hash_enricher.py index e90cd22..9653734 100644 --- a/tests/enrichers/test_pdq_hash_enricher.py +++ b/tests/enrichers/test_pdq_hash_enricher.py @@ -1,5 +1,3 @@ -from unittest.mock import patch - import pytest from PIL import UnidentifiedImageError @@ -21,11 +19,11 @@ def metadata_with_images(): return m -def test_successful_enrich(metadata_with_images): +def test_successful_enrich(metadata_with_images, mocker): with ( - patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100)), - patch("PIL.Image.open"), - patch.object(Media, "is_image", return_value=True) as mock_is_image, + mocker.patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100)), + mocker.patch("PIL.Image.open"), + mocker.patch.object(Media, "is_image", return_value=True) as mock_is_image, ): enricher = PdqHashEnricher() enricher.enrich(metadata_with_images) @@ -35,27 +33,24 @@ def test_successful_enrich(metadata_with_images): assert media.get("pdq_hash") is not None -def test_enrich_skip_non_image(metadata_with_images): - with ( - patch.object(Media, "is_image", return_value=False), - patch("pdqhash.compute") as mock_pdq, - ): - enricher = PdqHashEnricher() - enricher.enrich(metadata_with_images) - mock_pdq.assert_not_called() +def test_enrich_skip_non_image(metadata_with_images, mocker): + mocker.patch.object(Media, "is_image", return_value=False) + mock_pdq = mocker.patch("pdqhash.compute") + + enricher = PdqHashEnricher() + enricher.enrich(metadata_with_images) + mock_pdq.assert_not_called() -def test_enrich_handles_corrupted_image(metadata_with_images): - with ( - patch("PIL.Image.open", side_effect=UnidentifiedImageError("Corrupted image")), - patch("pdqhash.compute") as mock_pdq, - patch("loguru.logger.error") as mock_logger, - ): - enricher = PdqHashEnricher() - enricher.enrich(metadata_with_images) +def test_enrich_handles_corrupted_image(metadata_with_images, mocker): + mocker.patch("PIL.Image.open", side_effect=UnidentifiedImageError("Corrupted image")) + mock_pdq = mocker.patch("pdqhash.compute") + mock_logger = mocker.patch("loguru.logger.error") + enricher = PdqHashEnricher() + enricher.enrich(metadata_with_images) - assert mock_logger.call_count == len(metadata_with_images.media) - mock_pdq.assert_not_called() + assert mock_logger.call_count == len(metadata_with_images.media) + mock_pdq.assert_not_called() @pytest.mark.parametrize( @@ -66,19 +61,18 @@ def test_enrich_handles_corrupted_image(metadata_with_images): ("regular-image", True), ] ) -def test_enrich_excludes_by_filetype(media_id, should_have_hash): +def test_enrich_excludes_by_filetype(media_id, should_have_hash, mocker): metadata = Metadata() metadata.set_url("https://example.com") metadata.add_media(Media(filename="image.jpg").set("id", media_id)) - with ( - patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100)), - patch("PIL.Image.open"), - patch.object(Media, "is_image", return_value=True), - ): - enricher = PdqHashEnricher() - enricher.enrich(metadata) + mocker.patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100)) + mocker.patch("PIL.Image.open") + mocker.patch.object(Media, "is_image", return_value=True) - media_item = metadata.media[0] - assert (media_item.get("pdq_hash") is not None) == should_have_hash + enricher = PdqHashEnricher() + enricher.enrich(metadata) + + media_item = metadata.media[0] + assert (media_item.get("pdq_hash") is not None) == should_have_hash diff --git a/tests/enrichers/test_screenshot_enricher.py b/tests/enrichers/test_screenshot_enricher.py index 3998deb..25ca51d 100644 --- a/tests/enrichers/test_screenshot_enricher.py +++ b/tests/enrichers/test_screenshot_enricher.py @@ -1,5 +1,4 @@ import base64 -from unittest.mock import patch, MagicMock import pytest from selenium.common.exceptions import TimeoutException @@ -9,53 +8,47 @@ from auto_archiver.modules.screenshot_enricher import ScreenshotEnricher @pytest.fixture -def mock_selenium_env(): - # Patches Selenium calls and driver checks in one place. - with ( - patch("shutil.which") as mock_which, - patch("auto_archiver.utils.webdriver.CookieSettingDriver") as mock_driver_class, - patch( - "selenium.webdriver.common.selenium_manager.SeleniumManager.binary_paths" - ) as mock_binary_paths, - patch("pathlib.Path.is_file", return_value=True), - patch("subprocess.Popen") as mock_popen, - patch( - "selenium.webdriver.common.service.Service.is_connectable", - return_value=True, - ), - patch("selenium.webdriver.FirefoxOptions") as mock_firefox_options, - ): - # Mock driver existence - def mock_which_side_effect(dep): - return "/mock/geckodriver" if dep == "geckodriver" else None +def mock_selenium_env(mocker): + """Patches Selenium calls and driver checks in one place.""" - mock_which.side_effect = mock_which_side_effect - # Mock binary paths - mock_binary_paths.return_value = { - "driver_path": "/mock/driver", - "browser_path": "/mock/browser", - } - # Popen - mock_proc = MagicMock() - mock_proc.poll.return_value = None - mock_popen.return_value = mock_proc - # CookieSettingDriver -> returns a mock driver - mock_driver = MagicMock() - mock_driver_class.return_value = mock_driver - # FirefoxOptions - mock_options_instance = MagicMock() - mock_firefox_options.return_value = mock_options_instance - yield mock_driver, mock_driver_class, mock_options_instance + # Patch external dependencies + mock_which = mocker.patch("shutil.which") + mock_driver_class = mocker.patch("auto_archiver.utils.webdriver.CookieSettingDriver") + mock_binary_paths = mocker.patch("selenium.webdriver.common.selenium_manager.SeleniumManager.binary_paths") + mock_is_file = mocker.patch("pathlib.Path.is_file", return_value=True) + mock_popen = mocker.patch("subprocess.Popen") + mock_is_connectable = mocker.patch("selenium.webdriver.common.service.Service.is_connectable", return_value=True) + mock_firefox_options = mocker.patch("selenium.webdriver.FirefoxOptions") + # Define side effect for `shutil.which` + def mock_which_side_effect(dep): + return "/mock/geckodriver" if dep == "geckodriver" else None + mock_which.side_effect = mock_which_side_effect + + # Mock binary paths + mock_binary_paths.return_value = { + "driver_path": "/mock/driver", + "browser_path": "/mock/browser", + } + # Mock `subprocess.Popen` + mock_proc = mocker.MagicMock() + mock_proc.poll.return_value = None + mock_popen.return_value = mock_proc + # Mock `CookieSettingDriver` + mock_driver = mocker.MagicMock() + mock_driver_class.return_value = mock_driver + # Mock `FirefoxOptions` + mock_options_instance = mocker.MagicMock() + mock_firefox_options.return_value = mock_options_instance + yield mock_driver, mock_driver_class, mock_options_instance @pytest.fixture -def common_patches(tmp_path): - with ( - patch("auto_archiver.utils.url.is_auth_wall", return_value=False), - patch("os.path.join", return_value=str(tmp_path / "test.png")), - patch("time.sleep"), - ): - yield +def common_patches(tmp_path, mocker): + """Patches common utilities used across multiple tests.""" + mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=False) + mocker.patch("os.path.join", return_value=str(tmp_path / "test.png")) + mocker.patch("time.sleep") + yield @pytest.fixture @@ -117,37 +110,38 @@ def test_enrich_auth_wall( common_patches, url, is_auth, + mocker ): # Testing with and without is_auth_wall mock_driver, mock_driver_class, _ = mock_selenium_env - with patch("auto_archiver.utils.url.is_auth_wall", return_value=is_auth): - metadata_with_video.set_url(url) - screenshot_enricher.enrich(metadata_with_video) + mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=is_auth) + metadata_with_video.set_url(url) + screenshot_enricher.enrich(metadata_with_video) - if is_auth: - mock_driver.get.assert_not_called() - assert len(metadata_with_video.media) == 1 - assert metadata_with_video.media[0].properties.get("id") == "video1" - else: - mock_driver.get.assert_called_once_with(url) - assert len(metadata_with_video.media) == 2 - assert metadata_with_video.media[1].properties.get("id") == "screenshot" + if is_auth: + mock_driver.get.assert_not_called() + assert len(metadata_with_video.media) == 1 + assert metadata_with_video.media[0].properties.get("id") == "video1" + else: + mock_driver.get.assert_called_once_with(url) + assert len(metadata_with_video.media) == 2 + assert metadata_with_video.media[1].properties.get("id") == "screenshot" def test_handle_timeout_exception( - screenshot_enricher, metadata_with_video, mock_selenium_env + screenshot_enricher, metadata_with_video, mock_selenium_env, mocker ): mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env mock_driver.get.side_effect = TimeoutException - with patch("loguru.logger.info") as mock_log: - screenshot_enricher.enrich(metadata_with_video) - mock_log.assert_called_once_with("TimeoutException loading page for screenshot") - assert len(metadata_with_video.media) == 1 + mock_log = mocker.patch("loguru.logger.info") + screenshot_enricher.enrich(metadata_with_video) + mock_log.assert_called_once_with("TimeoutException loading page for screenshot") + assert len(metadata_with_video.media) == 1 def test_handle_general_exception( - screenshot_enricher, metadata_with_video, mock_selenium_env + screenshot_enricher, metadata_with_video, mock_selenium_env, mocker ): """Test proper handling of unexpected general exceptions""" mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env @@ -155,47 +149,43 @@ def test_handle_general_exception( mock_driver.get.return_value = None mock_driver.save_screenshot.side_effect = Exception("Unexpected Error") - with patch("loguru.logger.error") as mock_log: - screenshot_enricher.enrich(metadata_with_video) - # Verify that the exception was logged with the log - mock_log.assert_called_once_with( - "Got error while loading webdriver for screenshot enricher: Unexpected Error" - ) - # And no new media was added due to the error - assert len(metadata_with_video.media) == 1 + mock_log = mocker.patch("loguru.logger.error") + screenshot_enricher.enrich(metadata_with_video) + # Verify that the exception was logged with the log + mock_log.assert_called_once_with( + "Got error while loading webdriver for screenshot enricher: Unexpected Error" + ) + # And no new media was added due to the error + assert len(metadata_with_video.media) == 1 -def test_pdf_creation(screenshot_enricher, metadata_with_video, mock_selenium_env): +def test_pdf_creation(mocker, screenshot_enricher, metadata_with_video, mock_selenium_env): """Test PDF creation when save_to_pdf is enabled""" mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env - # Override the save_to_pdf option screenshot_enricher.save_to_pdf = True # Mock the print_page method to return base64-encoded content - mock_driver.print_page.return_value = base64.b64encode(b"fake_pdf_content").decode( - "utf-8" + mock_driver.print_page.return_value = base64.b64encode(b"fake_pdf_content").decode("utf-8") + # Patch functions with mocker + mock_os_path_join = mocker.patch("os.path.join", side_effect=lambda *args: f"{args[-1]}") + mock_random_str = mocker.patch( + "auto_archiver.modules.screenshot_enricher.screenshot_enricher.random_str", + return_value="fixed123", ) - with ( - patch("os.path.join", side_effect=lambda *args: f"{args[-1]}"), - patch( - "auto_archiver.modules.screenshot_enricher.screenshot_enricher.random_str", - return_value="fixed123", - ), - patch("builtins.open", new_callable=MagicMock()) as mock_open, - patch("loguru.logger.error") as mock_log, - ): - screenshot_enricher.enrich(metadata_with_video) + mock_open = mocker.patch("builtins.open", new_callable=mocker.mock_open) + mock_log_error = mocker.patch("loguru.logger.error") - # Verify screenshot and PDF creation - mock_driver.save_screenshot.assert_called_once() - mock_driver.print_page.assert_called_once_with(mock_driver.print_options) + screenshot_enricher.enrich(metadata_with_video) + # Verify screenshot and PDF creation + mock_driver.save_screenshot.assert_called_once() + mock_driver.print_page.assert_called_once_with(mock_driver.print_options) + # Check that PDF file was opened and written + mock_open.assert_any_call("pdf_fixed123.pdf", "wb") - # Check that PDF file was opened and written - mock_open.assert_any_call("pdf_fixed123.pdf", "wb") - # Ensure both screenshot and PDF were added as media - assert len(metadata_with_video.media) == 3 # Original video + screenshot + PDF - assert metadata_with_video.media[1].properties.get("id") == "screenshot" - assert metadata_with_video.media[2].properties.get("id") == "pdf" + # Ensure both screenshot and PDF were added as media + assert len(metadata_with_video.media) == 3 + assert metadata_with_video.media[1].properties.get("id") == "screenshot" + assert metadata_with_video.media[2].properties.get("id") == "pdf" @pytest.fixture(autouse=True) diff --git a/tests/enrichers/test_ssl_enricher.py b/tests/enrichers/test_ssl_enricher.py index 29775f2..eb7ba6b 100644 --- a/tests/enrichers/test_ssl_enricher.py +++ b/tests/enrichers/test_ssl_enricher.py @@ -1,6 +1,4 @@ import ssl -from unittest.mock import patch, mock_open - import pytest from auto_archiver.core import Metadata, Media @@ -35,22 +33,22 @@ def test_empty_metadata(metadata, enricher): assert enricher.enrich(metadata) is None -def test_ssl_enrich(metadata, enricher): - with patch("ssl.get_server_certificate", return_value="TEST_CERT"), \ - patch("builtins.open", mock_open()) as mock_file: - media_len_before = len(metadata.media) +def test_ssl_enrich(metadata, enricher, mocker): + mocker.patch("ssl.get_server_certificate", return_value="TEST_CERT") + mock_file = mocker.patch("builtins.open", mocker.mock_open()) + media_len_before = len(metadata.media) + enricher.enrich(metadata) + + ssl.get_server_certificate.assert_called_once_with(("example.com", 443)) + mock_file.assert_called_once_with(f"{enricher.tmp_dir}/example-com.pem", "w") + mock_file().write.assert_called_once_with("TEST_CERT") + assert len(metadata.media) == media_len_before + 1 + # Ensure the certificate is added to metadata + assert any(media.filename.endswith("example-com.pem") for media in metadata.media) + + +def test_ssl_error_handling(enricher, metadata, mocker): + mocker.patch("ssl.get_server_certificate", side_effect=ssl.SSLError("SSL error")) + with pytest.raises(ssl.SSLError, match="SSL error"): enricher.enrich(metadata) - ssl.get_server_certificate.assert_called_once_with(("example.com", 443)) - mock_file.assert_called_once_with(f"{enricher.tmp_dir}/example-com.pem", "w") - mock_file().write.assert_called_once_with("TEST_CERT") - assert len(metadata.media) == media_len_before + 1 - # Ensure the certificate is added to metadata - assert any(media.filename.endswith("example-com.pem") for media in metadata.media) - - -def test_ssl_error_handling(enricher, metadata): - with patch("ssl.get_server_certificate", side_effect=ssl.SSLError("SSL error")): - with pytest.raises(ssl.SSLError, match="SSL error"): - enricher.enrich(metadata) - diff --git a/tests/enrichers/test_thumbnail_enricher.py b/tests/enrichers/test_thumbnail_enricher.py index 14cfa0e..effc25e 100644 --- a/tests/enrichers/test_thumbnail_enricher.py +++ b/tests/enrichers/test_thumbnail_enricher.py @@ -1,5 +1,4 @@ import pytest -from unittest.mock import patch, MagicMock from auto_archiver.core import Metadata, Media from auto_archiver.modules.thumbnail_enricher import ThumbnailEnricher @@ -22,32 +21,30 @@ def metadata_with_video(): @pytest.fixture -def mock_ffmpeg_environment(): +def mock_ffmpeg_environment(mocker): # Mocking all the ffmpeg calls in one place - with ( - patch("ffmpeg.input") as mock_ffmpeg_input, - patch("os.makedirs") as mock_makedirs, - patch.object(Media, "is_video", return_value=True), - patch( - "ffmpeg.probe", - return_value={ - "streams": [ - {"codec_type": "video", "duration": "120"} - ] # Default 2-minute duration, but can override in tests - }, - ) as mock_probe, - ): - mock_output = MagicMock() - mock_ffmpeg_input.return_value.filter.return_value.output.return_value = ( - mock_output - ) + mock_ffmpeg_input = mocker.patch("ffmpeg.input") + mock_makedirs = mocker.patch("os.makedirs") + mocker.patch.object(Media, "is_video", return_value=True), + mock_probe = mocker.patch( + "ffmpeg.probe", + return_value={ + "streams": [ + {"codec_type": "video", "duration": "120"} + ] # Default 2-minute duration, but can override in tests + }, + ) + mock_output = mocker.MagicMock() + mock_ffmpeg_input.return_value.filter.return_value.output.return_value = ( + mock_output + ) - yield { - "mock_ffmpeg_input": mock_ffmpeg_input, - "mock_makedirs": mock_makedirs, - "mock_output": mock_output, - "mock_probe": mock_probe, - } + return { + "mock_ffmpeg_input": mock_ffmpeg_input, + "mock_makedirs": mock_makedirs, + "mock_output": mock_output, + "mock_probe": mock_probe, + } @pytest.mark.parametrize("thumbnails_per_minute, max_thumbnails, expected_count", [ @@ -68,28 +65,26 @@ def test_enrich_thumbnail_limits( thumbnails = metadata_with_video.media[0].get("thumbnails") assert len(thumbnails) == expected_count -def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video): - with ( - patch("ffmpeg.probe", side_effect=Exception("Probe error")), - patch("os.makedirs"), - patch("loguru.logger.error") as mock_logger, - patch.object(Media, "is_video", return_value=True), - ): - thumbnail_enricher.enrich(metadata_with_video) - # Ensure error was logged - mock_logger.assert_called_with( - f"error getting duration of video video.mp4: Probe error" - ) - # Ensure no thumbnails were created - thumbnails = metadata_with_video.media[0].get("thumbnails") - assert thumbnails is None +def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, mocker): + + mocker.patch("ffmpeg.probe", side_effect=Exception("Probe error")) + mocker.patch("os.makedirs") + mock_logger = mocker.patch("loguru.logger.error") + mocker.patch.object(Media, "is_video", return_value=True) + + thumbnail_enricher.enrich(metadata_with_video) + # Ensure error was logged + mock_logger.assert_called_with( + f"error getting duration of video video.mp4: Probe error" + ) + # Ensure no thumbnails were created + thumbnails = metadata_with_video.media[0].get("thumbnails") + assert thumbnails is None -def test_enrich_skips_non_video_files(thumbnail_enricher, metadata_with_video): - with ( - patch.object(Media, "is_video", return_value=False), - patch("ffmpeg.input") as mock_ffmpeg, - ): +def test_enrich_skips_non_video_files(thumbnail_enricher, metadata_with_video, mocker): + mocker.patch.object(Media, "is_video", return_value=False) + mock_ffmpeg = mocker.patch("ffmpeg.input") thumbnail_enricher.enrich(metadata_with_video) mock_ffmpeg.assert_not_called() @@ -102,21 +97,21 @@ def test_enrich_skips_non_video_files(thumbnail_enricher, metadata_with_video): (12, 20, 2), # test caught by t/min ]) def test_enrich_handles_short_video( - thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, thumbnails_per_minute, max_thumbnails, expected_count + thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, thumbnails_per_minute, max_thumbnails, expected_count, mocker ): # override mock duration fake_duration = 10 - with patch( + mocker.patch( "ffmpeg.probe", return_value={ "streams": [{"codec_type": "video", "duration": str(fake_duration)}]}, - ): - thumbnail_enricher.thumbnails_per_minute = thumbnails_per_minute - thumbnail_enricher.max_thumbnails = max_thumbnails + ) + thumbnail_enricher.thumbnails_per_minute = thumbnails_per_minute + thumbnail_enricher.max_thumbnails = max_thumbnails - thumbnail_enricher.enrich(metadata_with_video) - assert mock_ffmpeg_environment["mock_output"].run.call_count == expected_count - thumbnails = metadata_with_video.media[0].get("thumbnails") - assert len(thumbnails) == expected_count + thumbnail_enricher.enrich(metadata_with_video) + assert mock_ffmpeg_environment["mock_output"].run.call_count == expected_count + thumbnails = metadata_with_video.media[0].get("thumbnails") + assert len(thumbnails) == expected_count def test_uses_existing_duration( @@ -128,28 +123,26 @@ def test_uses_existing_duration( assert mock_ffmpeg_environment["mock_output"].run.call_count == 4 -def test_enrich_metadata_structure(thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment): +def test_enrich_metadata_structure(thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, mocker): fake_duration = 120 - with patch("ffmpeg.probe", return_value={ - 'streams': [{'codec_type': 'video', 'duration': str(fake_duration)}] - }): - thumbnail_enricher.thumbnails_per_minute = 2 - thumbnail_enricher.max_thumbnails = 4 + mocker.patch("ffmpeg.probe", return_value={'streams': [{'codec_type': 'video', 'duration': str(fake_duration)}]}) + thumbnail_enricher.thumbnails_per_minute = 2 + thumbnail_enricher.max_thumbnails = 4 - thumbnail_enricher.enrich(metadata_with_video) + thumbnail_enricher.enrich(metadata_with_video) - media_item = metadata_with_video.media[0] - thumbnails = media_item.get("thumbnails") + media_item = metadata_with_video.media[0] + thumbnails = media_item.get("thumbnails") - # Assert normal metadata - assert media_item.get("id") == "video1" - assert media_item.get("duration") == fake_duration - # Evenly spaced timestamps - expected_timestamps = ["24.000s", "48.000s", "72.000s", "96.000s"] - assert thumbnails is not None - assert len(thumbnails) == 4 + # Assert normal metadata + assert media_item.get("id") == "video1" + assert media_item.get("duration") == fake_duration + # Evenly spaced timestamps + expected_timestamps = ["24.000s", "48.000s", "72.000s", "96.000s"] + assert thumbnails is not None + assert len(thumbnails) == 4 - for index, thumbnail in enumerate(thumbnails): - assert thumbnail.filename is not None - assert thumbnail.properties.get("id") == f"thumbnail_{index}" - assert thumbnail.properties.get("timestamp") == expected_timestamps[index] + for index, thumbnail in enumerate(thumbnails): + assert thumbnail.filename is not None + assert thumbnail.properties.get("id") == f"thumbnail_{index}" + assert thumbnail.properties.get("timestamp") == expected_timestamps[index] diff --git a/tests/enrichers/test_whisper_enricher.py b/tests/enrichers/test_whisper_enricher.py index 873198f..ee1844a 100644 --- a/tests/enrichers/test_whisper_enricher.py +++ b/tests/enrichers/test_whisper_enricher.py @@ -1,18 +1,14 @@ -import shutil -import sys import pytest -from unittest.mock import MagicMock, patch + from auto_archiver.core import Metadata, Media from auto_archiver.modules.s3_storage import S3Storage - from auto_archiver.modules.whisper_enricher import WhisperEnricher - TEST_S3_URL = "http://cdn.example.com/test.mp4" @pytest.fixture -def enricher(): +def enricher(mocker): """Fixture with mocked S3 and API dependencies""" config = { "api_endpoint": "http://testapi", @@ -22,7 +18,7 @@ def enricher(): "action": "translate", "steps": {"storages": ["s3_storage"]} } - mock_s3 = MagicMock(spec=S3Storage) + mock_s3 = mocker.MagicMock(spec=S3Storage) mock_s3.get_cdn_url.return_value = TEST_S3_URL instance = WhisperEnricher() instance.name = "whisper_enricher" @@ -43,16 +39,16 @@ def metadata(): @pytest.fixture -def mock_requests(): - with patch("auto_archiver.modules.whisper_enricher.whisper_enricher.requests") as mock_requests: - mock_response = MagicMock() - mock_response.status_code = 201 - mock_response.json.return_value = {"id": "job123"} - mock_requests.post.return_value = mock_response - yield mock_requests +def mock_requests(mocker): + mock_requests = mocker.patch("auto_archiver.modules.whisper_enricher.whisper_enricher.requests") + mock_response = mocker.MagicMock() + mock_response.status_code = 201 + mock_response.json.return_value = {"id": "job123"} + mock_requests.post.return_value = mock_response + yield mock_requests -def test_successful_job_submission(enricher, metadata, mock_requests): +def test_successful_job_submission(enricher, metadata, mock_requests, mocker): """Test successful media processing with S3 configured""" whisper, mock_s3 = enricher # Configure mock S3 URL to match test expectation @@ -65,13 +61,13 @@ def test_successful_job_submission(enricher, metadata, mock_requests): metadata.media = [m] # Mock the complete API interaction chain - mock_status_response = MagicMock() + mock_status_response = mocker.MagicMock() mock_status_response.status_code = 200 mock_status_response.json.return_value = { "status": "success", "meta": {} } - mock_artifacts_response = MagicMock() + mock_artifacts_response = mocker.MagicMock() mock_artifacts_response.status_code = 200 mock_artifacts_response.json.return_value = [{ "data": [{"start": 0, "end": 5, "text": "test transcript"}] @@ -93,35 +89,39 @@ def test_successful_job_submission(enricher, metadata, mock_requests): # Verify job status checks assert mock_requests.get.call_count == 2 assert "artifact_0_text" in metadata.media[0].get("whisper_model") - assert metadata.media[0].get("whisper_model") == {'artifact_0_text': 'test transcript', 'job_artifacts_check': 'http://testapi/jobs/job123/artifacts', 'job_id': 'job123', 'job_status_check': 'http://testapi/jobs/job123'} + assert metadata.media[0].get("whisper_model") == {'artifact_0_text': 'test transcript', + 'job_artifacts_check': 'http://testapi/jobs/job123/artifacts', + 'job_id': 'job123', + 'job_status_check': 'http://testapi/jobs/job123'} - -def test_submit_job(enricher): +def test_submit_job(enricher, mocker): """Test job submission method""" whisper, _ = enricher m = Media("test.mp4") m.add_url(TEST_S3_URL) - with patch("auto_archiver.modules.whisper_enricher.whisper_enricher.requests") as mock_requests: - mock_response = MagicMock() - mock_response.status_code = 201 - mock_response.json.return_value = {"id": "job123"} - mock_requests.post.return_value = mock_response - job_id = whisper.submit_job(m) + mock_requests = mocker.patch("auto_archiver.modules.whisper_enricher.whisper_enricher.requests") + mock_response = mocker.MagicMock() + mock_response.status_code = 201 + mock_response.json.return_value = {"id": "job123"} + mock_requests.post.return_value = mock_response + job_id = whisper.submit_job(m) assert job_id == "job123" -def test_submit_raises_status(enricher): + +def test_submit_raises_status(enricher, mocker): whisper, _ = enricher m = Media("test.mp4") m.add_url(TEST_S3_URL) - with patch("auto_archiver.modules.whisper_enricher.whisper_enricher.requests") as mock_requests: - mock_response = MagicMock() - mock_response.status_code = 400 - mock_response.json.return_value = {"id": "job123"} - mock_requests.post.return_value = mock_response - with pytest.raises(AssertionError) as exc_info: - whisper.submit_job(m) - assert str(exc_info.value) == "calling the whisper api http://testapi returned a non-success code: 400" + mock_requests = mocker.patch("auto_archiver.modules.whisper_enricher.whisper_enricher.requests") + mock_response = mocker.MagicMock() + mock_response.status_code = 400 + mock_response.json.return_value = {"id": "job123"} + mock_requests.post.return_value = mock_response + with pytest.raises(AssertionError) as exc_info: + whisper.submit_job(m) + assert str(exc_info.value) == "calling the whisper api http://testapi returned a non-success code: 400" + # @pytest.mark.parametrize("test_url, status", ["http://cdn.example.com/test.mp4",]) def test_submit_job_fails(enricher): @@ -131,5 +131,3 @@ def test_submit_job_fails(enricher): m.add_url("http://cdn.wrongurl.com/test.mp4") with pytest.raises(AssertionError): whisper.submit_job(m) - - diff --git a/tests/extractors/test_instagram_api_extractor.py b/tests/extractors/test_instagram_api_extractor.py index c119e3f..7eba8e9 100644 --- a/tests/extractors/test_instagram_api_extractor.py +++ b/tests/extractors/test_instagram_api_extractor.py @@ -1,15 +1,12 @@ from datetime import datetime -from typing import Type import pytest -from unittest.mock import patch, MagicMock from auto_archiver.core import Metadata from auto_archiver.modules.instagram_api_extractor.instagram_api_extractor import InstagramAPIExtractor from .test_extractor_base import TestExtractorBase - @pytest.fixture def mock_user_response(): return { @@ -115,74 +112,74 @@ class TestInstagramAPIExtractor(TestExtractorBase): # test gets text (metadata title) pass - def test_download_profile_basic(self, metadata, mock_user_response): + def test_download_profile_basic(self, metadata, mock_user_response, mocker): """Test basic profile download without full_profile""" - with patch.object(self.extractor, 'call_api') as mock_call, \ - patch.object(self.extractor, 'download_from_url') as mock_download: - # Mock API responses - mock_call.return_value = mock_user_response - mock_download.return_value = "profile.jpg" + mock_call = mocker.patch.object(self.extractor, 'call_api') + mock_download = mocker.patch.object(self.extractor, 'download_from_url') + # Mock API responses + mock_call.return_value = mock_user_response + mock_download.return_value = "profile.jpg" - result = self.extractor.download_profile(metadata, "test_user") - assert result.status == "insta profile: success" - assert result.get_title() == "Test User" - assert result.get("data") == self.extractor.cleanup_dict(mock_user_response["user"]) - # Verify profile picture download - mock_call.assert_called_once_with("v2/user/by/username", {"username": "test_user"}) - mock_download.assert_called_once_with("http://example.com/profile.jpg") - assert len(result.media) == 1 - assert result.media[0].filename == "profile.jpg" + result = self.extractor.download_profile(metadata, "test_user") + assert result.status == "insta profile: success" + assert result.get_title() == "Test User" + assert result.get("data") == self.extractor.cleanup_dict(mock_user_response["user"]) + # Verify profile picture download + mock_call.assert_called_once_with("v2/user/by/username", {"username": "test_user"}) + mock_download.assert_called_once_with("http://example.com/profile.jpg") + assert len(result.media) == 1 + assert result.media[0].filename == "profile.jpg" - def test_download_profile_full(self, metadata, mock_user_response, mock_story_response): + def test_download_profile_full(self, metadata, mock_user_response, mock_story_response, mocker): """Test full profile download with stories/posts""" - with patch.object(self.extractor, 'call_api') as mock_call, \ - patch.object(self.extractor, 'download_all_posts') as mock_posts, \ - patch.object(self.extractor, 'download_all_highlights') as mock_highlights, \ - patch.object(self.extractor, 'download_all_tagged') as mock_tagged, \ - patch.object(self.extractor, '_download_stories_reusable') as mock_stories: + mock_call = mocker.patch.object(self.extractor, 'call_api') + mock_posts = mocker.patch.object(self.extractor, 'download_all_posts') + mock_highlights = mocker.patch.object(self.extractor, 'download_all_highlights') + mock_tagged = mocker.patch.object(self.extractor, 'download_all_tagged') + mock_stories = mocker.patch.object(self.extractor, '_download_stories_reusable') - self.extractor.full_profile = True - mock_call.side_effect = [ - mock_user_response, - mock_story_response - ] - mock_highlights.return_value = None - mock_stories.return_value = mock_story_response - mock_posts.return_value = None - mock_tagged.return_value = None + self.extractor.full_profile = True + mock_call.side_effect = [ + mock_user_response, + mock_story_response + ] + mock_highlights.return_value = None + mock_stories.return_value = mock_story_response + mock_posts.return_value = None + mock_tagged.return_value = None - result = self.extractor.download_profile(metadata, "test_user") - assert result.get("#stories") == len(mock_story_response) - mock_posts.assert_called_once_with(result, "123") - assert "errors" not in result.metadata + result = self.extractor.download_profile(metadata, "test_user") + assert result.get("#stories") == len(mock_story_response) + mock_posts.assert_called_once_with(result, "123") + assert "errors" not in result.metadata - def test_download_profile_not_found(self, metadata): + def test_download_profile_not_found(self, metadata, mocker): """Test profile not found error""" - with patch.object(self.extractor, 'call_api') as mock_call: - mock_call.return_value = {"user": None} - with pytest.raises(AssertionError) as exc_info: - self.extractor.download_profile(metadata, "invalid_user") - assert "User invalid_user not found" in str(exc_info.value) + mock_call = mocker.patch.object(self.extractor, 'call_api') + mock_call.return_value = {"user": None} + with pytest.raises(AssertionError) as exc_info: + self.extractor.download_profile(metadata, "invalid_user") + assert "User invalid_user not found" in str(exc_info.value) - def test_download_profile_error_handling(self, metadata, mock_user_response): + def test_download_profile_error_handling(self, metadata, mock_user_response, mocker): """Test error handling in full profile mode""" - with (patch.object(self.extractor, 'call_api') as mock_call, \ - patch.object(self.extractor, 'download_all_highlights') as mock_highlights, \ - patch.object(self.extractor, 'download_all_tagged') as mock_tagged, \ - patch.object(self.extractor, '_download_stories_reusable') as stories_tagged, \ - patch.object(self.extractor, 'download_all_posts') as mock_posts - ): - self.extractor.full_profile = True - mock_call.side_effect = [ - mock_user_response, - Exception("Stories API failed"), - Exception("Posts API failed") - ] - mock_highlights.return_value = None - mock_tagged.return_value = None - stories_tagged.return_value = None - mock_posts.return_value = None - result = self.extractor.download_profile(metadata, "test_user") + mock_call = mocker.patch.object(self.extractor, 'call_api') + mock_highlights = mocker.patch.object(self.extractor, 'download_all_highlights') + mock_tagged = mocker.patch.object(self.extractor, 'download_all_tagged') + stories_tagged = mocker.patch.object(self.extractor, '_download_stories_reusable') + mock_posts = mocker.patch.object(self.extractor, 'download_all_posts') - assert result.is_success() - assert "Error downloading stories for test_user" in result.metadata["errors"] \ No newline at end of file + self.extractor.full_profile = True + mock_call.side_effect = [ + mock_user_response, + Exception("Stories API failed"), + Exception("Posts API failed") + ] + mock_highlights.return_value = None + mock_tagged.return_value = None + stories_tagged.return_value = None + mock_posts.return_value = None + result = self.extractor.download_profile(metadata, "test_user") + + assert result.is_success() + assert "Error downloading stories for test_user" in result.metadata["errors"] \ No newline at end of file diff --git a/tests/extractors/test_instagram_tbot_extractor.py b/tests/extractors/test_instagram_tbot_extractor.py index 9df9983..9238f89 100644 --- a/tests/extractors/test_instagram_tbot_extractor.py +++ b/tests/extractors/test_instagram_tbot_extractor.py @@ -1,5 +1,4 @@ import os -from unittest.mock import patch, MagicMock import pytest @@ -11,16 +10,10 @@ TESTFILES = os.path.join(os.path.dirname(__file__), "testfiles") @pytest.fixture -def patch_extractor_methods(request, setup_module): - with patch.object(InstagramTbotExtractor, '_prepare_session_file', return_value=None), \ - patch.object(InstagramTbotExtractor, '_initialize_telegram_client', return_value=None): - yield - -@pytest.fixture(autouse=True) -def mock_sleep(): - """Globally mock time.sleep to avoid delays.""" - with patch("time.sleep") as mock_sleep: - yield mock_sleep +def patch_extractor_methods(request, setup_module, mocker): + mocker.patch.object(InstagramTbotExtractor, '_prepare_session_file', return_value=None) + mocker.patch.object(InstagramTbotExtractor, '_initialize_telegram_client', return_value=None) + yield @pytest.fixture @@ -33,16 +26,16 @@ def metadata_sample(): @pytest.fixture -def mock_telegram_client(): +def mock_telegram_client(mocker): """Fixture to mock TelegramClient interactions.""" - with patch("auto_archiver.modules.instagram_tbot_extractor.client") as mock_client: - instance = MagicMock() - mock_client.return_value = instance - yield instance + mock_client = mocker.patch("auto_archiver.modules.instagram_tbot_extractor.client") + instance = mocker.MagicMock() + mock_client.return_value = instance + return instance @pytest.fixture -def extractor(setup_module, patch_extractor_methods): +def extractor(setup_module, patch_extractor_methods, mocker): extractor_module = "instagram_tbot_extractor" config = { "api_id": 12345, @@ -51,7 +44,7 @@ def extractor(setup_module, patch_extractor_methods): "timeout": 4 } extractor = setup_module(extractor_module, config) - extractor.client = MagicMock() + extractor.client = mocker.MagicMock() extractor.session_file = "test_session" return extractor @@ -60,20 +53,20 @@ def test_non_instagram_url(extractor, metadata_sample): metadata_sample.set_url("https://www.youtube.com") assert extractor.download(metadata_sample) is False -def test_download_success(extractor, metadata_sample): - with patch.object(extractor, "_send_url_to_bot", return_value=(MagicMock(), 101)), \ - patch.object(extractor, "_process_messages", return_value="Sample Instagram post caption"): - result = extractor.download(metadata_sample) + +def test_download_success(extractor, metadata_sample, mocker): + mocker.patch.object(extractor, "_send_url_to_bot", return_value=(mocker.MagicMock(), 101)) + mocker.patch.object(extractor, "_process_messages", return_value="Sample Instagram post caption") + result = extractor.download(metadata_sample) assert result.is_success() assert result.status == "insta-via-bot: success" assert result.metadata.get("title") == "Sample Instagram post caption" -def test_download_invalid(extractor, metadata_sample): - with patch.object(extractor, "_send_url_to_bot", return_value=(MagicMock(), 101)), \ - patch.object(extractor, "_process_messages", return_value="You must enter a URL to a post"): - assert extractor.download(metadata_sample) is False - +def test_download_invalid(extractor, metadata_sample, mocker): + mocker.patch.object(extractor, "_send_url_to_bot", return_value=(mocker.MagicMock(), 101)) + mocker.patch.object(extractor, "_process_messages", return_value="You must enter a URL to a post") + assert extractor.download(metadata_sample) is False @pytest.mark.skip(reason="Requires authentication.") @@ -89,8 +82,12 @@ class TestInstagramTbotExtractorReal(TestExtractorBase): } @pytest.mark.parametrize("url, expected_status, message, len_media", [ - ("https://www.instagram.com/p/C4QgLbrIKXG", "insta-via-bot: success", "Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou", 6), - ("https://www.instagram.com/reel/DEVLK8qoIbg/", "insta-via-bot: success", "Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol", 3), + ("https://www.instagram.com/p/C4QgLbrIKXG", "insta-via-bot: success", + "Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou", + 6), + ("https://www.instagram.com/reel/DEVLK8qoIbg/", "insta-via-bot: success", + "Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol", + 3), # instagram tbot not working (potentially intermittently?) for stories - replace with a live story to retest # ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, "Media not found or unavailable"), # Seems to be working intermittently for highlights diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py index b86e329..7c5f501 100644 --- a/tests/feeders/test_gsheet_feeder.py +++ b/tests/feeders/test_gsheet_feeder.py @@ -2,27 +2,23 @@ from typing import Type import gspread import pytest -from unittest.mock import patch, MagicMock from auto_archiver.modules.gsheet_feeder import GsheetsFeeder from auto_archiver.core import Metadata, Feeder -def test_setup_without_sheet_and_sheet_id(setup_module): +def test_setup_without_sheet_and_sheet_id(setup_module, mocker): # Ensure setup() raises AssertionError if neither sheet nor sheet_id is set. - with patch("gspread.service_account"): - with pytest.raises(AssertionError): - setup_module( - "gsheet_feeder", - {"service_account": "dummy.json", "sheet": None, "sheet_id": None}, - ) + mocker.patch("gspread.service_account") + with pytest.raises(AssertionError): + setup_module( + "gsheet_feeder", + {"service_account": "dummy.json", "sheet": None, "sheet_id": None}, + ) @pytest.fixture -def gsheet_feeder(setup_module) -> GsheetsFeeder: - with patch("gspread.service_account"): - feeder = setup_module( - "gsheet_feeder", - { +def gsheet_feeder(setup_module, mocker) -> GsheetsFeeder: + config: dict = { "service_account": "dummy.json", "sheet": "test-auto-archiver", "sheet_id": None, @@ -46,9 +42,13 @@ def gsheet_feeder(setup_module) -> GsheetsFeeder: "allow_worksheets": set(), "block_worksheets": set(), "use_sheet_names_in_stored_paths": True, - }, - ) - feeder.gsheets_client = MagicMock() + } + mocker.patch("gspread.service_account") + feeder = setup_module( + "gsheet_feeder", + config + ) + feeder.gsheets_client = mocker.MagicMock() return feeder @@ -129,56 +129,56 @@ def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder): ], ) def test_open_sheet_with_name_or_id( - setup_module, sheet, sheet_id, expected_method, expected_arg, description + setup_module, sheet, sheet_id, expected_method, expected_arg, description, mocker ): """Ensure open_sheet() correctly opens by name or ID based on configuration.""" - with patch("gspread.service_account") as mock_service_account: - mock_client = MagicMock() - mock_service_account.return_value = mock_client - mock_client.open.return_value = "MockSheet" - mock_client.open_by_key.return_value = "MockSheet" + mock_service_account = mocker.patch("gspread.service_account") + mock_client = mocker.MagicMock() + mock_service_account.return_value = mock_client + mock_client.open.return_value = "MockSheet" + mock_client.open_by_key.return_value = "MockSheet" - # Setup module with parameterized values - feeder = setup_module( - "gsheet_feeder", - {"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id}, - ) - sheet_result = feeder.open_sheet() - # Validate the correct method was called - getattr(mock_client, expected_method).assert_called_once_with( - expected_arg - ), f"Failed: {description}" - assert sheet_result == "MockSheet", f"Failed: {description}" + # Setup module with parameterized values + feeder = setup_module( + "gsheet_feeder", + {"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id}, + ) + sheet_result = feeder.open_sheet() + # Validate the correct method was called + getattr(mock_client, expected_method).assert_called_once_with( + expected_arg + ), f"Failed: {description}" + assert sheet_result == "MockSheet", f"Failed: {description}" @pytest.mark.usefixtures("setup_module") -def test_open_sheet_with_sheet_id(setup_module): +def test_open_sheet_with_sheet_id(setup_module, mocker): """Ensure open_sheet() correctly opens a sheet by ID.""" - with patch("gspread.service_account") as mock_service_account: - mock_client = MagicMock() - mock_service_account.return_value = mock_client - mock_client.open_by_key.return_value = "MockSheet" - feeder = setup_module( - "gsheet_feeder", - {"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"}, - ) - sheet = feeder.open_sheet() - mock_client.open_by_key.assert_called_once_with("ABC123") - assert sheet == "MockSheet" + mock_service_account = mocker.patch("gspread.service_account") + mock_client = mocker.MagicMock() + mock_service_account.return_value = mock_client + mock_client.open_by_key.return_value = "MockSheet" + feeder = setup_module( + "gsheet_feeder", + {"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"}, + ) + sheet = feeder.open_sheet() + mock_client.open_by_key.assert_called_once_with("ABC123") + assert sheet == "MockSheet" -def test_should_process_sheet(setup_module): - with patch("gspread.service_account"): - gdb = setup_module( - "gsheet_feeder", - { - "service_account": "dummy.json", - "sheet": "TestSheet", - "sheet_id": None, - "allow_worksheets": {"TestSheet", "Sheet2"}, - "block_worksheets": {"Sheet3"}, - }, - ) +def test_should_process_sheet(setup_module, mocker): + mocker.patch("gspread.service_account") + gdb = setup_module( + "gsheet_feeder", + { + "service_account": "dummy.json", + "sheet": "TestSheet", + "sheet_id": None, + "allow_worksheets": {"TestSheet", "Sheet2"}, + "block_worksheets": {"Sheet3"}, + }, + ) assert gdb.should_process_sheet("TestSheet") == True assert gdb.should_process_sheet("Sheet3") == False # False if allow_worksheets is set diff --git a/tests/feeders/test_gworksheet.py b/tests/feeders/test_gworksheet.py index 016cfb2..2b05504 100644 --- a/tests/feeders/test_gworksheet.py +++ b/tests/feeders/test_gworksheet.py @@ -1,14 +1,13 @@ # Note this isn't a feeder, but contained as utility of the gsheet feeder module import pytest -from unittest.mock import MagicMock from auto_archiver.modules.gsheet_feeder import GWorksheet class TestGWorksheet: @pytest.fixture - def mock_worksheet(self): - mock_ws = MagicMock() + def mock_worksheet(self, mocker): + mock_ws = mocker.MagicMock() mock_ws.get_values.return_value = [ ["Link", "Archive Status", "Archive Location", "Archive Date"], ["url1", "archived", "filepath1", "2023-01-01"], @@ -137,8 +136,8 @@ class TestGWorksheet: assert gworksheet.to_a1(row, col) == expected # Test empty worksheet - def test_empty_worksheet_initialization(self): - mock_ws = MagicMock() + def test_empty_worksheet_initialization(self, mocker): + mock_ws = mocker.MagicMock() mock_ws.get_values.return_value = [] g = GWorksheet(mock_ws) assert g.headers == [] diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py index 2a5d026..fe60329 100644 --- a/tests/storages/test_S3_storage.py +++ b/tests/storages/test_S3_storage.py @@ -1,6 +1,5 @@ from typing import Type import pytest -from unittest.mock import MagicMock, patch from auto_archiver.core import Media from auto_archiver.modules.s3_storage import S3Storage @@ -11,7 +10,6 @@ class TestS3Storage: """ module_name: str = "s3_storage" storage: Type[S3Storage] - s3: MagicMock config: dict = { "path_generator": "flat", "filename_generator": "static", @@ -25,13 +23,14 @@ class TestS3Storage: "private": False, } - @patch('boto3.client') @pytest.fixture(autouse=True) - def setup_storage(self, setup_module): + def setup_storage(self, setup_module, mocker): + self.s3 = S3Storage() self.storage = setup_module(self.module_name, self.config) def test_client_initialization(self): """Test that S3 client is initialized with correct parameters""" + assert self.storage.s3 is not None assert self.storage.s3.meta.region_name == 'test-region' @@ -44,81 +43,63 @@ class TestS3Storage: media.key = "another/path.jpg" assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg" - def test_uploadf_sets_acl_public(self): + def test_uploadf_sets_acl_public(self, mocker): media = Media("test.txt") - mock_file = MagicMock() - with patch.object(self.storage.s3, 'upload_fileobj') as mock_s3_upload, \ - patch.object(self.storage, 'is_upload_needed', return_value=True): - self.storage.uploadf(mock_file, media) - mock_s3_upload.assert_called_once_with( - mock_file, - Bucket='test-bucket', - Key=media.key, - ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/plain'} - ) + mock_file = mocker.MagicMock() + mock_s3_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj') + mocker.patch.object(self.storage, 'is_upload_needed', return_value=True) + self.storage.uploadf(mock_file, media) + mock_s3_upload.assert_called_once_with( + mock_file, + Bucket='test-bucket', + Key=media.key, + ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/plain'} + ) - def test_upload_decision_logic(self): + def test_upload_decision_logic(self, mocker): """Test is_upload_needed under different conditions""" media = Media("test.txt") - # Test default state (random_no_duplicate=False) assert self.storage.is_upload_needed(media) is True - # Set duplicate checking config to true: - self.storage.random_no_duplicate = True - with patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash') as mock_calc_hash, \ - patch.object(self.storage, 'file_in_folder') as mock_file_in_folder: - mock_calc_hash.return_value = 'beepboop123beepboop123beepboop123' - mock_file_in_folder.return_value = 'existing_key.txt' - # Test duplicate result - assert self.storage.is_upload_needed(media) is False - assert media.key == 'existing_key.txt' - mock_file_in_folder.assert_called_with( - # (first 24 chars of hash) - 'no-dups/beepboop123beepboop123be' - ) + mock_calc_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value='beepboop123beepboop123beepboop123') + mock_file_in_folder = mocker.patch.object(self.storage, 'file_in_folder', return_value='existing_key.txt') + assert self.storage.is_upload_needed(media) is False + assert media.key == 'existing_key.txt' + mock_file_in_folder.assert_called_with('no-dups/beepboop123beepboop123be') - - @patch.object(S3Storage, 'file_in_folder') - def test_skips_upload_when_duplicate_exists(self, mock_file_in_folder): + def test_skips_upload_when_duplicate_exists(self, mocker): """Test that upload skips when file_in_folder finds existing object""" self.storage.random_no_duplicate = True - mock_file_in_folder.return_value = "existing_folder/existing_file.txt" - # Create test media with calculated hash + mock_file_in_folder = mocker.patch.object(S3Storage, 'file_in_folder', return_value="existing_folder/existing_file.txt") media = Media("test.txt") media.key = "original_path.txt" - with patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash') as mock_calculate_hash: - mock_calculate_hash.return_value = "beepboop123beepboop123beepboop123" - # Verify upload - assert self.storage.is_upload_needed(media) is False - assert media.key == "existing_folder/existing_file.txt" - assert media.get("previously archived") is True - with patch.object(self.storage.s3, 'upload_fileobj') as mock_upload: - result = self.storage.uploadf(None, media) - mock_upload.assert_not_called() - assert result is True + mock_calculate_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value="beepboop123beepboop123beepboop123") + assert self.storage.is_upload_needed(media) is False + assert media.key == "existing_folder/existing_file.txt" + assert media.get("previously archived") is True + mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj') + result = self.storage.uploadf(None, media) + mock_upload.assert_not_called() + assert result is True - @patch.object(S3Storage, 'is_upload_needed') - def test_uploads_with_correct_parameters(self, mock_upload_needed): + def test_uploads_with_correct_parameters(self, mocker): media = Media("test.txt") media.key = "original_key.txt" - mock_upload_needed.return_value = True + mocker.patch.object(S3Storage, 'is_upload_needed', return_value=True) media.mimetype = 'image/png' - mock_file = MagicMock() + mock_file = mocker.MagicMock() + mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj') + self.storage.uploadf(mock_file, media) + mock_upload.assert_called_once_with( + mock_file, + Bucket='test-bucket', + Key='original_key.txt', + ExtraArgs={ + 'ACL': 'public-read', + 'ContentType': 'image/png' + } + ) - with patch.object(self.storage.s3, 'upload_fileobj') as mock_upload: - self.storage.uploadf(mock_file, media) - # verify call occured with these params - mock_upload.assert_called_once_with( - mock_file, - Bucket='test-bucket', - Key='original_key.txt', - ExtraArgs={ - 'ACL': 'public-read', - 'ContentType': 'image/png' - } - ) - - def test_file_in_folder_exists(self): - with patch.object(self.storage.s3, 'list_objects') as mock_list_objects: - mock_list_objects.return_value = {'Contents': [{'Key': 'path/to/file.txt'}]} - assert self.storage.file_in_folder('path/to/') == 'path/to/file.txt' \ No newline at end of file + def test_file_in_folder_exists(self, mocker): + mock_list_objects = mocker.patch.object(self.storage.s3, 'list_objects', return_value={'Contents': [{'Key': 'path/to/file.txt'}]}) + assert self.storage.file_in_folder('path/to/') == 'path/to/file.txt' diff --git a/tests/storages/test_gdrive_storage.py b/tests/storages/test_gdrive_storage.py index aba0a25..f5ff87c 100644 --- a/tests/storages/test_gdrive_storage.py +++ b/tests/storages/test_gdrive_storage.py @@ -1,44 +1,57 @@ from typing import Type import pytest -from unittest.mock import MagicMock, patch +from oauth2client import service_account + from auto_archiver.core import Media from auto_archiver.modules.gdrive_storage import GDriveStorage from auto_archiver.core.metadata import Metadata from tests.storages.test_storage_base import TestStorageBase -class TestGDriveStorage: - """ - Test suite for GDriveStorage. - """ - +@pytest.fixture +def gdrive_storage(setup_module, mocker): module_name: str = "gdrive_storage" - storage: Type[GDriveStorage] + storage: GDriveStorage config: dict = {'path_generator': 'url', 'filename_generator': 'static', 'root_folder_id': "fake_root_folder_id", 'oauth_token': None, 'service_account': 'fake_service_account.json' } - - @pytest.fixture(autouse=True) - def gdrive(self, setup_module): - with patch('google.oauth2.service_account.Credentials.from_service_account_file') as mock_creds: - self.storage = setup_module(self.module_name, self.config) - - def test_initialize_fails_with_non_existent_creds(self): - """ - Test that the Google Drive service raises a FileNotFoundError when the service account file does not exist. - """ - # Act and Assert - with pytest.raises(FileNotFoundError) as exc_info: - self.storage.setup() - assert "No such file or directory" in str(exc_info.value) + mocker.patch('google.oauth2.service_account.Credentials.from_service_account_file') + return setup_module(module_name, config) - def test_path_parts(self): - media = Media(filename="test.jpg") - media.key = "folder1/folder2/test.jpg" +def test_initialize_fails_with_non_existent_creds(setup_module): + """Test that the Google Drive service raises a FileNotFoundError when the service account file does not exist. + (and isn't mocked) + """ + config: dict = {'path_generator': 'url', + 'filename_generator': 'static', + 'root_folder_id': "fake_root_folder_id", + 'oauth_token': None, + 'service_account': 'fake_service_account.json' + } + with pytest.raises(FileNotFoundError) as exc_info: + setup_module("gdrive_storage", config) + assert "No such file or directory" in str(exc_info.value) + + +def test_get_id_from_parent_and_name(gdrive_storage, mocker): + """Test _get_id_from_parent_and_name returns correct id from an API result.""" + fake_list = mocker.MagicMock() + fake_list.execute.return_value = {"files": [{"id": "123", "name": "testname"}]} + fake_service = mocker.MagicMock() + # mock the files.list return value + fake_service.files.return_value.list.return_value = fake_list + gdrive_storage.service = fake_service + result = gdrive_storage._get_id_from_parent_and_name("parent", "mock", retries=1, use_mime_type=False) + assert result == "123" + +def test_path_parts(): + media = Media(filename="test.jpg") + media.key = "folder1/folder2/test.jpg" + @pytest.mark.skip(reason="Requires real credentials") diff --git a/tests/utils/test_misc.py b/tests/utils/test_misc.py index e45c1c1..0023077 100644 --- a/tests/utils/test_misc.py +++ b/tests/utils/test_misc.py @@ -1,7 +1,6 @@ import hashlib import json from datetime import datetime, timezone -from unittest.mock import Mock, patch import pytest @@ -44,20 +43,19 @@ class TestURLExpansion: ("https://example.com", "https://example.com"), ("https://t.co/test", "https://expanded.url") ]) - def test_expand_url(self, input_url, expected): - mock_response = Mock() + def test_expand_url(self, input_url, expected, mocker): + mock_response = mocker.Mock() mock_response.url = "https://expanded.url" - with patch('requests.get', return_value=mock_response): + mocker.patch('requests.get', return_value=mock_response) + result = expand_url(input_url) + assert result == expected - result = expand_url(input_url) - assert result == expected - - def test_expand_url_handles_errors(self, caplog): - with patch('requests.get', side_effect=Exception("Connection error")): - url = "https://t.co/error" - result = expand_url(url) - assert result == url - assert f"Failed to expand url {url}" in caplog.text + def test_expand_url_handles_errors(self, caplog, mocker): + mocker.patch('requests.get', side_effect=Exception("Connection error")) + url = "https://t.co/error" + result = expand_url(url) + assert result == url + assert f"Failed to expand url {url}" in caplog.text class TestAttributeHandling: class Sample: From 10a5ad62b837b356058a1d0680ff3d3d7d9f5f8d Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Wed, 19 Feb 2025 09:18:41 +0000 Subject: [PATCH 09/15] Include Atlos tests, metadata fixture. --- .../modules/atlos_db/__init__.py | 2 +- tests/conftest.py | 2 - tests/databases/test_api_db.py | 8 - tests/databases/test_atlos_db.py | 110 ++++++++++++++ tests/enrichers/test_meta_enricher.py | 8 - tests/feeders/test_atlos_feeder.py | 108 +++++++++++++ tests/storages/test_atlos_storage.py | 142 ++++++++++++++++++ 7 files changed, 361 insertions(+), 19 deletions(-) create mode 100644 tests/databases/test_atlos_db.py create mode 100644 tests/feeders/test_atlos_feeder.py create mode 100644 tests/storages/test_atlos_storage.py diff --git a/src/auto_archiver/modules/atlos_db/__init__.py b/src/auto_archiver/modules/atlos_db/__init__.py index 1552e39..e14d202 100644 --- a/src/auto_archiver/modules/atlos_db/__init__.py +++ b/src/auto_archiver/modules/atlos_db/__init__.py @@ -1 +1 @@ -from atlos_db import AtlosDb \ No newline at end of file +from .atlos_db import AtlosDb \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index a32ca48..2927735 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -155,7 +155,5 @@ def mock_sleep(mocker): def metadata(): metadata = Metadata() metadata.set("_processed_at", "2021-01-01T00:00:00") - metadata.set_title("Example Title") - metadata.set_content("Example Content") metadata.set_url("https://example.com") return metadata \ No newline at end of file diff --git a/tests/databases/test_api_db.py b/tests/databases/test_api_db.py index 6d7a2bc..5d1ea84 100644 --- a/tests/databases/test_api_db.py +++ b/tests/databases/test_api_db.py @@ -19,14 +19,6 @@ def api_db(setup_module): return setup_module(AAApiDb, configs) -@pytest.fixture -def metadata(): - metadata = Metadata() - metadata.set("_processed_at", "2021-01-01T00:00:00") - metadata.set_url("https://example.com") - return metadata - - def test_fetch_no_cache(api_db, metadata): # Test fetch api_db.use_api_cache = False diff --git a/tests/databases/test_atlos_db.py b/tests/databases/test_atlos_db.py new file mode 100644 index 0000000..82c07ef --- /dev/null +++ b/tests/databases/test_atlos_db.py @@ -0,0 +1,110 @@ +import pytest +from datetime import datetime + +from auto_archiver.core import Metadata +from auto_archiver.modules.atlos_db import AtlosDb + + +class FakeAPIResponse: + """Simulate a response object.""" + + def __init__(self, data: dict, raise_error: bool = False) -> None: + self._data = data + self.raise_error = raise_error + + def raise_for_status(self) -> None: + if self.raise_error: + raise Exception("HTTP error") + + +@pytest.fixture +def atlos_db(setup_module) -> AtlosDb: + """Fixture for AtlosDb.""" + configs: dict = { + "api_token": "abc123", + "atlos_url": "https://platform.atlos.org", + } + return setup_module("atlos_db", configs) + + +def test_failed_no_atlos_id(atlos_db, metadata, mocker): + """Test failed() skips posting when no atlos_id present.""" + post_mock = mocker.patch("requests.post") + atlos_db.failed(metadata, "failure reason") + post_mock.assert_not_called() + + +def test_failed_with_atlos_id(atlos_db, metadata, mocker): + """Test failed() posts failure when atlos_id is present.""" + metadata.set("atlos_id", 42) + fake_resp = FakeAPIResponse({}, raise_error=False) + post_mock = mocker.patch("requests.post", return_value=fake_resp) + atlos_db.failed(metadata, "failure reason") + expected_url = ( + f"{atlos_db.atlos_url}/api/v2/source_material/metadata/42/auto_archiver" + ) + expected_headers = {"Authorization": f"Bearer {atlos_db.api_token}"} + expected_json = { + "metadata": {"processed": True, "status": "error", "error": "failure reason"} + } + post_mock.assert_called_once_with( + expected_url, headers=expected_headers, json=expected_json + ) + + +def test_failed_http_error(atlos_db, metadata, mocker): + """Test failed() raises exception on HTTP error.""" + metadata.set("atlos_id", 42) + fake_resp = FakeAPIResponse({}, raise_error=True) + mocker.patch("requests.post", return_value=fake_resp) + with pytest.raises(Exception, match="HTTP error"): + atlos_db.failed(metadata, "failure reason") + + +def test_fetch_returns_false(atlos_db): + """Test fetch() always returns False.""" + item = Metadata() + assert atlos_db.fetch(item) is False + + +def test_done_no_atlos_id(atlos_db, mocker): + """Test done() skips posting when no atlos_id present.""" + item = Metadata().set_url("http://example.com") + post_mock = mocker.patch("requests.post") + atlos_db.done(item) + post_mock.assert_not_called() + + +def test_done_with_atlos_id(atlos_db, metadata, mocker): + """Test done() posts success when atlos_id is present.""" + metadata.set("atlos_id", 99) + now = datetime.now() + metadata.set("timestamp", now) + fake_resp = FakeAPIResponse({}, raise_error=False) + post_mock = mocker.patch("requests.post", return_value=fake_resp) + atlos_db.done(metadata) + expected_url = ( + f"{atlos_db.atlos_url}/api/v2/source_material/metadata/99/auto_archiver" + ) + expected_headers = {"Authorization": f"Bearer {atlos_db.api_token}"} + expected_results = metadata.metadata.copy() + expected_results["timestamp"] = now.isoformat() + expected_json = { + "metadata": { + "processed": True, + "status": "success", + "results": expected_results, + } + } + post_mock.assert_called_once_with( + expected_url, headers=expected_headers, json=expected_json + ) + + +def test_done_http_error(atlos_db, metadata, mocker): + """Test done() raises exception on HTTP error.""" + metadata.set("atlos_id", 123) + fake_resp = FakeAPIResponse({}, raise_error=True) + mocker.patch("requests.post", return_value=fake_resp) + with pytest.raises(Exception, match="HTTP error"): + atlos_db.done(metadata) diff --git a/tests/enrichers/test_meta_enricher.py b/tests/enrichers/test_meta_enricher.py index cc283c0..476e25b 100644 --- a/tests/enrichers/test_meta_enricher.py +++ b/tests/enrichers/test_meta_enricher.py @@ -23,14 +23,6 @@ def mock_media(mocker): mock.filename = "mock_file.txt" return mock -@pytest.fixture -def metadata(): - m = Metadata() - m.set_url("https://example.com") - m.set_title("Test Title") - m.set_content("Test Content") - return m - @pytest.fixture(autouse=True) def meta_enricher(setup_module): diff --git a/tests/feeders/test_atlos_feeder.py b/tests/feeders/test_atlos_feeder.py new file mode 100644 index 0000000..f26bdc9 --- /dev/null +++ b/tests/feeders/test_atlos_feeder.py @@ -0,0 +1,108 @@ +import pytest +from auto_archiver.modules.atlos_feeder import AtlosFeeder + + +class FakeAPIResponse: + """Simulate a response object.""" + + def __init__(self, data: dict, raise_error: bool = False) -> None: + self._data = data + self.raise_error = raise_error + + def json(self) -> dict: + return self._data + + def raise_for_status(self) -> None: + if self.raise_error: + raise Exception("HTTP error") + + +@pytest.fixture +def atlos_feeder(setup_module) -> AtlosFeeder: + """Fixture for AtlosFeeder.""" + configs: dict = { + "api_token": "abc123", + "atlos_url": "https://platform.atlos.org", + } + return setup_module("atlos_feeder", configs) + + +@pytest.fixture +def mock_atlos_api(mocker): + """Fixture to mock requests to Atlos API.""" + def _mock_responses(responses): + mocker.patch( + "requests.get", + side_effect=[FakeAPIResponse(data) for data in responses], + ) + return _mock_responses + + +def test_atlos_feeder_iter_yields_valid_metadata(atlos_feeder, mock_atlos_api): + """Test valid items are yielded and invalid ones ignored.""" + mock_atlos_api([ + { + "next": None, + "results": [ + {"source_url": "http://example.com", "id": 1, + "metadata": {"auto_archiver": {"processed": False}}, + "visibility": "visible", "status": "complete"}, + {"source_url": "", "id": 2, + "metadata": {"auto_archiver": {"processed": False}}, + "visibility": "visible", "status": "complete"}, + {"source_url": "http://example.org", "id": 3, + "metadata": {"auto_archiver": {"processed": True}}, + "visibility": "visible", "status": "complete"}, + ], + } + ]) + + items = list(atlos_feeder) + assert len(items) == 1 + assert items[0].get_url() == "http://example.com" + assert items[0].get("atlos_id") == 1 + + +def test_atlos_feeder_multiple_pages(atlos_feeder, mock_atlos_api): + """Test iteration over multiple pages with valid items.""" + mock_atlos_api([ + { + "next": "cursor2", + "results": [ + {"source_url": "http://example1.com", "id": 10, + "metadata": {"auto_archiver": {"processed": False}}, + "visibility": "visible", "status": "complete"}, + ], + }, + { + "next": None, + "results": [ + {"source_url": "http://example2.com", "id": 20, + "metadata": {"auto_archiver": {"processed": False}}, + "visibility": "visible", "status": "complete"}, + ], + }, + ]) + + items = list(atlos_feeder) + assert len(items) == 2 + assert items[0].get_url() == "http://example1.com" + assert items[0].get("atlos_id") == 10 + assert items[1].get_url() == "http://example2.com" + assert items[1].get("atlos_id") == 20 + + +def test_atlos_feeder_no_results(atlos_feeder, mock_atlos_api): + """Test iteration stops when no results are returned.""" + mock_atlos_api([{"next": None, "results": []}]) + assert list(atlos_feeder) == [] + + +def test_atlos_feeder_http_error(atlos_feeder, mocker): + """Test raises an exception on HTTP error.""" + mocker.patch( + "requests.get", + return_value=FakeAPIResponse({"next": None, "results": []}, raise_error=True), + ) + with pytest.raises(Exception, match="HTTP error"): + list(atlos_feeder) diff --git a/tests/storages/test_atlos_storage.py b/tests/storages/test_atlos_storage.py new file mode 100644 index 0000000..7528456 --- /dev/null +++ b/tests/storages/test_atlos_storage.py @@ -0,0 +1,142 @@ +import os +import hashlib +import pytest +from auto_archiver.core import Media, Metadata +from auto_archiver.modules.atlos_storage import AtlosStorage + + +class FakeAPIResponse: + """Simulate a response object.""" + + def __init__(self, data: dict, raise_error: bool = False) -> None: + self._data = data + self.raise_error = raise_error + + def json(self) -> dict: + return self._data + + def raise_for_status(self) -> None: + if self.raise_error: + raise Exception("HTTP error") + + +@pytest.fixture +def atlos_storage(setup_module) -> AtlosStorage: + """Fixture for AtlosStorage.""" + configs: dict = { + "api_token": "abc123", + "atlos_url": "https://platform.atlos.org", + } + return setup_module("atlos_storage", configs) + + +@pytest.fixture +def media(tmp_path) -> Media: + """Fixture for Media.""" + content = b"media content" + file_path = tmp_path / "media.txt" + file_path.write_bytes(content) + media = Media(filename=str(file_path)) + media.properties = {"something": "Title"} + media.key = "key" + return media + + +def test_get_cdn_url(atlos_storage: AtlosStorage) -> None: + """Test get_cdn_url returns the configured atlos_url.""" + media = Media(filename="dummy.mp4") + url = atlos_storage.get_cdn_url(media) + assert url == atlos_storage.atlos_url + + +def test_hash(tmp_path, atlos_storage: AtlosStorage) -> None: + """Test _hash() computes the correct SHA-256 hash of a file.""" + content = b"hello world" + file_path = tmp_path / "test.txt" + file_path.write_bytes(content) + media = Media(filename="dummy.mp4") + media.filename = str(file_path) + expected_hash = hashlib.sha256(content).hexdigest() + assert atlos_storage._hash(media) == expected_hash + + +def test_upload_no_atlos_id(tmp_path, atlos_storage: AtlosStorage, media: Media, mocker) -> None: + """Test upload() returns False when metadata lacks atlos_id.""" + metadata = Metadata() # atlos_id not set + post_mock = mocker.patch("requests.post") + result = atlos_storage.upload(media, metadata) + assert result is False + post_mock.assert_not_called() + + +def test_upload_already_uploaded(atlos_storage: AtlosStorage, + metadata: Metadata, + media: Media, + tmp_path, + mocker) -> None: + """Test upload() returns True if media hash already exists.""" + content = b"media content" + metadata.set("atlos_id", 101) + media_hash = hashlib.sha256(content).hexdigest() + fake_get = FakeAPIResponse({ + "result": {"artifacts": [{"file_hash_sha256": media_hash}]} + }) + get_mock = mocker.patch("requests.get", return_value=fake_get) + post_mock = mocker.patch("requests.post") + result = atlos_storage.upload(media, metadata) + assert result is True + get_mock.assert_called_once() + post_mock.assert_not_called() + + +def test_upload_not_uploaded(tmp_path, atlos_storage: AtlosStorage, + metadata: Metadata, + media: Media, + mocker) -> None: + """Test upload() uploads media when not already present.""" + metadata.set("atlos_id", 202) + fake_get = FakeAPIResponse({ + "result": {"artifacts": [{"file_hash_sha256": "different_hash"}]} + }) + get_mock = mocker.patch("requests.get", return_value=fake_get) + fake_post = FakeAPIResponse({}, raise_error=False) + post_mock = mocker.patch("requests.post", return_value=fake_post) + result = atlos_storage.upload(media, metadata) + assert result is True + get_mock.assert_called_once() + post_mock.assert_called_once() + expected_url = f"{atlos_storage.atlos_url}/api/v2/source_material/upload/202" + expected_headers = {"Authorization": f"Bearer {atlos_storage.api_token}"} + expected_params = {"title": media.properties} + call_kwargs = post_mock.call_args.kwargs + assert call_kwargs["headers"] == expected_headers + assert call_kwargs["params"] == expected_params + # Verify the URL passed to requests.post. + posted_url = call_kwargs.get("url") or post_mock.call_args.args[0] + assert posted_url == expected_url + # Verify files parameter contains the correct filename. + file_tuple = call_kwargs["files"]["file"] + assert file_tuple[0] == os.path.basename(media.filename) + + +def test_upload_post_http_error(tmp_path, + atlos_storage: AtlosStorage, + metadata: Metadata, + media: Media, + mocker) -> None: + """Test upload() propagates HTTP error during POST.""" + metadata.set("atlos_id", 303) + fake_get = FakeAPIResponse({ + "result": {"artifacts": []} + }) + mocker.patch("requests.get", return_value=fake_get) + fake_post = FakeAPIResponse({}, raise_error=True) + mocker.patch("requests.post", return_value=fake_post) + with pytest.raises(Exception, match="HTTP error"): + atlos_storage.upload(media, metadata) + + +def test_uploadf_not_implemented(atlos_storage: AtlosStorage) -> None: + """Test uploadf() returns None (not implemented).""" + result = atlos_storage.uploadf(None, "dummy") + assert result is None From ddf2e7662498d4965fed371dffaf7d92a11a6b6d Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Wed, 19 Feb 2025 09:24:34 +0000 Subject: [PATCH 10/15] Include Atlos Storage __init__.py for module recognition. --- src/auto_archiver/modules/atlos_storage/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 src/auto_archiver/modules/atlos_storage/__init__.py diff --git a/src/auto_archiver/modules/atlos_storage/__init__.py b/src/auto_archiver/modules/atlos_storage/__init__.py new file mode 100644 index 0000000..9e815c7 --- /dev/null +++ b/src/auto_archiver/modules/atlos_storage/__init__.py @@ -0,0 +1 @@ +from .atlos_storage import AtlosStorage \ No newline at end of file From eb60b271b91cbaabc65755dde9933dc0427ef82b Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 19 Feb 2025 10:28:35 +0000 Subject: [PATCH 11/15] Fix issue #200 --- src/auto_archiver/core/orchestrator.py | 35 ++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 208512a..f319e0d 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -95,6 +95,8 @@ class UniqueAppendAction(argparse.Action): class ArchivingOrchestrator: + setup_finished: bool = False + logger_id: int = None feeders: List[Type[Feeder]] extractors: List[Type[Extractor]] enrichers: List[Type[Enricher]] @@ -274,11 +276,18 @@ class ArchivingOrchestrator: def setup_logging(self, config): # setup loguru logging - logger.remove(0) # remove the default logger + try: + logger.remove(0) # remove the default logger + except ValueError: + pass + logging_config = config['logging'] - logger.add(sys.stderr, level=logging_config['level']) - if log_file := logging_config['file']: - logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation']) + + # add other logging info + if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0 + self.logger_id = logger.add(sys.stderr, level=logging_config['level']) + if log_file := logging_config['file']: + logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation']) def install_modules(self, modules_by_type): """ @@ -359,7 +368,10 @@ class ArchivingOrchestrator: def setup_config(self, args: list) -> dict: """ Sets up the configuration file, merging the default config with the user's config + + This function should only ever be run once. """ + self.setup_basic_parser() # parse the known arguments for now (basically, we want the config file) @@ -378,8 +390,19 @@ class ArchivingOrchestrator: def setup(self, args: list): """ - Main entry point for the orchestrator, sets up the basic parser, loads the config file, and sets up the complete parser + Function to configure all setup of the orchestrator: setup configs and load modules. + + This method should only ever be called once """ + + if self.setup_finished: + logger.warning("The `setup_config()` function should only ever be run once. \ + If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \ + For code implementatations, you should call .setup_config() once then you may call .feed() \ + multiple times to archive multiple URLs.") + return + + self.setup_basic_parser() self.config = self.setup_config(args) logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========") @@ -388,6 +411,8 @@ class ArchivingOrchestrator: # log out the modules that were loaded for module_type in BaseModule.MODULE_TYPES: logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s"))) + + self.setup_finished = True def _command_line_run(self, args: list) -> Generator[Metadata]: """ From a8ffb193254323d0734b1268fdc6541804968846 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Wed, 19 Feb 2025 10:40:54 +0000 Subject: [PATCH 12/15] Fix auth key name for cookies_from_browser. --- .../modules/generic_extractor/generic_extractor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 6bcb249..72fe3e0 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -280,7 +280,7 @@ class GenericExtractor(Extractor): # set up auth auth = self.auth_for_site(url, extract_cookies=False) - # order of importance: username/pasword -> api_key -> cookie -> cookie_from_browser -> cookies_file + # order of importance: username/pasword -> api_key -> cookie -> cookies_from_browser -> cookies_file if auth: if 'username' in auth and 'password' in auth: logger.debug(f'Using provided auth username and password for {url}') @@ -289,7 +289,7 @@ class GenericExtractor(Extractor): elif 'cookie' in auth: logger.debug(f'Using provided auth cookie for {url}') yt_dlp.utils.std_headers['cookie'] = auth['cookie'] - elif 'cookie_from_browser' in auth: + elif 'cookies_from_browser' in auth: logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}') ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser'] elif 'cookies_file' in auth: From a9802dd004336ede086e58aac679ae8920de112c Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 19 Feb 2025 12:25:35 +0000 Subject: [PATCH 13/15] Remove the global _LAZY_LOADED_MODULES and allow each instance of ArchivingOrchestrator to load its own modules --- docs/scripts/scripts.py | 4 +- src/auto_archiver/core/__init__.py | 2 +- src/auto_archiver/core/base_module.py | 36 ++-- src/auto_archiver/core/config.py | 4 +- src/auto_archiver/core/consts.py | 23 ++ src/auto_archiver/core/module.py | 203 ++++++++++-------- src/auto_archiver/core/orchestrator.py | 45 ++-- src/auto_archiver/core/storage.py | 4 +- .../modules/html_formatter/html_formatter.py | 3 +- .../whisper_enricher/whisper_enricher.py | 3 +- tests/conftest.py | 10 +- tests/enrichers/test_hash_enricher.py | 4 +- tests/test_modules.py | 26 +-- tests/test_orchestrator.py | 19 +- 14 files changed, 203 insertions(+), 183 deletions(-) create mode 100644 src/auto_archiver/core/consts.py diff --git a/docs/scripts/scripts.py b/docs/scripts/scripts.py index 9712439..a5f2998 100644 --- a/docs/scripts/scripts.py +++ b/docs/scripts/scripts.py @@ -1,6 +1,6 @@ # iterate through all the modules in auto_archiver.modules and turn the __manifest__.py file into a markdown table from pathlib import Path -from auto_archiver.core.module import available_modules +from auto_archiver.core.module import ModuleFactory from auto_archiver.core.base_module import BaseModule from ruamel.yaml import YAML import io @@ -41,7 +41,7 @@ def generate_module_docs(): configs_cheatsheet = "\n## Configuration Options\n" configs_cheatsheet += header_row - for module in sorted(available_modules(with_manifest=True), key=lambda x: (x.requires_setup, x.name)): + for module in sorted(ModuleFactory().available_modules(), key=lambda x: (x.requires_setup, x.name)): # generate the markdown file from the __manifest__.py file. manifest = module.manifest diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py index ae4c41c..78d9a3d 100644 --- a/src/auto_archiver/core/__init__.py +++ b/src/auto_archiver/core/__init__.py @@ -3,7 +3,7 @@ """ from .metadata import Metadata from .media import Media -from .module import BaseModule +from .base_module import BaseModule # cannot import ArchivingOrchestrator/Config to avoid circular dep # from .orchestrator import ArchivingOrchestrator diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index dfdd5ad..50ea3ff 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -1,13 +1,18 @@ -from urllib.parse import urlparse -from typing import Mapping, Any +from __future__ import annotations + +from typing import Mapping, Any, Type, TYPE_CHECKING from abc import ABC from copy import deepcopy, copy from tempfile import TemporaryDirectory from auto_archiver.utils import url as UrlUtil +from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES from loguru import logger +if TYPE_CHECKING: + from .module import ModuleFactory + class BaseModule(ABC): """ @@ -17,41 +22,24 @@ class BaseModule(ABC): however modules can have a .setup() method to run any setup code (e.g. logging in to a site, spinning up a browser etc.) - See BaseModule.MODULE_TYPES for the types of modules you can create, noting that + See consts.MODULE_TYPES for the types of modules you can create, noting that a subclass can be of multiple types. For example, a module that extracts data from a website and stores it in a database would be both an 'extractor' and a 'database' module. Each module is a python package, and should have a __manifest__.py file in the same directory as the module file. The __manifest__.py specifies the module information - like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the + like name, author, version, dependencies etc. See DEFAULT_MANIFEST for the default manifest structure. """ - MODULE_TYPES = [ - 'feeder', - 'extractor', - 'enricher', - 'database', - 'storage', - 'formatter' - ] - - _DEFAULT_MANIFEST = { - 'name': '', # the display name of the module - 'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name! - 'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES - 'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare - 'description': '', # a description of the module - 'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format - 'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName - 'version': '1.0', # the version of the module - 'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line -} + MODULE_TYPES = CONF_MODULE_TYPES + # NOTE: these here are declard as class variables, but they are overridden by the instance variables in the __init__ method config: Mapping[str, Any] authentication: Mapping[str, Mapping[str, str]] name: str + module_factory: ModuleFactory # this is set by the orchestrator prior to archiving tmp_dir: TemporaryDirectory = None diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 322ef6e..c3bc706 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -11,7 +11,7 @@ from ruamel.yaml import YAML, CommentedMap, add_representer from loguru import logger from copy import deepcopy -from .module import BaseModule +from auto_archiver.core.consts import MODULE_TYPES from typing import Any, List, Type, Tuple @@ -21,7 +21,7 @@ EMPTY_CONFIG = _yaml.load(""" # Auto Archiver Configuration # Steps are the modules that will be run in the order they are defined -steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES]) + \ +steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \ """ # Global configuration diff --git a/src/auto_archiver/core/consts.py b/src/auto_archiver/core/consts.py new file mode 100644 index 0000000..0fb81fb --- /dev/null +++ b/src/auto_archiver/core/consts.py @@ -0,0 +1,23 @@ + +MODULE_TYPES = [ + 'feeder', + 'extractor', + 'enricher', + 'database', + 'storage', + 'formatter' +] + +MANIFEST_FILE = "__manifest__.py" + +DEFAULT_MANIFEST = { + 'name': '', # the display name of the module + 'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name! + 'type': [], # the type of the module, can be one or more of MODULE_TYPES + 'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare + 'description': '', # a description of the module + 'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format + 'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName + 'version': '1.0', # the version of the module + 'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line +} \ No newline at end of file diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index c81e26a..9556621 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -6,7 +6,7 @@ by handling user configuration, validating the steps properties, and implementin from __future__ import annotations from dataclasses import dataclass -from typing import List +from typing import List, TYPE_CHECKING import shutil import ast import copy @@ -16,99 +16,113 @@ import os from os.path import join from loguru import logger import auto_archiver -from .base_module import BaseModule +from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE -_LAZY_LOADED_MODULES = {} - -MANIFEST_FILE = "__manifest__.py" +if TYPE_CHECKING: + from .base_module import BaseModule -def setup_paths(paths: list[str]) -> None: - """ - Sets up the paths for the modules to be loaded from - - This is necessary for the modules to be imported correctly - - """ - for path in paths: - # check path exists, if it doesn't, log a warning - if not os.path.exists(path): - logger.warning(f"Path '{path}' does not exist. Skipping...") - continue +HAS_SETUP_PATHS = False - # see odoo/module/module.py -> initialize_sys_path - if path not in auto_archiver.modules.__path__: - auto_archiver.modules.__path__.append(path) +class ModuleFactory: - # sort based on the length of the path, so that the longest path is last in the list - auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True) + def __init__(self): + self._lazy_modules = {} -def get_module(module_name: str, config: dict) -> BaseModule: - """ - Gets and sets up a module using the provided config - - This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy) - - """ - return get_module_lazy(module_name).load(config) + def setup_paths(self, paths: list[str]) -> None: + """ + Sets up the paths for the modules to be loaded from + + This is necessary for the modules to be imported correctly + + """ + global HAS_SETUP_PATHS -def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBaseModule: - """ - Lazily loads a module, returning a LazyBaseModule - - This has all the information about the module, but does not load the module itself or its dependencies - - To load an actual module, call .setup() on a lazy module - - """ - if module_name in _LAZY_LOADED_MODULES: - return _LAZY_LOADED_MODULES[module_name] - - available = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings) - if not available: - raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?") - return available[0] - -def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]: - - # search through all valid 'modules' paths. Default is 'modules' in the current directory - - # see odoo/modules/module.py -> get_modules - def is_really_module(module_path): - if os.path.isfile(join(module_path, MANIFEST_FILE)): - return True - - all_modules = [] - - for module_folder in auto_archiver.modules.__path__: - # walk through each module in module_folder and check if it has a valid manifest - try: - possible_modules = os.listdir(module_folder) - except FileNotFoundError: - logger.warning(f"Module folder {module_folder} does not exist") - continue - - for possible_module in possible_modules: - if limit_to_modules and possible_module not in limit_to_modules: + for path in paths: + # check path exists, if it doesn't, log a warning + if not os.path.exists(path): + logger.warning(f"Path '{path}' does not exist. Skipping...") continue - possible_module_path = join(module_folder, possible_module) - if not is_really_module(possible_module_path): + # see odoo/module/module.py -> initialize_sys_path + if path not in auto_archiver.modules.__path__: + if HAS_SETUP_PATHS == True: + logger.warning(f"You are attempting to re-initialise the module paths with: '{path}' for a 2nd time. \ + This could lead to unexpected behaviour. It is recommended to only use a single modules path. \ + If you wish to load modules from different paths then load a 2nd python interpreter (e.g. using multiprocessing).") + auto_archiver.modules.__path__.append(path) + + # sort based on the length of the path, so that the longest path is last in the list + auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True) + + HAS_SETUP_PATHS = True + + def get_module(self, module_name: str, config: dict) -> BaseModule: + """ + Gets and sets up a module using the provided config + + This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy) + + """ + return self.get_module_lazy(module_name).load(config) + + def get_module_lazy(self, module_name: str, suppress_warnings: bool = False) -> LazyBaseModule: + """ + Lazily loads a module, returning a LazyBaseModule + + This has all the information about the module, but does not load the module itself or its dependencies + + To load an actual module, call .setup() on a lazy module + + """ + if module_name in self._lazy_modules: + return self._lazy_modules[module_name] + + available = self.available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings) + if not available: + raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?") + return available[0] + + def available_modules(self, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]: + + # search through all valid 'modules' paths. Default is 'modules' in the current directory + + # see odoo/modules/module.py -> get_modules + def is_really_module(module_path): + if os.path.isfile(join(module_path, MANIFEST_FILE)): + return True + + all_modules = [] + + for module_folder in auto_archiver.modules.__path__: + # walk through each module in module_folder and check if it has a valid manifest + try: + possible_modules = os.listdir(module_folder) + except FileNotFoundError: + logger.warning(f"Module folder {module_folder} does not exist") continue - if _LAZY_LOADED_MODULES.get(possible_module): - continue - lazy_module = LazyBaseModule(possible_module, possible_module_path) - _LAZY_LOADED_MODULES[possible_module] = lazy_module + for possible_module in possible_modules: + if limit_to_modules and possible_module not in limit_to_modules: + continue - all_modules.append(lazy_module) - - if not suppress_warnings: - for module in limit_to_modules: - if not any(module == m.name for m in all_modules): - logger.warning(f"Module '{module}' not found. Are you sure it's installed?") + possible_module_path = join(module_folder, possible_module) + if not is_really_module(possible_module_path): + continue + if self._lazy_modules.get(possible_module): + continue + lazy_module = LazyBaseModule(possible_module, possible_module_path, factory=self) - return all_modules + self._lazy_modules[possible_module] = lazy_module + + all_modules.append(lazy_module) + + if not suppress_warnings: + for module in limit_to_modules: + if not any(module == m.name for m in all_modules): + logger.warning(f"Module '{module}' not found. Are you sure it's installed?") + + return all_modules @dataclass class LazyBaseModule: @@ -123,14 +137,16 @@ class LazyBaseModule: type: list description: str path: str + module_factory: ModuleFactory _manifest: dict = None _instance: BaseModule = None _entry_point: str = None - def __init__(self, module_name, path): + def __init__(self, module_name, path, factory: ModuleFactory): self.name = module_name self.path = path + self.module_factory = factory @property def entry_point(self): @@ -161,7 +177,7 @@ class LazyBaseModule: return self._manifest # print(f"Loading manifest for module {module_path}") # load the manifest file - manifest = copy.deepcopy(BaseModule._DEFAULT_MANIFEST) + manifest = copy.deepcopy(DEFAULT_MANIFEST) with open(join(self.path, MANIFEST_FILE)) as f: try: @@ -189,13 +205,14 @@ class LazyBaseModule: # clear out any empty strings that a user may have erroneously added continue if not check(dep): - logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.") + logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \ + Have you installed the required dependencies for the '{self.name}' module? See the README for more information.") exit(1) def check_python_dep(dep): # first check if it's a module: try: - m = get_module_lazy(dep, suppress_warnings=True) + m = self.module_factory.get_module_lazy(dep, suppress_warnings=True) try: # we must now load this module and set it up with the config m.load(config) @@ -230,19 +247,21 @@ class LazyBaseModule: __import__(f'{qualname}.{file_name}', fromlist=[self.entry_point]) # finally, get the class instance instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)() - if not getattr(instance, 'name', None): - instance.name = self.name - - if not getattr(instance, 'display_name', None): - instance.display_name = self.display_name - - self._instance = instance + # set the name, display name and module factory + instance.name = self.name + instance.display_name = self.display_name + instance.module_factory = self.module_factory + # merge the default config with the user config default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default')) + config[self.name] = default_config | config.get(self.name, {}) instance.config_setup(config) instance.setup() + + # save the instance for future easy loading + self._instance = instance return instance def __repr__(self): diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index f319e0d..10d9215 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -5,7 +5,7 @@ """ from __future__ import annotations -from typing import Generator, Union, List, Type +from typing import Generator, Union, List, Type, TYPE_CHECKING from urllib.parse import urlparse from ipaddress import ip_address from copy import copy @@ -22,12 +22,14 @@ from rich_argparse import RichHelpFormatter from .metadata import Metadata, Media from auto_archiver.version import __version__ from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser -from .module import available_modules, LazyBaseModule, get_module, setup_paths +from .module import ModuleFactory, LazyBaseModule from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher -from .module import BaseModule - +from .consts import MODULE_TYPES from loguru import logger +if TYPE_CHECKING: + from .base_module import BaseModule + from .module import LazyBaseModule DEFAULT_CONFIG_FILE = "orchestration.yaml" @@ -95,8 +97,12 @@ class UniqueAppendAction(argparse.Action): class ArchivingOrchestrator: - setup_finished: bool = False - logger_id: int = None + # instance variables + module_factory: ModuleFactory + setup_finished: bool + logger_id: int + + # instance variables, used for convenience to access modules by step feeders: List[Type[Feeder]] extractors: List[Type[Extractor]] enrichers: List[Type[Enricher]] @@ -104,6 +110,11 @@ class ArchivingOrchestrator: storages: List[Type[Storage]] formatters: List[Type[Formatter]] + def __init__(self): + self.module_factory = ModuleFactory() + self.setup_finished = False + self.logger_id = None + def setup_basic_parser(self): parser = argparse.ArgumentParser( prog="auto-archiver", @@ -135,7 +146,7 @@ class ArchivingOrchestrator: ) self.add_modules_args(modules_parser) cli_modules, unused_args = modules_parser.parse_known_args(unused_args) - for module_type in BaseModule.MODULE_TYPES: + for module_type in MODULE_TYPES: yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", []) parser = DefaultValidatingParser( @@ -157,15 +168,15 @@ class ArchivingOrchestrator: # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty? enabled_modules = [] # first loads the modules from the config file, then from the command line - for module_type in BaseModule.MODULE_TYPES: + for module_type in MODULE_TYPES: enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", [])) # clear out duplicates, but keep the order enabled_modules = list(dict.fromkeys(enabled_modules)) - avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True) + avail_modules = self.module_factory.available_modules(limit_to_modules=enabled_modules, suppress_warnings=True) self.add_individual_module_args(avail_modules, parser) elif basic_config.mode == 'simple': - simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup] + simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup] self.add_individual_module_args(simple_modules, parser) # for simple mode, we use the cli_feeder and any modules that don't require setup @@ -178,7 +189,7 @@ class ArchivingOrchestrator: yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name) else: # load all modules, they're not using the 'simple' mode - self.add_individual_module_args(available_modules(with_manifest=True), parser) + self.add_individual_module_args(self.module_factory.available_modules(), parser) parser.set_defaults(**to_dot_notation(yaml_config)) @@ -208,7 +219,7 @@ class ArchivingOrchestrator: parser = self.parser # Module loading from the command line - for module_type in BaseModule.MODULE_TYPES: + for module_type in MODULE_TYPES: parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction) def add_additional_args(self, parser: argparse.ArgumentParser = None): @@ -234,7 +245,7 @@ class ArchivingOrchestrator: def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None: if not modules: - modules = available_modules(with_manifest=True) + modules = self.module_factory.available_modules() for module in modules: @@ -297,7 +308,7 @@ class ArchivingOrchestrator: """ invalid_modules = [] - for module_type in BaseModule.MODULE_TYPES: + for module_type in MODULE_TYPES: step_items = [] modules_to_load = modules_by_type[f"{module_type}s"] @@ -342,7 +353,7 @@ class ArchivingOrchestrator: if module in invalid_modules: continue try: - loaded_module: BaseModule = get_module(module, self.config) + loaded_module: BaseModule = self.module_factory.get_module(module, self.config) except (KeyboardInterrupt, Exception) as e: logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}") if module_type == 'extractor' and loaded_module.name == module: @@ -378,7 +389,7 @@ class ArchivingOrchestrator: basic_config, unused_args = self.basic_parser.parse_known_args(args) # setup any custom module paths, so they'll show in the help and for arg parsing - setup_paths(basic_config.module_paths) + self.module_factory.setup_paths(basic_config.module_paths) # if help flag was called, then show the help if basic_config.help: @@ -409,7 +420,7 @@ class ArchivingOrchestrator: self.install_modules(self.config['steps']) # log out the modules that were loaded - for module_type in BaseModule.MODULE_TYPES: + for module_type in MODULE_TYPES: logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s"))) self.setup_finished = True diff --git a/src/auto_archiver/core/storage.py b/src/auto_archiver/core/storage.py index 15d4705..1535eab 100644 --- a/src/auto_archiver/core/storage.py +++ b/src/auto_archiver/core/storage.py @@ -14,7 +14,7 @@ from auto_archiver.utils.misc import random_str from auto_archiver.core import Media, BaseModule, Metadata from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher -from auto_archiver.core.module import get_module + class Storage(BaseModule): """ @@ -74,7 +74,7 @@ class Storage(BaseModule): filename = random_str(24) elif filename_generator == "static": # load the hash_enricher module - he = get_module(HashEnricher, self.config) + he = self.module_factory.get_module(HashEnricher, self.config) hd = he.calculate_hash(media.filename) filename = hd[:24] else: diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index ce4e67b..deb4b44 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -10,7 +10,6 @@ from auto_archiver.version import __version__ from auto_archiver.core import Metadata, Media from auto_archiver.core import Formatter from auto_archiver.utils.misc import random_str -from auto_archiver.core.module import get_module class HtmlFormatter(Formatter): environment: Environment = None @@ -50,7 +49,7 @@ class HtmlFormatter(Formatter): final_media = Media(filename=html_path, _mimetype="text/html") # get the already instantiated hash_enricher module - he = get_module('hash_enricher', self.config) + he = self.module_factory.get_module('hash_enricher', self.config) if len(hd := he.calculate_hash(final_media.filename)): final_media.set("hash", f"{he.algorithm}:{hd}") diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index 89579f9..7179bdd 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -4,7 +4,6 @@ from loguru import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata, Media -from auto_archiver.core.module import get_module class WhisperEnricher(Enricher): """ @@ -15,7 +14,7 @@ class WhisperEnricher(Enricher): def setup(self) -> None: self.stores = self.config['steps']['storages'] - self.s3 = get_module("s3_storage", self.config) + self.s3 = self.module_factory.get_module("s3_storage", self.config) if not "s3_storage" in self.stores: logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.") return diff --git a/tests/conftest.py b/tests/conftest.py index 8675fbc..eaa59b8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,7 +8,7 @@ from typing import Dict, Tuple import hashlib import pytest from auto_archiver.core.metadata import Metadata -from auto_archiver.core.module import get_module, _LAZY_LOADED_MODULES +from auto_archiver.core.module import ModuleFactory # Test names inserted into this list will be run last. This is useful for expensive/costly tests # that you only want to run if everything else succeeds (e.g. API calls). The order here is important @@ -20,19 +20,19 @@ TESTS_TO_RUN_LAST = ['test_twitter_api_archiver'] def setup_module(request): def _setup_module(module_name, config={}): + module_factory = ModuleFactory() + if isinstance(module_name, type): # get the module name: # if the class does not have a .name, use the name of the parent folder module_name = module_name.__module__.rsplit(".",2)[-2] - m = get_module(module_name, {module_name: config}) - + m = module_factory.get_module(module_name, {module_name: config}) # add the tmp_dir to the module tmp_dir = TemporaryDirectory() m.tmp_dir = tmp_dir.name - + def cleanup(): - _LAZY_LOADED_MODULES.pop(module_name) tmp_dir.cleanup() request.addfinalizer(cleanup) diff --git a/tests/enrichers/test_hash_enricher.py b/tests/enrichers/test_hash_enricher.py index 4b61fc2..c2fe67a 100644 --- a/tests/enrichers/test_hash_enricher.py +++ b/tests/enrichers/test_hash_enricher.py @@ -2,7 +2,7 @@ import pytest from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.core import Metadata, Media -from auto_archiver.core.module import get_module_lazy +from auto_archiver.core.module import ModuleFactory @pytest.mark.parametrize("algorithm, filename, expected_hash", [ ("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"), @@ -22,7 +22,7 @@ def test_default_config_values(setup_module): def test_config(): # test default config - c = get_module_lazy('hash_enricher').configs + c = ModuleFactory().get_module_lazy('hash_enricher').configs assert c["algorithm"]["default"] == "SHA-256" assert c["chunksize"]["default"] == 16000000 assert c["algorithm"]["choices"] == ["SHA-256", "SHA3-512"] diff --git a/tests/test_modules.py b/tests/test_modules.py index 854edb5..7a2b14d 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -1,24 +1,18 @@ import sys import pytest -from auto_archiver.core.module import get_module_lazy, BaseModule, LazyBaseModule, _LAZY_LOADED_MODULES +from auto_archiver.core.module import ModuleFactory, LazyBaseModule +from auto_archiver.core.base_module import BaseModule @pytest.fixture def example_module(): import auto_archiver + module_factory = ModuleFactory() + previous_path = auto_archiver.modules.__path__ auto_archiver.modules.__path__.append("tests/data/test_modules/") - module = get_module_lazy("example_module") - yield module - # cleanup - try: - del module._manifest - except AttributeError: - pass - del _LAZY_LOADED_MODULES["example_module"] - sys.modules.pop("auto_archiver.modules.example_module.example_module", None) - auto_archiver.modules.__path__ = previous_path + return module_factory.get_module_lazy("example_module") def test_get_module_lazy(example_module): assert example_module.name == "example_module" @@ -46,12 +40,14 @@ def test_module_dependency_check_loads_module(example_module): # monkey patch the manifest to include a nonexistnet dependency example_module.manifest["dependencies"]["python"] = ["hash_enricher"] + module_factory = example_module.module_factory + loaded_module = example_module.load({}) assert loaded_module is not None # check the dependency is loaded - assert _LAZY_LOADED_MODULES["hash_enricher"] is not None - assert _LAZY_LOADED_MODULES["hash_enricher"]._instance is not None + assert module_factory._lazy_modules["hash_enricher"] is not None + assert module_factory._lazy_modules["hash_enricher"]._instance is not None def test_load_module(example_module): @@ -69,7 +65,7 @@ def test_load_module(example_module): @pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"]) def test_load_modules(module_name): # test that specific modules can be loaded - module = get_module_lazy(module_name) + module = ModuleFactory().get_module_lazy(module_name) assert module is not None assert isinstance(module, LazyBaseModule) assert module.name == module_name @@ -86,7 +82,7 @@ def test_load_modules(module_name): @pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"]) def test_lazy_base_module(module_name): - lazy_module = get_module_lazy(module_name) + lazy_module = ModuleFactory().get_module_lazy(module_name) assert lazy_module is not None assert isinstance(lazy_module, LazyBaseModule) diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index f93f8b8..301e4d9 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -4,7 +4,7 @@ from argparse import ArgumentParser, ArgumentTypeError from auto_archiver.core.orchestrator import ArchivingOrchestrator from auto_archiver.version import __version__ from auto_archiver.core.config import read_yaml, store_yaml -from auto_archiver.core.module import _LAZY_LOADED_MODULES + TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml" TEST_MODULES = "tests/data/test_modules/" @@ -17,22 +17,7 @@ def test_args(): @pytest.fixture def orchestrator(): - yield ArchivingOrchestrator() - # hack - the loguru logger starts with one logger, but if orchestrator has run before - # it'll remove the default logger, add it back in: - - from loguru import logger - - if not logger._core.handlers.get(0): - logger._core.handlers_count = 0 - logger.add(sys.stderr) - # and remove the custom logger - if logger._core.handlers.get(1): - logger.remove(1) - - # delete out any loaded modules - _LAZY_LOADED_MODULES.clear() - + return ArchivingOrchestrator() @pytest.fixture def basic_parser(orchestrator) -> ArgumentParser: From 47a634fc63e1ba7a821e9b60d52d9fb057f3499d Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Wed, 19 Feb 2025 13:14:08 +0000 Subject: [PATCH 14/15] Add WACZ, Wayback and local storage tests. --- tests/enrichers/test_wacz_enricher.py | 112 ++++++++++++ tests/enrichers/test_wayback_enricher.py | 168 ++++++++++++++++++ .../test_instagram_tbot_extractor.py | 2 - tests/storages/test_local_storage.py | 54 ++++++ 4 files changed, 334 insertions(+), 2 deletions(-) create mode 100644 tests/enrichers/test_wacz_enricher.py create mode 100644 tests/enrichers/test_wayback_enricher.py create mode 100644 tests/storages/test_local_storage.py diff --git a/tests/enrichers/test_wacz_enricher.py b/tests/enrichers/test_wacz_enricher.py new file mode 100644 index 0000000..d55733d --- /dev/null +++ b/tests/enrichers/test_wacz_enricher.py @@ -0,0 +1,112 @@ +import os +from zipfile import ZipFile + +import pytest + +from auto_archiver.core import Metadata, Media + + +@pytest.fixture +def wacz_enricher(setup_module, mock_binary_dependencies): + configs: dict = { + "profile": None, + "docker_commands": None, + "timeout": 120, + "extract_media": False, + "extract_screenshot": True, + "socks_proxy_host": None, + "socks_proxy_port": None, + "proxy_server": None, + } + wacz = setup_module("wacz_enricher", configs) + return wacz + + +def test_setup_without_docker(wacz_enricher, mocker): + mocker.patch.dict(os.environ, {"RUNNING_IN_DOCKER": "1"}, clear=True) + wacz_enricher.setup() + assert not wacz_enricher.docker_in_docker + + +def test_setup_with_docker(wacz_enricher, mocker): + mocker.patch.dict(os.environ, {"WACZ_ENABLE_DOCKER": "1"}, clear=True) + wacz_enricher.setup() + assert wacz_enricher.use_docker + + +def test_already_ran(wacz_enricher, metadata, mocker): + metadata.add_media(Media("test.wacz"), id="browsertrix") + mock_log = mocker.patch("loguru.logger.info") + assert wacz_enricher.enrich(metadata) is True + assert "WACZ enricher had already been executed" in mock_log.call_args[0][0] + + +def test_basic_call_execution(wacz_enricher, mocker): + mock_run = mocker.patch("subprocess.run") + mock_run.return_value = mocker.Mock(returncode=0) + metadata = Metadata().set_url("https://example.com") + wacz_enricher.enrich(metadata) + assert mock_run.called + # Checks that the url is passed to the cmd + assert "--url https://example.com" in " ".join(mock_run.call_args[0][0]) + + +def test_download_success(wacz_enricher, mocker) -> None: + """Test download returns metadata on successful enrichment.""" + basic_metadata = Metadata().set_url("https://example.com") + mocker.patch.object(wacz_enricher, "enrich", return_value=True) + result = wacz_enricher.download(basic_metadata) + assert result is not None + assert isinstance(result, Metadata) + assert result.status == "wacz: success" + + +def test_enrich_already_executed(wacz_enricher, mocker) -> None: + """Test enrich if already executed.""" + mock_log = mocker.patch("loguru.logger.info") + metadata = Metadata().set_url("https://example.com") + media = Media(filename="some_file.wacz") + metadata.add_media(media, id="browsertrix") + result = wacz_enricher.enrich(metadata) + assert result is True + assert "WACZ enricher had already been executed:" in mock_log.call_args[0][0] + + +def test_enrich_subprocess_exception(wacz_enricher, mocker, tmp_path) -> None: + """Test enrich returns False when subprocess fails.""" + wacz_enricher.tmp_dir = str(tmp_path) + wacz_enricher.extract_media = False + wacz_enricher.extract_screenshot = True + mocker.patch("auto_archiver.utils.misc.random_str", return_value="TESTCOL") + mocker.patch("subprocess.run", side_effect=Exception("fail")) + basic_metadata = Metadata().set_url("https://example.com") + result = wacz_enricher.enrich(basic_metadata) + assert result is False + + +def test_extract_media(wacz_enricher, metadata, tmp_path, mocker) -> None: + """Test extract_media_from_wacz extracts screenshot media.""" + wacz_enricher.tmp_dir = str(tmp_path) + + # Create a *real* zip file so ZipFile won't fail. + wacz_file = tmp_path / "dummy.wacz" + with ZipFile(wacz_file, "w") as zf: + zf.writestr("dummy.txt", "test content") + + mocker.patch("os.listdir", return_value=[]) + warc_data = ( + b"WARC/1.0\r\n" + b"WARC-Type: resource\r\n" + b"Content-Type: image/png\r\n" + b"WARC-Target-URI: http://example.com/image.png\r\n" + b"Content-Length: 12\r\n" + b"\r\n" + b"image-bytes" + b"\r\n\r\nWARC/1.0\r\n\r\n" + ) + mock_file = mocker.mock_open(read_data=warc_data) + mocker.patch("builtins.open", mock_file) + metadata.add_media(Media("something.wacz"), "browsertrix") + wacz_enricher.extract_media_from_wacz(metadata, str(wacz_file)) + assert len(metadata.media) == 2 + assert metadata.media[1].properties.get("id") == "browsertrix-screenshot" diff --git a/tests/enrichers/test_wayback_enricher.py b/tests/enrichers/test_wayback_enricher.py new file mode 100644 index 0000000..88f4662 --- /dev/null +++ b/tests/enrichers/test_wayback_enricher.py @@ -0,0 +1,168 @@ +import json +import requests +import pytest +from auto_archiver.modules.wayback_extractor_enricher import WaybackExtractorEnricher +from auto_archiver.core import Metadata + + +@pytest.fixture +def mock_is_auth_wall(mocker): + """Fixture to mock is_auth_wall behavior.""" + def _mock_is_auth_wall(return_value: bool): + return mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=return_value) + return _mock_is_auth_wall + +@pytest.fixture +def mock_post_success(mocker): + """Fixture to mock POST requests with a successful response.""" + def _mock_post(json_data: dict = None, status_code: int = 200): + json_data = json_data or {"job_id": "job123"} + resp = mocker.Mock(status_code=status_code) + resp.json.return_value = json_data + return mocker.patch("requests.post", return_value=resp) + return _mock_post + +@pytest.fixture +def mock_get_success(mocker): + """Fixture to mock GET requests returning a completed archive status.""" + def _mock_get(json_data: dict = None, status_code: int = 200): + json_data = json_data or { + "status": "success", + "timestamp": "20250101010101", + "original_url": "https://example.com" + } + resp = mocker.Mock(status_code=status_code) + resp.json.return_value = json_data + return mocker.patch("requests.get", return_value=resp) + return _mock_get + +@pytest.fixture +def wayback_extractor_enricher(setup_module) -> WaybackExtractorEnricher: + configs: dict = { + "timeout": 5, + "if_not_archived_within": None, + "key": "somekey", + "secret": "secret", + "proxy_http": None, + "proxy_https": None, + } + return setup_module("wayback_extractor_enricher", configs) + + +def test_download_success( + wayback_extractor_enricher, + mock_is_auth_wall, + mock_post_success, + mock_get_success +): + mock_is_auth_wall(False) + mock_post_success() + mock_get_success() + # Basic metadata to allow merge + metadata = Metadata().set_url("https://example.com") + result = wayback_extractor_enricher.download(metadata) + assert result.get("wayback") == "https://web.archive.org/web/20250101010101/https://example.com" + +def test_enrich_auth_wall(wayback_extractor_enricher, metadata, mock_is_auth_wall): + mock_is_auth_wall(True) + result = wayback_extractor_enricher.enrich(metadata) + assert result is None + +def test_enrich_already_enriched(wayback_extractor_enricher, metadata): + metadata.set("wayback", "existing") + result = wayback_extractor_enricher.enrich(metadata) + assert result is True + +def test_enrich_post_failure( + wayback_extractor_enricher, + metadata, + mock_is_auth_wall, + mock_post_success +): + mock_is_auth_wall(False) + mock_post_success(json_data={"error": "server error"}, status_code=500) + result = wayback_extractor_enricher.enrich(metadata) + assert result is False + assert "Internet archive failed with status of 500" in metadata.get("wayback") + +def test_enrich_post_json_decode_error( + wayback_extractor_enricher, + metadata, + mock_is_auth_wall, + mocker +): + mock_is_auth_wall(False) + resp = mocker.Mock(status_code=200) + resp.json.side_effect = json.decoder.JSONDecodeError("msg", "doc", 0) + resp.text = "invalid json" + mocker.patch("requests.post", return_value=resp) + assert wayback_extractor_enricher.enrich(metadata) is False + +def test_enrich_no_job_id( + wayback_extractor_enricher, + metadata, + mock_is_auth_wall, + mock_post_success +): + mock_is_auth_wall(False) + mock_post_success(json_data={}) + assert wayback_extractor_enricher.enrich(metadata) is False + +def test_enrich_get_success( + wayback_extractor_enricher, + metadata, + mock_is_auth_wall, + mock_post_success, + mock_get_success +): + mock_is_auth_wall(False) + mock_post_success() + mock_get_success() + assert wayback_extractor_enricher.enrich(metadata) is True + assert metadata.get("wayback") == "https://web.archive.org/web/20250101010101/https://example.com" + assert metadata.get("check wayback") == "https://web.archive.org/web/*/https://example.com" + +def test_enrich_get_failure( + wayback_extractor_enricher, + metadata, + mock_is_auth_wall, + mock_post_success, + mock_get_success +): + mock_is_auth_wall(False) + mock_post_success() + mock_get_success(json_data={"status": "failed"}, status_code=400) + assert wayback_extractor_enricher.enrich(metadata) is False + +def test_enrich_get_request_exception( + wayback_extractor_enricher, + metadata, + mock_is_auth_wall, + mock_post_success, + mocker +): + mock_is_auth_wall(False) + mock_post_success() + mocker.patch("requests.get", side_effect=requests.exceptions.RequestException("error")) + mocker.patch("time.sleep", return_value=None) + # check it still enriches the job_id information + assert wayback_extractor_enricher.enrich(metadata) is True + assert metadata.get("wayback").get("job_id") == "job123" + +def test_enrich_get_json_decode_error( + wayback_extractor_enricher, + metadata, + mock_is_auth_wall, + mock_post_success, + mocker +): + mock_is_auth_wall(False) + mock_post_success() + resp = mocker.Mock() + resp.json.side_effect = json.decoder.JSONDecodeError("msg", "doc", 0) + resp.text = "invalid json" + mocker.patch("requests.get", return_value=resp) + mocker.patch("time.sleep", return_value=None) + # check it still enriches the job_id information + assert wayback_extractor_enricher.enrich(metadata) is True + assert metadata.get("wayback").get("job_id") == "job123" diff --git a/tests/extractors/test_instagram_tbot_extractor.py b/tests/extractors/test_instagram_tbot_extractor.py index 9238f89..f274728 100644 --- a/tests/extractors/test_instagram_tbot_extractor.py +++ b/tests/extractors/test_instagram_tbot_extractor.py @@ -6,8 +6,6 @@ from auto_archiver.core import Metadata from auto_archiver.modules.instagram_tbot_extractor import InstagramTbotExtractor from tests.extractors.test_extractor_base import TestExtractorBase -TESTFILES = os.path.join(os.path.dirname(__file__), "testfiles") - @pytest.fixture def patch_extractor_methods(request, setup_module, mocker): diff --git a/tests/storages/test_local_storage.py b/tests/storages/test_local_storage.py new file mode 100644 index 0000000..85f97c6 --- /dev/null +++ b/tests/storages/test_local_storage.py @@ -0,0 +1,54 @@ + +import os +from pathlib import Path + +import pytest + +from auto_archiver.core import Media +from auto_archiver.modules.local_storage import LocalStorage + + +@pytest.fixture +def local_storage(setup_module) -> LocalStorage: + configs: dict = { + "path_generator": "flat", + "filename_generator": "static", + "save_to": "./local_archive", + "save_absolute": False, + } + return setup_module("local_storage", configs) + + +@pytest.fixture +def sample_media(tmp_path) -> Media: + """Fixture creating a Media object with temporary source file""" + src_file = tmp_path / "source.txt" + src_file.write_text("test content") + return Media(key="subdir/test.txt", filename=str(src_file)) + + +def test_get_cdn_url_relative(local_storage): + media = Media(key="test.txt", filename="dummy.txt") + expected = os.path.join(local_storage.save_to, media.key) + assert local_storage.get_cdn_url(media) == expected + + + +def test_get_cdn_url_absolute(local_storage): + media = Media(key="test.txt", filename="dummy.txt") + local_storage.save_absolute = True + expected = os.path.abspath(os.path.join(local_storage.save_to, media.key)) + assert local_storage.get_cdn_url(media) == expected + +def test_upload_file_contents_and_metadata(local_storage, sample_media): + dest = os.path.join(local_storage.save_to, sample_media.key) + assert local_storage.upload(sample_media) is True + assert Path(sample_media.filename).read_text() == Path(dest).read_text() + + +def test_upload_nonexistent_source(local_storage): + media = Media(key="missing.txt", filename="nonexistent.txt") + with pytest.raises(FileNotFoundError): + local_storage.upload(media) + + From 04507577b68272365dd76725ac311ae396a5f301 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 19 Feb 2025 13:36:50 +0000 Subject: [PATCH 15/15] Version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9823833..636604d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [project] name = "auto-archiver" -version = "0.13.3" +version = "0.13.4" description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)." requires-python = ">=3.10,<3.13"