diff --git a/poetry.lock b/poetry.lock index bbfb975..a8d43e6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -152,34 +152,34 @@ lxml = ["lxml"] [[package]] name = "boto3" -version = "1.35.99" +version = "1.36.3" description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "boto3-1.35.99-py3-none-any.whl", hash = "sha256:83e560faaec38a956dfb3d62e05e1703ee50432b45b788c09e25107c5058bd71"}, - {file = "boto3-1.35.99.tar.gz", hash = "sha256:e0abd794a7a591d90558e92e29a9f8837d25ece8e3c120e530526fe27eba5fca"}, + {file = "boto3-1.36.3-py3-none-any.whl", hash = "sha256:f9843a5d06f501d66ada06f5a5417f671823af2cf319e36ceefa1bafaaaaa953"}, + {file = "boto3-1.36.3.tar.gz", hash = "sha256:53a5307f6a3526ee2f8590e3c45efa504a3ea4532c1bfe4926c0c19bf188d141"}, ] [package.dependencies] -botocore = ">=1.35.99,<1.36.0" +botocore = ">=1.36.3,<1.37.0" jmespath = ">=0.7.1,<2.0.0" -s3transfer = ">=0.10.0,<0.11.0" +s3transfer = ">=0.11.0,<0.12.0" [package.extras] crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.35.99" +version = "1.36.3" description = "Low-level, data-driven core of boto 3." 
optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "botocore-1.35.99-py3-none-any.whl", hash = "sha256:b22d27b6b617fc2d7342090d6129000af2efd20174215948c0d7ae2da0fab445"}, - {file = "botocore-1.35.99.tar.gz", hash = "sha256:1eab44e969c39c5f3d9a3104a0836c24715579a455f12b3979a31d7cde51b3c3"}, + {file = "botocore-1.36.3-py3-none-any.whl", hash = "sha256:536ab828e6f90dbb000e3702ac45fd76642113ae2db1b7b1373ad24104e89255"}, + {file = "botocore-1.36.3.tar.gz", hash = "sha256:775b835e979da5c96548ed1a0b798101a145aec3cd46541d62e27dda5a94d7f8"}, ] [package.dependencies] @@ -188,7 +188,7 @@ python-dateutil = ">=2.1,<3.0.0" urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""} [package.extras] -crt = ["awscrt (==0.22.0)"] +crt = ["awscrt (==0.23.4)"] [[package]] name = "brotli" @@ -343,14 +343,14 @@ beautifulsoup4 = "*" [[package]] name = "cachetools" -version = "5.5.0" +version = "5.5.1" description = "Extensible memoizing collections and decorators" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "cachetools-5.5.0-py3-none-any.whl", hash = "sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292"}, - {file = "cachetools-5.5.0.tar.gz", hash = "sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a"}, + {file = "cachetools-5.5.1-py3-none-any.whl", hash = "sha256:b76651fdc3b24ead3c648bbdeeb940c1b04d365b38b4af66788f9ec4a81d42bb"}, + {file = "cachetools-5.5.1.tar.gz", hash = "sha256:70f238fbba50383ef62e55c6aff6d9673175fe59f7c6782c7a0b9e38f4a9df95"}, ] [[package]] @@ -2083,32 +2083,32 @@ pyasn1 = ">=0.1.3" [[package]] name = "s3transfer" -version = "0.10.4" +version = "0.11.1" description = "An Amazon S3 Transfer Manager" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "s3transfer-0.10.4-py3-none-any.whl", hash = "sha256:244a76a24355363a68164241438de1b72f8781664920260c48465896b712a41e"}, - {file = 
"s3transfer-0.10.4.tar.gz", hash = "sha256:29edc09801743c21eb5ecbc617a152df41d3c287f67b615f73e5f750583666a7"}, + {file = "s3transfer-0.11.1-py3-none-any.whl", hash = "sha256:8fa0aa48177be1f3425176dfe1ab85dcd3d962df603c3dbfc585e6bf857ef0ff"}, + {file = "s3transfer-0.11.1.tar.gz", hash = "sha256:3f25c900a367c8b7f7d8f9c34edc87e300bde424f779dc9f0a8ae4f9df9264f6"}, ] [package.dependencies] -botocore = ">=1.33.2,<2.0a.0" +botocore = ">=1.36.0,<2.0a.0" [package.extras] -crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"] +crt = ["botocore[crt] (>=1.36.0,<2.0a.0)"] [[package]] name = "selenium" -version = "4.27.1" +version = "4.28.0" description = "Official Python bindings for Selenium WebDriver" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["main"] files = [ - {file = "selenium-4.27.1-py3-none-any.whl", hash = "sha256:b89b1f62b5cfe8025868556fe82360d6b649d464f75d2655cb966c8f8447ea18"}, - {file = "selenium-4.27.1.tar.gz", hash = "sha256:5296c425a75ff1b44d0d5199042b36a6d1ef76c04fb775b97b40be739a9caae2"}, + {file = "selenium-4.28.0-py3-none-any.whl", hash = "sha256:3d6a2e8e1b850a1078884ea19f4e011ecdc12263434d87a0b78769836fb82dd8"}, + {file = "selenium-4.28.0.tar.gz", hash = "sha256:a9fae6eef48d470a1b0c6e45185d96f0dafb025e8da4b346cc41e4da3ac54fa0"}, ] [package.dependencies] @@ -2617,15 +2617,15 @@ typing-extensions = ">=3.7.4" [[package]] name = "tzdata" -version = "2024.2" +version = "2025.1" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" groups = ["main"] markers = "platform_system == \"Windows\"" files = [ - {file = "tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd"}, - {file = "tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc"}, + {file = "tzdata-2025.1-py2.py3-none-any.whl", hash = "sha256:7e127113816800496f027041c570f50bcd464a020098a3b6b199517772303639"}, + {file = 
"tzdata-2025.1.tar.gz", hash = "sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694"}, ] [[package]] @@ -2868,81 +2868,81 @@ test = ["websockets"] [[package]] name = "websockets" -version = "14.1" +version = "14.2" description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" optional = false python-versions = ">=3.9" groups = ["main", "docs"] files = [ - {file = "websockets-14.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a0adf84bc2e7c86e8a202537b4fd50e6f7f0e4a6b6bf64d7ccb96c4cd3330b29"}, - {file = "websockets-14.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90b5d9dfbb6d07a84ed3e696012610b6da074d97453bd01e0e30744b472c8179"}, - {file = "websockets-14.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2177ee3901075167f01c5e335a6685e71b162a54a89a56001f1c3e9e3d2ad250"}, - {file = "websockets-14.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f14a96a0034a27f9d47fd9788913924c89612225878f8078bb9d55f859272b0"}, - {file = "websockets-14.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f874ba705deea77bcf64a9da42c1f5fc2466d8f14daf410bc7d4ceae0a9fcb0"}, - {file = "websockets-14.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9607b9a442392e690a57909c362811184ea429585a71061cd5d3c2b98065c199"}, - {file = "websockets-14.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:bea45f19b7ca000380fbd4e02552be86343080120d074b87f25593ce1700ad58"}, - {file = "websockets-14.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:219c8187b3ceeadbf2afcf0f25a4918d02da7b944d703b97d12fb01510869078"}, - {file = "websockets-14.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ad2ab2547761d79926effe63de21479dfaf29834c50f98c4bf5b5480b5838434"}, - {file = "websockets-14.1-cp310-cp310-win32.whl", hash = 
"sha256:1288369a6a84e81b90da5dbed48610cd7e5d60af62df9851ed1d1d23a9069f10"}, - {file = "websockets-14.1-cp310-cp310-win_amd64.whl", hash = "sha256:e0744623852f1497d825a49a99bfbec9bea4f3f946df6eb9d8a2f0c37a2fec2e"}, - {file = "websockets-14.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:449d77d636f8d9c17952628cc7e3b8faf6e92a17ec581ec0c0256300717e1512"}, - {file = "websockets-14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a35f704be14768cea9790d921c2c1cc4fc52700410b1c10948511039be824aac"}, - {file = "websockets-14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b1f3628a0510bd58968c0f60447e7a692933589b791a6b572fcef374053ca280"}, - {file = "websockets-14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c3deac3748ec73ef24fc7be0b68220d14d47d6647d2f85b2771cb35ea847aa1"}, - {file = "websockets-14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7048eb4415d46368ef29d32133134c513f507fff7d953c18c91104738a68c3b3"}, - {file = "websockets-14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6cf0ad281c979306a6a34242b371e90e891bce504509fb6bb5246bbbf31e7b6"}, - {file = "websockets-14.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cc1fc87428c1d18b643479caa7b15db7d544652e5bf610513d4a3478dbe823d0"}, - {file = "websockets-14.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f95ba34d71e2fa0c5d225bde3b3bdb152e957150100e75c86bc7f3964c450d89"}, - {file = "websockets-14.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9481a6de29105d73cf4515f2bef8eb71e17ac184c19d0b9918a3701c6c9c4f23"}, - {file = "websockets-14.1-cp311-cp311-win32.whl", hash = "sha256:368a05465f49c5949e27afd6fbe0a77ce53082185bbb2ac096a3a8afaf4de52e"}, - {file = "websockets-14.1-cp311-cp311-win_amd64.whl", hash = "sha256:6d24fc337fc055c9e83414c94e1ee0dee902a486d19d2a7f0929e49d7d604b09"}, - {file = 
"websockets-14.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ed907449fe5e021933e46a3e65d651f641975a768d0649fee59f10c2985529ed"}, - {file = "websockets-14.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:87e31011b5c14a33b29f17eb48932e63e1dcd3fa31d72209848652310d3d1f0d"}, - {file = "websockets-14.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bc6ccf7d54c02ae47a48ddf9414c54d48af9c01076a2e1023e3b486b6e72c707"}, - {file = "websockets-14.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9777564c0a72a1d457f0848977a1cbe15cfa75fa2f67ce267441e465717dcf1a"}, - {file = "websockets-14.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a655bde548ca98f55b43711b0ceefd2a88a71af6350b0c168aa77562104f3f45"}, - {file = "websockets-14.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a3dfff83ca578cada2d19e665e9c8368e1598d4e787422a460ec70e531dbdd58"}, - {file = "websockets-14.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6a6c9bcf7cdc0fd41cc7b7944447982e8acfd9f0d560ea6d6845428ed0562058"}, - {file = "websockets-14.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4b6caec8576e760f2c7dd878ba817653144d5f369200b6ddf9771d64385b84d4"}, - {file = "websockets-14.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:eb6d38971c800ff02e4a6afd791bbe3b923a9a57ca9aeab7314c21c84bf9ff05"}, - {file = "websockets-14.1-cp312-cp312-win32.whl", hash = "sha256:1d045cbe1358d76b24d5e20e7b1878efe578d9897a25c24e6006eef788c0fdf0"}, - {file = "websockets-14.1-cp312-cp312-win_amd64.whl", hash = "sha256:90f4c7a069c733d95c308380aae314f2cb45bd8a904fb03eb36d1a4983a4993f"}, - {file = "websockets-14.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:3630b670d5057cd9e08b9c4dab6493670e8e762a24c2c94ef312783870736ab9"}, - {file = "websockets-14.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:36ebd71db3b89e1f7b1a5deaa341a654852c3518ea7a8ddfdf69cc66acc2db1b"}, - {file = "websockets-14.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5b918d288958dc3fa1c5a0b9aa3256cb2b2b84c54407f4813c45d52267600cd3"}, - {file = "websockets-14.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00fe5da3f037041da1ee0cf8e308374e236883f9842c7c465aa65098b1c9af59"}, - {file = "websockets-14.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8149a0f5a72ca36720981418eeffeb5c2729ea55fa179091c81a0910a114a5d2"}, - {file = "websockets-14.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:77569d19a13015e840b81550922056acabc25e3f52782625bc6843cfa034e1da"}, - {file = "websockets-14.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cf5201a04550136ef870aa60ad3d29d2a59e452a7f96b94193bee6d73b8ad9a9"}, - {file = "websockets-14.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:88cf9163ef674b5be5736a584c999e98daf3aabac6e536e43286eb74c126b9c7"}, - {file = "websockets-14.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:836bef7ae338a072e9d1863502026f01b14027250a4545672673057997d5c05a"}, - {file = "websockets-14.1-cp313-cp313-win32.whl", hash = "sha256:0d4290d559d68288da9f444089fd82490c8d2744309113fc26e2da6e48b65da6"}, - {file = "websockets-14.1-cp313-cp313-win_amd64.whl", hash = "sha256:8621a07991add373c3c5c2cf89e1d277e49dc82ed72c75e3afc74bd0acc446f0"}, - {file = "websockets-14.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:01bb2d4f0a6d04538d3c5dfd27c0643269656c28045a53439cbf1c004f90897a"}, - {file = "websockets-14.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:414ffe86f4d6f434a8c3b7913655a1a5383b617f9bf38720e7c0799fac3ab1c6"}, - {file = "websockets-14.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8fda642151d5affdee8a430bd85496f2e2517be3a2b9d2484d633d5712b15c56"}, - {file = 
"websockets-14.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd7c11968bc3860d5c78577f0dbc535257ccec41750675d58d8dc66aa47fe52c"}, - {file = "websockets-14.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a032855dc7db987dff813583d04f4950d14326665d7e714d584560b140ae6b8b"}, - {file = "websockets-14.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7e7ea2f782408c32d86b87a0d2c1fd8871b0399dd762364c731d86c86069a78"}, - {file = "websockets-14.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:39450e6215f7d9f6f7bc2a6da21d79374729f5d052333da4d5825af8a97e6735"}, - {file = "websockets-14.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ceada5be22fa5a5a4cdeec74e761c2ee7db287208f54c718f2df4b7e200b8d4a"}, - {file = "websockets-14.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:3fc753451d471cff90b8f467a1fc0ae64031cf2d81b7b34e1811b7e2691bc4bc"}, - {file = "websockets-14.1-cp39-cp39-win32.whl", hash = "sha256:14839f54786987ccd9d03ed7f334baec0f02272e7ec4f6e9d427ff584aeea8b4"}, - {file = "websockets-14.1-cp39-cp39-win_amd64.whl", hash = "sha256:d9fd19ecc3a4d5ae82ddbfb30962cf6d874ff943e56e0c81f5169be2fda62979"}, - {file = "websockets-14.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:e5dc25a9dbd1a7f61eca4b7cb04e74ae4b963d658f9e4f9aad9cd00b688692c8"}, - {file = "websockets-14.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:04a97aca96ca2acedf0d1f332c861c5a4486fdcba7bcef35873820f940c4231e"}, - {file = "websockets-14.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df174ece723b228d3e8734a6f2a6febbd413ddec39b3dc592f5a4aa0aff28098"}, - {file = "websockets-14.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:034feb9f4286476f273b9a245fb15f02c34d9586a5bc936aff108c3ba1b21beb"}, - {file = 
"websockets-14.1-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:660c308dabd2b380807ab64b62985eaccf923a78ebc572bd485375b9ca2b7dc7"}, - {file = "websockets-14.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5a42d3ecbb2db5080fc578314439b1d79eef71d323dc661aa616fb492436af5d"}, - {file = "websockets-14.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:ddaa4a390af911da6f680be8be4ff5aaf31c4c834c1a9147bc21cbcbca2d4370"}, - {file = "websockets-14.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a4c805c6034206143fbabd2d259ec5e757f8b29d0a2f0bf3d2fe5d1f60147a4a"}, - {file = "websockets-14.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:205f672a6c2c671a86d33f6d47c9b35781a998728d2c7c2a3e1cf3333fcb62b7"}, - {file = "websockets-14.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef440054124728cc49b01c33469de06755e5a7a4e83ef61934ad95fc327fbb0"}, - {file = "websockets-14.1-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7591d6f440af7f73c4bd9404f3772bfee064e639d2b6cc8c94076e71b2471c1"}, - {file = "websockets-14.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:25225cc79cfebc95ba1d24cd3ab86aaa35bcd315d12fa4358939bd55e9bd74a5"}, - {file = "websockets-14.1-py3-none-any.whl", hash = "sha256:4d4fc827a20abe6d544a119896f6b78ee13fe81cbfef416f3f2ddf09a03f0e2e"}, - {file = "websockets-14.1.tar.gz", hash = "sha256:398b10c77d471c0aab20a845e7a60076b6390bfdaac7a6d2edb0d2c59d75e8d8"}, + {file = "websockets-14.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e8179f95323b9ab1c11723e5d91a89403903f7b001828161b480a7810b334885"}, + {file = "websockets-14.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0d8c3e2cdb38f31d8bd7d9d28908005f6fa9def3324edb9bf336d7e4266fd397"}, + {file = 
"websockets-14.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:714a9b682deb4339d39ffa674f7b674230227d981a37d5d174a4a83e3978a610"}, + {file = "websockets-14.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2e53c72052f2596fb792a7acd9704cbc549bf70fcde8a99e899311455974ca3"}, + {file = "websockets-14.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3fbd68850c837e57373d95c8fe352203a512b6e49eaae4c2f4088ef8cf21980"}, + {file = "websockets-14.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b27ece32f63150c268593d5fdb82819584831a83a3f5809b7521df0685cd5d8"}, + {file = "websockets-14.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4daa0faea5424d8713142b33825fff03c736f781690d90652d2c8b053345b0e7"}, + {file = "websockets-14.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:bc63cee8596a6ec84d9753fd0fcfa0452ee12f317afe4beae6b157f0070c6c7f"}, + {file = "websockets-14.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7a570862c325af2111343cc9b0257b7119b904823c675b22d4ac547163088d0d"}, + {file = "websockets-14.2-cp310-cp310-win32.whl", hash = "sha256:75862126b3d2d505e895893e3deac0a9339ce750bd27b4ba515f008b5acf832d"}, + {file = "websockets-14.2-cp310-cp310-win_amd64.whl", hash = "sha256:cc45afb9c9b2dc0852d5c8b5321759cf825f82a31bfaf506b65bf4668c96f8b2"}, + {file = "websockets-14.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3bdc8c692c866ce5fefcaf07d2b55c91d6922ac397e031ef9b774e5b9ea42166"}, + {file = "websockets-14.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c93215fac5dadc63e51bcc6dceca72e72267c11def401d6668622b47675b097f"}, + {file = "websockets-14.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1c9b6535c0e2cf8a6bf938064fb754aaceb1e6a4a51a80d884cd5db569886910"}, + {file = "websockets-14.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:0a52a6d7cf6938e04e9dceb949d35fbdf58ac14deea26e685ab6368e73744e4c"}, + {file = "websockets-14.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9f05702e93203a6ff5226e21d9b40c037761b2cfb637187c9802c10f58e40473"}, + {file = "websockets-14.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22441c81a6748a53bfcb98951d58d1af0661ab47a536af08920d129b4d1c3473"}, + {file = "websockets-14.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:efd9b868d78b194790e6236d9cbc46d68aba4b75b22497eb4ab64fa640c3af56"}, + {file = "websockets-14.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1a5a20d5843886d34ff8c57424cc65a1deda4375729cbca4cb6b3353f3ce4142"}, + {file = "websockets-14.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:34277a29f5303d54ec6468fb525d99c99938607bc96b8d72d675dee2b9f5bf1d"}, + {file = "websockets-14.2-cp311-cp311-win32.whl", hash = "sha256:02687db35dbc7d25fd541a602b5f8e451a238ffa033030b172ff86a93cb5dc2a"}, + {file = "websockets-14.2-cp311-cp311-win_amd64.whl", hash = "sha256:862e9967b46c07d4dcd2532e9e8e3c2825e004ffbf91a5ef9dde519ee2effb0b"}, + {file = "websockets-14.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1f20522e624d7ffbdbe259c6b6a65d73c895045f76a93719aa10cd93b3de100c"}, + {file = "websockets-14.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:647b573f7d3ada919fd60e64d533409a79dcf1ea21daeb4542d1d996519ca967"}, + {file = "websockets-14.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6af99a38e49f66be5a64b1e890208ad026cda49355661549c507152113049990"}, + {file = "websockets-14.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:091ab63dfc8cea748cc22c1db2814eadb77ccbf82829bac6b2fbe3401d548eda"}, + {file = "websockets-14.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:b374e8953ad477d17e4851cdc66d83fdc2db88d9e73abf755c94510ebddceb95"}, + {file = "websockets-14.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a39d7eceeea35db85b85e1169011bb4321c32e673920ae9c1b6e0978590012a3"}, + {file = "websockets-14.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0a6f3efd47ffd0d12080594f434faf1cd2549b31e54870b8470b28cc1d3817d9"}, + {file = "websockets-14.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:065ce275e7c4ffb42cb738dd6b20726ac26ac9ad0a2a48e33ca632351a737267"}, + {file = "websockets-14.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e9d0e53530ba7b8b5e389c02282f9d2aa47581514bd6049d3a7cffe1385cf5fe"}, + {file = "websockets-14.2-cp312-cp312-win32.whl", hash = "sha256:20e6dd0984d7ca3037afcb4494e48c74ffb51e8013cac71cf607fffe11df7205"}, + {file = "websockets-14.2-cp312-cp312-win_amd64.whl", hash = "sha256:44bba1a956c2c9d268bdcdf234d5e5ff4c9b6dc3e300545cbe99af59dda9dcce"}, + {file = "websockets-14.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6f1372e511c7409a542291bce92d6c83320e02c9cf392223272287ce55bc224e"}, + {file = "websockets-14.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4da98b72009836179bb596a92297b1a61bb5a830c0e483a7d0766d45070a08ad"}, + {file = "websockets-14.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8a86a269759026d2bde227652b87be79f8a734e582debf64c9d302faa1e9f03"}, + {file = "websockets-14.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86cf1aaeca909bf6815ea714d5c5736c8d6dd3a13770e885aafe062ecbd04f1f"}, + {file = "websockets-14.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9b0f6c3ba3b1240f602ebb3971d45b02cc12bd1845466dd783496b3b05783a5"}, + {file = "websockets-14.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:669c3e101c246aa85bc8534e495952e2ca208bd87994650b90a23d745902db9a"}, + {file = "websockets-14.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eabdb28b972f3729348e632ab08f2a7b616c7e53d5414c12108c29972e655b20"}, + {file = "websockets-14.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2066dc4cbcc19f32c12a5a0e8cc1b7ac734e5b64ac0a325ff8353451c4b15ef2"}, + {file = "websockets-14.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ab95d357cd471df61873dadf66dd05dd4709cae001dd6342edafc8dc6382f307"}, + {file = "websockets-14.2-cp313-cp313-win32.whl", hash = "sha256:a9e72fb63e5f3feacdcf5b4ff53199ec8c18d66e325c34ee4c551ca748623bbc"}, + {file = "websockets-14.2-cp313-cp313-win_amd64.whl", hash = "sha256:b439ea828c4ba99bb3176dc8d9b933392a2413c0f6b149fdcba48393f573377f"}, + {file = "websockets-14.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7cd5706caec1686c5d233bc76243ff64b1c0dc445339bd538f30547e787c11fe"}, + {file = "websockets-14.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ec607328ce95a2f12b595f7ae4c5d71bf502212bddcea528290b35c286932b12"}, + {file = "websockets-14.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:da85651270c6bfb630136423037dd4975199e5d4114cae6d3066641adcc9d1c7"}, + {file = "websockets-14.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3ecadc7ce90accf39903815697917643f5b7cfb73c96702318a096c00aa71f5"}, + {file = "websockets-14.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1979bee04af6a78608024bad6dfcc0cc930ce819f9e10342a29a05b5320355d0"}, + {file = "websockets-14.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dddacad58e2614a24938a50b85969d56f88e620e3f897b7d80ac0d8a5800258"}, + {file = "websockets-14.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:89a71173caaf75fa71a09a5f614f450ba3ec84ad9fca47cb2422a860676716f0"}, + {file = 
"websockets-14.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:6af6a4b26eea4fc06c6818a6b962a952441e0e39548b44773502761ded8cc1d4"}, + {file = "websockets-14.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:80c8efa38957f20bba0117b48737993643204645e9ec45512579132508477cfc"}, + {file = "websockets-14.2-cp39-cp39-win32.whl", hash = "sha256:2e20c5f517e2163d76e2729104abc42639c41cf91f7b1839295be43302713661"}, + {file = "websockets-14.2-cp39-cp39-win_amd64.whl", hash = "sha256:b4c8cef610e8d7c70dea92e62b6814a8cd24fbd01d7103cc89308d2bfe1659ef"}, + {file = "websockets-14.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:d7d9cafbccba46e768be8a8ad4635fa3eae1ffac4c6e7cb4eb276ba41297ed29"}, + {file = "websockets-14.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:c76193c1c044bd1e9b3316dcc34b174bbf9664598791e6fb606d8d29000e070c"}, + {file = "websockets-14.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd475a974d5352390baf865309fe37dec6831aafc3014ffac1eea99e84e83fc2"}, + {file = "websockets-14.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2c6c0097a41968b2e2b54ed3424739aab0b762ca92af2379f152c1aef0187e1c"}, + {file = "websockets-14.2-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d7ff794c8b36bc402f2e07c0b2ceb4a2424147ed4785ff03e2a7af03711d60a"}, + {file = "websockets-14.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:dec254fcabc7bd488dab64846f588fc5b6fe0d78f641180030f8ea27b76d72c3"}, + {file = "websockets-14.2-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:bbe03eb853e17fd5b15448328b4ec7fb2407d45fb0245036d06a3af251f8e48f"}, + {file = "websockets-14.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a3c4aa3428b904d5404a0ed85f3644d37e2cb25996b7f096d77caeb0e96a3b42"}, + {file = 
"websockets-14.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:577a4cebf1ceaf0b65ffc42c54856214165fb8ceeba3935852fc33f6b0c55e7f"}, + {file = "websockets-14.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ad1c1d02357b7665e700eca43a31d52814ad9ad9b89b58118bdabc365454b574"}, + {file = "websockets-14.2-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f390024a47d904613577df83ba700bd189eedc09c57af0a904e5c39624621270"}, + {file = "websockets-14.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3c1426c021c38cf92b453cdf371228d3430acd775edee6bac5a4d577efc72365"}, + {file = "websockets-14.2-py3-none-any.whl", hash = "sha256:7a6ceec4ea84469f15cf15807a747e9efe57e369c384fa86e022b3bea679b79b"}, + {file = "websockets-14.2.tar.gz", hash = "sha256:5059ed9c54945efb321f097084b4c7e52c246f2c869815876a69d1efc4ad6eb5"}, ] [[package]] diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py index 779d3ac..cf95dc0 100644 --- a/src/auto_archiver/core/__init__.py +++ b/src/auto_archiver/core/__init__.py @@ -4,4 +4,9 @@ # cannot import ArchivingOrchestrator/Config to avoid circular dep # from .orchestrator import ArchivingOrchestrator -# from .config import Config \ No newline at end of file +# from .config import Config + +from .media import Media +from .step import Step +from .context import ArchivingContext +from .metadata import Metadata diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index ee3a190..f6c411e 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -190,7 +190,6 @@ class ArchivingOrchestrator: yaml_config = read_yaml(basic_config.config_file) - breakpoint() self.setup_complete_parser(basic_config, yaml_config, unused_args) self.install_modules() diff --git 
a/src/auto_archiver/databases/__init__.py b/src/auto_archiver/databases/__init__.py index d6de470..3a8d787 100644 --- a/src/auto_archiver/databases/__init__.py +++ b/src/auto_archiver/databases/__init__.py @@ -3,8 +3,3 @@ """ from .database import Database -from .gsheet_db.gsheet_db import GsheetsDb -from .console_db.console_db import ConsoleDb -from .csv_db.csv_db import CSVDb -from .api_db.api_db import AAApiDb -from .atlos_db.atlos_db import AtlosDb \ No newline at end of file diff --git a/src/auto_archiver/databases/api_db/api_db.py b/src/auto_archiver/databases/api_db/api_db.py deleted file mode 100644 index 84bdfcb..0000000 --- a/src/auto_archiver/databases/api_db/api_db.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Union -import requests, os -from loguru import logger - -from .. import Database -from ...core import Metadata - - -class AAApiDb(Database): - """ - Connects to auto-archiver-api instance - """ - name = "auto_archiver_api_db" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - self.allow_rearchive = bool(self.allow_rearchive) - self.store_results = bool(self.store_results) - self.assert_valid_string("api_endpoint") - - @staticmethod - def configs() -> dict: - return { - "api_endpoint": {"default": None, "help": "API endpoint where calls are made to"}, - "api_token": {"default": None, "help": "API Bearer token."}, - "public": {"default": False, "help": "whether the URL should be publicly available via the API"}, - "author_id": {"default": None, "help": "which email to assign as author"}, - "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"}, - "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"}, - "store_results": {"default": True, "help": "when set, will send the results to the API 
database."}, - "tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))}, - } - def fetch(self, item: Metadata) -> Union[Metadata, bool]: - """ query the database for the existence of this item. - Helps avoid re-archiving the same URL multiple times. - """ - if not self.allow_rearchive: return - - params = {"url": item.get_url(), "limit": 15} - headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"} - response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers) - - if response.status_code == 200: - if len(response.json()): - logger.success(f"API returned {len(response.json())} previously archived instance(s)") - fetched_metadata = [Metadata.from_dict(r["result"]) for r in response.json()] - return Metadata.choose_most_complete(fetched_metadata) - else: - logger.error(f"AA API FAIL ({response.status_code}): {response.json()}") - return False - - - def done(self, item: Metadata, cached: bool=False) -> None: - """archival result ready - should be saved to DB""" - if not self.store_results: return - if cached: - logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached") - return - logger.debug(f"saving archive of {item.get_url()} to the AA API.") - - payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)} - headers = {"Authorization": f"Bearer {self.api_token}"} - response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers) - - if response.status_code == 200: - logger.success(f"AA API: {response.json()}") - else: - logger.error(f"AA API FAIL ({response.status_code}): {response.json()}") - diff --git a/src/auto_archiver/databases/atlos_db/atlos_db.py b/src/auto_archiver/databases/atlos_db/atlos_db.py deleted file mode 100644 index 4a00b9d..0000000 --- 
a/src/auto_archiver/databases/atlos_db/atlos_db.py +++ /dev/null @@ -1,79 +0,0 @@ -import os -from typing import Union -from loguru import logger -from csv import DictWriter -from dataclasses import asdict -import requests - -from .. import Database -from ...core import Metadata -from ...utils import get_atlos_config_options - - -class AtlosDb(Database): - """ - Outputs results to Atlos - """ - - name = "atlos_db" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - - @staticmethod - def configs() -> dict: - return get_atlos_config_options() - - def failed(self, item: Metadata, reason: str) -> None: - """Update DB accordingly for failure""" - # If the item has no Atlos ID, there's nothing for us to do - if not item.metadata.get("atlos_id"): - logger.info(f"Item {item.get_url()} has no Atlos ID, skipping") - return - - requests.post( - f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver", - headers={"Authorization": f"Bearer {self.api_token}"}, - json={"metadata": {"processed": True, "status": "error", "error": reason}}, - ).raise_for_status() - logger.info( - f"Stored failure for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos: {reason}" - ) - - def fetch(self, item: Metadata) -> Union[Metadata, bool]: - """check and fetch if the given item has been archived already, each - database should handle its own caching, and configuration mechanisms""" - return False - - def _process_metadata(self, item: Metadata) -> dict: - """Process metadata for storage on Atlos. 
Will convert any datetime - objects to ISO format.""" - - return { - k: v.isoformat() if hasattr(v, "isoformat") else v - for k, v in item.metadata.items() - } - - def done(self, item: Metadata, cached: bool = False) -> None: - """archival result ready - should be saved to DB""" - - if not item.metadata.get("atlos_id"): - logger.info(f"Item {item.get_url()} has no Atlos ID, skipping") - return - - requests.post( - f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver", - headers={"Authorization": f"Bearer {self.api_token}"}, - json={ - "metadata": dict( - processed=True, - status="success", - results=self._process_metadata(item), - ) - }, - ).raise_for_status() - - logger.info( - f"Stored success for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos" - ) diff --git a/src/auto_archiver/databases/console_db/console_db.py b/src/auto_archiver/databases/console_db/console_db.py deleted file mode 100644 index a5e648b..0000000 --- a/src/auto_archiver/databases/console_db/console_db.py +++ /dev/null @@ -1,32 +0,0 @@ -from loguru import logger - -from .. 
import Database -from ...core import Metadata - - -class ConsoleDb(Database): - """ - Outputs results to the console - """ - name = "console_db" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - - @staticmethod - def configs() -> dict: - return {} - - def started(self, item: Metadata) -> None: - logger.warning(f"STARTED {item}") - - def failed(self, item: Metadata, reason:str) -> None: - logger.error(f"FAILED {item}: {reason}") - - def aborted(self, item: Metadata) -> None: - logger.warning(f"ABORTED {item}") - - def done(self, item: Metadata, cached: bool=False) -> None: - """archival result ready - should be saved to DB""" - logger.success(f"DONE {item}") \ No newline at end of file diff --git a/src/auto_archiver/databases/csv_db/csv_db.py b/src/auto_archiver/databases/csv_db/csv_db.py deleted file mode 100644 index e24306f..0000000 --- a/src/auto_archiver/databases/csv_db/csv_db.py +++ /dev/null @@ -1,34 +0,0 @@ -import os -from loguru import logger -from csv import DictWriter -from dataclasses import asdict - -from .. 
import Database -from ...core import Metadata - - -class CSVDb(Database): - """ - Outputs results to a CSV file - """ - name = "csv_db" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - self.assert_valid_string("csv_file") - - @staticmethod - def configs() -> dict: - return { - "csv_file": {"default": "db.csv", "help": "CSV file name"} - } - - def done(self, item: Metadata, cached: bool=False) -> None: - """archival result ready - should be saved to DB""" - logger.success(f"DONE {item}") - is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0 - with open(self.csv_file, "a", encoding="utf-8") as outf: - writer = DictWriter(outf, fieldnames=asdict(Metadata())) - if is_empty: writer.writeheader() - writer.writerow(asdict(item)) diff --git a/src/auto_archiver/databases/gsheet_db/gsheet_db.py b/src/auto_archiver/databases/gsheet_db/gsheet_db.py deleted file mode 100644 index 631a554..0000000 --- a/src/auto_archiver/databases/gsheet_db/gsheet_db.py +++ /dev/null @@ -1,112 +0,0 @@ -from typing import Union, Tuple -import datetime -from urllib.parse import quote - -from loguru import logger - -from .. import Database -from ...core import Metadata, Media, ArchivingContext -from ...utils import GWorksheet - - -class GsheetsDb(Database): - """ - NB: only works if GsheetFeeder is used. 
- could be updated in the future to support non-GsheetFeeder metadata - """ - name = "gsheet_db" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - - @staticmethod - def configs() -> dict: - return {} - - def started(self, item: Metadata) -> None: - logger.warning(f"STARTED {item}") - gw, row = self._retrieve_gsheet(item) - gw.set_cell(row, 'status', 'Archive in progress') - - def failed(self, item: Metadata, reason:str) -> None: - logger.error(f"FAILED {item}") - self._safe_status_update(item, f'Archive failed {reason}') - - def aborted(self, item: Metadata) -> None: - logger.warning(f"ABORTED {item}") - self._safe_status_update(item, '') - - def fetch(self, item: Metadata) -> Union[Metadata, bool]: - """check if the given item has been archived already""" - return False - - def done(self, item: Metadata, cached: bool=False) -> None: - """archival result ready - should be saved to DB""" - logger.success(f"DONE {item.get_url()}") - gw, row = self._retrieve_gsheet(item) - # self._safe_status_update(item, 'done') - - cell_updates = [] - row_values = gw.get_row(row) - - def batch_if_valid(col, val, final_value=None): - final_value = final_value or val - try: - if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '': - cell_updates.append((row, col, final_value)) - except Exception as e: - logger.error(f"Unable to batch {col}={final_value} due to {e}") - status_message = item.status - if cached: - status_message = f"[cached] {status_message}" - cell_updates.append((row, 'status', status_message)) - - media: Media = item.get_final_media() - if hasattr(media, "urls"): - batch_if_valid('archive', "\n".join(media.urls)) - batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat()) - batch_if_valid('title', item.get_title()) - batch_if_valid('text', item.get("content", "")) - batch_if_valid('timestamp', item.get_timestamp()) - if 
media: batch_if_valid('hash', media.get("hash", "not-calculated")) - - # merge all pdq hashes into a single string, if present - pdq_hashes = [] - all_media = item.get_all_media() - for m in all_media: - if pdq := m.get("pdq_hash"): - pdq_hashes.append(pdq) - if len(pdq_hashes): - batch_if_valid('pdq_hash', ",".join(pdq_hashes)) - - if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"): - batch_if_valid('screenshot', "\n".join(screenshot.urls)) - - if (thumbnail := item.get_first_image("thumbnail")): - if hasattr(thumbnail, "urls"): - batch_if_valid('thumbnail', f'=IMAGE("{thumbnail.urls[0]}")') - - if (browsertrix := item.get_media_by_id("browsertrix")): - batch_if_valid('wacz', "\n".join(browsertrix.urls)) - batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls])) - - gw.batch_set_cell(cell_updates) - - def _safe_status_update(self, item: Metadata, new_status: str) -> None: - try: - gw, row = self._retrieve_gsheet(item) - gw.set_cell(row, 'status', new_status) - except Exception as e: - logger.debug(f"Unable to update sheet: {e}") - - def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: - # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now - if gsheet := ArchivingContext.get("gsheet"): - gw: GWorksheet = gsheet.get("worksheet") - row: int = gsheet.get("row") - elif self.sheet_id: - print(self.sheet_id) - - - return gw, row diff --git a/src/auto_archiver/databases/api_db/__init__.py b/src/auto_archiver/modules/api_db/__init__.py similarity index 100% rename from src/auto_archiver/databases/api_db/__init__.py rename to src/auto_archiver/modules/api_db/__init__.py diff --git a/src/auto_archiver/databases/api_db/__manifest__.py 
b/src/auto_archiver/modules/api_db/__manifest__.py similarity index 100% rename from src/auto_archiver/databases/api_db/__manifest__.py rename to src/auto_archiver/modules/api_db/__manifest__.py diff --git a/src/auto_archiver/databases/api_db.py b/src/auto_archiver/modules/api_db/api_db.py similarity index 69% rename from src/auto_archiver/databases/api_db.py rename to src/auto_archiver/modules/api_db/api_db.py index 4304855..fa1ae75 100644 --- a/src/auto_archiver/databases/api_db.py +++ b/src/auto_archiver/modules/api_db/api_db.py @@ -2,8 +2,8 @@ from typing import Union import requests, os from loguru import logger -from . import Database -from ..core import Metadata +from auto_archiver.databases import Database +from auto_archiver.core import Metadata class AAApiDb(Database): @@ -19,18 +19,7 @@ class AAApiDb(Database): self.store_results = bool(self.store_results) self.assert_valid_string("api_endpoint") - @staticmethod - def configs() -> dict: - return { - "api_endpoint": {"default": None, "help": "API endpoint where calls are made to"}, - "api_token": {"default": None, "help": "API Bearer token."}, - "public": {"default": False, "help": "whether the URL should be publicly available via the API"}, - "author_id": {"default": None, "help": "which email to assign as author"}, - "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"}, - "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"}, - "store_results": {"default": True, "help": "when set, will send the results to the API database."}, - "tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))}, - } + def fetch(self, item: Metadata) -> Union[Metadata, bool]: """ query the database for the existence of this item. 
Helps avoid re-archiving the same URL multiple times. diff --git a/src/auto_archiver/databases/atlos_db/__init__.py b/src/auto_archiver/modules/atlos_db/__init__.py similarity index 100% rename from src/auto_archiver/databases/atlos_db/__init__.py rename to src/auto_archiver/modules/atlos_db/__init__.py diff --git a/src/auto_archiver/databases/atlos_db/__manifest__.py b/src/auto_archiver/modules/atlos_db/__manifest__.py similarity index 59% rename from src/auto_archiver/databases/atlos_db/__manifest__.py rename to src/auto_archiver/modules/atlos_db/__manifest__.py index 1e2b676..470d07d 100644 --- a/src/auto_archiver/databases/atlos_db/__manifest__.py +++ b/src/auto_archiver/modules/atlos_db/__manifest__.py @@ -7,7 +7,18 @@ {"python": ["loguru", ""], "bin": [""]}, - "configs": {}, + "configs": { + "api_token": { + "default": None, + "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/", + "cli_set": lambda cli_val, _: cli_val + }, + "atlos_url": { + "default": "https://platform.atlos.org", + "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", + "cli_set": lambda cli_val, _: cli_val + }, + }, "description": """ Handles integration with the Atlos platform for managing archival results. diff --git a/src/auto_archiver/databases/atlos_db.py b/src/auto_archiver/modules/atlos_db/atlos_db.py similarity index 94% rename from src/auto_archiver/databases/atlos_db.py rename to src/auto_archiver/modules/atlos_db/atlos_db.py index 16c4910..376ba32 100644 --- a/src/auto_archiver/databases/atlos_db.py +++ b/src/auto_archiver/modules/atlos_db/atlos_db.py @@ -5,9 +5,9 @@ from csv import DictWriter from dataclasses import asdict import requests -from . 
import Database -from ..core import Metadata -from ..utils import get_atlos_config_options +from auto_archiver.databases import Database +from auto_archiver.core import Metadata +from auto_archiver.utils import get_atlos_config_options class AtlosDb(Database): @@ -21,6 +21,7 @@ class AtlosDb(Database): # without this STEP.__init__ is not called super().__init__(config) + # TODO @staticmethod def configs() -> dict: return get_atlos_config_options() diff --git a/src/auto_archiver/databases/console_db/__init__.py b/src/auto_archiver/modules/atlos_feeder/__init__.py similarity index 100% rename from src/auto_archiver/databases/console_db/__init__.py rename to src/auto_archiver/modules/atlos_feeder/__init__.py diff --git a/src/auto_archiver/modules/atlos_feeder/__manifest__.py b/src/auto_archiver/modules/atlos_feeder/__manifest__.py new file mode 100644 index 0000000..f0b216b --- /dev/null +++ b/src/auto_archiver/modules/atlos_feeder/__manifest__.py @@ -0,0 +1,34 @@ +{ + "name": "Atlos Feeder", + "type": ["feeder"], + "requires_setup": True, + "external_dependencies": { + "python": ["loguru", "requests"], + }, + "configs": { + "api_token": { + "default": None, + "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/", + "cli_set": lambda cli_val, _: cli_val + }, + "atlos_url": { + "default": "https://platform.atlos.org", + "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", + "cli_set": lambda cli_val, _: cli_val + }, + }, + "description": """ + AtlosFeeder: A feeder module that integrates with the Atlos API to fetch source material URLs for archival. + + ### Features + - Connects to the Atlos API to retrieve a list of source material URLs. + - Filters source materials based on visibility, processing status, and metadata. + - Converts filtered source materials into `Metadata` objects with the relevant `atlos_id` and URL. 
+ - Iterates through paginated results using a cursor for efficient API interaction. + + ### Notes + - Requires an Atlos API endpoint and a valid API token for authentication. + - Ensures only unprocessed, visible, and ready-to-archive URLs are returned. + - Handles pagination transparently when retrieving data from the Atlos API. + """ +} diff --git a/src/auto_archiver/feeders/atlos_feeder.py b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py similarity index 91% rename from src/auto_archiver/feeders/atlos_feeder.py rename to src/auto_archiver/modules/atlos_feeder/atlos_feeder.py index d3acc00..d344139 100644 --- a/src/auto_archiver/feeders/atlos_feeder.py +++ b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py @@ -1,9 +1,9 @@ from loguru import logger import requests -from . import Feeder -from ..core import Metadata, ArchivingContext -from ..utils import get_atlos_config_options +from auto_archiver.feeders import Feeder +from auto_archiver.core import Metadata, ArchivingContext +from auto_archiver.utils import get_atlos_config_options class AtlosFeeder(Feeder): @@ -15,6 +15,7 @@ class AtlosFeeder(Feeder): if type(self.api_token) != str: raise Exception("Atlos Feeder did not receive an Atlos API token") + # TODO @staticmethod def configs() -> dict: return get_atlos_config_options() diff --git a/src/auto_archiver/databases/csv_db/__init__.py b/src/auto_archiver/modules/cli_feeder/__init__.py similarity index 100% rename from src/auto_archiver/databases/csv_db/__init__.py rename to src/auto_archiver/modules/cli_feeder/__init__.py diff --git a/src/auto_archiver/modules/cli_feeder/__manifest__.py b/src/auto_archiver/modules/cli_feeder/__manifest__.py new file mode 100644 index 0000000..fcb9099 --- /dev/null +++ b/src/auto_archiver/modules/cli_feeder/__manifest__.py @@ -0,0 +1,24 @@ +{ + "name": "CLI Feeder", + "type": ["feeder"], + "requires_setup": False, + "external_dependencies": { + "python": ["loguru"], + }, + "configs": { + "urls": { + "default": 
None, + "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", + "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(","))) + }, + }, + "description": """ + Processes URLs to archive passed via the command line and feeds them into the archiving pipeline. + + ### Features + - Takes a single URL or a list of URLs provided via the command line. + - Converts each URL into a `Metadata` object and yields it for processing. + - Ensures URLs are processed only if they are explicitly provided. + + """ +} diff --git a/src/auto_archiver/feeders/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py similarity index 57% rename from src/auto_archiver/feeders/cli_feeder.py rename to src/auto_archiver/modules/cli_feeder/cli_feeder.py index b2f0add..1376379 100644 --- a/src/auto_archiver/feeders/cli_feeder.py +++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py @@ -1,7 +1,7 @@ from loguru import logger -from . import Feeder -from ..core import Metadata, ArchivingContext +from auto_archiver.feeders import Feeder +from auto_archiver.core import Metadata, ArchivingContext class CLIFeeder(Feeder): @@ -13,15 +13,15 @@ class CLIFeeder(Feeder): if type(self.urls) != list or len(self.urls) == 0: raise Exception("CLI Feeder did not receive any URL to process") - @staticmethod - def configs() -> dict: - return { - "urls": { - "default": None, - "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", - "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(","))) - }, - } + # @staticmethod + # def configs() -> dict: + # return { + # "urls": { + # "default": None, + # "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", + # "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(","))) + # }, + # } def __iter__(self) -> Metadata: for url in self.urls: diff --git a/src/auto_archiver/databases/gsheet_db/__init__.py 
b/src/auto_archiver/modules/console_db/__init__.py similarity index 100% rename from src/auto_archiver/databases/gsheet_db/__init__.py rename to src/auto_archiver/modules/console_db/__init__.py diff --git a/src/auto_archiver/databases/console_db/__manifest__.py b/src/auto_archiver/modules/console_db/__manifest__.py similarity index 100% rename from src/auto_archiver/databases/console_db/__manifest__.py rename to src/auto_archiver/modules/console_db/__manifest__.py diff --git a/src/auto_archiver/databases/console_db.py b/src/auto_archiver/modules/console_db/console_db.py similarity index 86% rename from src/auto_archiver/databases/console_db.py rename to src/auto_archiver/modules/console_db/console_db.py index bd45f95..357c696 100644 --- a/src/auto_archiver/databases/console_db.py +++ b/src/auto_archiver/modules/console_db/console_db.py @@ -1,7 +1,7 @@ from loguru import logger -from . import Database -from ..core import Metadata +from auto_archiver.databases import Database +from auto_archiver.core import Metadata class ConsoleDb(Database): @@ -14,10 +14,6 @@ class ConsoleDb(Database): # without this STEP.__init__ is not called super().__init__(config) - @staticmethod - def configs() -> dict: - return {} - def started(self, item: Metadata) -> None: logger.warning(f"STARTED {item}") diff --git a/src/auto_archiver/modules/csv_db/__init__.py b/src/auto_archiver/modules/csv_db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/databases/csv_db/__manifest__.py b/src/auto_archiver/modules/csv_db/__manifest__.py similarity index 100% rename from src/auto_archiver/databases/csv_db/__manifest__.py rename to src/auto_archiver/modules/csv_db/__manifest__.py diff --git a/src/auto_archiver/databases/csv_db.py b/src/auto_archiver/modules/csv_db/csv_db.py similarity index 81% rename from src/auto_archiver/databases/csv_db.py rename to src/auto_archiver/modules/csv_db/csv_db.py index f0d7153..642e889 100644 --- 
a/src/auto_archiver/databases/csv_db.py +++ b/src/auto_archiver/modules/csv_db/csv_db.py @@ -3,8 +3,8 @@ from loguru import logger from csv import DictWriter from dataclasses import asdict -from . import Database -from ..core import Metadata +from auto_archiver.databases import Database +from auto_archiver.core import Metadata class CSVDb(Database): @@ -18,11 +18,6 @@ class CSVDb(Database): super().__init__(config) self.assert_valid_string("csv_file") - @staticmethod - def configs() -> dict: - return { - "csv_file": {"default": "db.csv", "help": "CSV file name"} - } def done(self, item: Metadata, cached: bool=False) -> None: """archival result ready - should be saved to DB""" diff --git a/src/auto_archiver/modules/csv_feeder/__init__.py b/src/auto_archiver/modules/csv_feeder/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/csv_feeder/__manifest__.py b/src/auto_archiver/modules/csv_feeder/__manifest__.py new file mode 100644 index 0000000..ad5d40b --- /dev/null +++ b/src/auto_archiver/modules/csv_feeder/__manifest__.py @@ -0,0 +1,33 @@ +{ + "name": "CSV Feeder", + "type": ["feeder"], + "requires_setup": False, + "external_dependencies": { + "python": ["loguru"], + "bin": [""] + }, + "configs": { + "files": { + "default": None, + "help": "Path to the input file(s) to read the URLs from, comma separated. \ + Input files should be formatted with one URL per line", + "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(","))) + }, + "column": { + "default": None, + "help": "Column number or name to read the URLs from, 0-indexed", + } + }, + "description": """ + Reads URLs from CSV files and feeds them into the archiving process. + + ### Features + - Supports reading URLs from multiple input files, specified as a comma-separated list. + - Allows specifying the column number or name to extract URLs from. + - Skips header rows if the first value is not a valid URL. 
+ - Integrates with the `ArchivingContext` to manage URL feeding. + + ### Setup + - Input files should be formatted with one URL per line. + """ +} diff --git a/src/auto_archiver/feeders/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py similarity index 88% rename from src/auto_archiver/feeders/csv_feeder.py rename to src/auto_archiver/modules/csv_feeder/csv_feeder.py index 00bf7d7..b665bd9 100644 --- a/src/auto_archiver/feeders/csv_feeder.py +++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py @@ -1,12 +1,15 @@ from loguru import logger import csv -from . import Feeder -from ..core import Metadata, ArchivingContext -from ..utils import url_or_none +from auto_archiver.feeders import Feeder +from auto_archiver.core import Metadata, ArchivingContext +from auto_archiver.utils import url_or_none class CSVFeeder(Feeder): + name = "csv_feeder" + + @staticmethod def configs() -> dict: return { diff --git a/src/auto_archiver/modules/gsheet_db/__init__.py b/src/auto_archiver/modules/gsheet_db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/databases/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py similarity index 100% rename from src/auto_archiver/databases/gsheet_db/__manifest__.py rename to src/auto_archiver/modules/gsheet_db/__manifest__.py diff --git a/src/auto_archiver/databases/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py similarity index 96% rename from src/auto_archiver/databases/gsheet_db.py rename to src/auto_archiver/modules/gsheet_db/gsheet_db.py index 98e72dc..8e17966 100644 --- a/src/auto_archiver/databases/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -4,9 +4,9 @@ from urllib.parse import quote from loguru import logger -from . 
import Database -from ..core import Metadata, Media, ArchivingContext -from ..utils import GWorksheet +from auto_archiver.databases import Database +from auto_archiver.core import Metadata, Media, ArchivingContext +from auto_archiver.utils import GWorksheet class GsheetsDb(Database): @@ -20,10 +20,6 @@ class GsheetsDb(Database): # without this STEP.__init__ is not called super().__init__(config) - @staticmethod - def configs() -> dict: - return {} - def started(self, item: Metadata) -> None: logger.warning(f"STARTED {item}") gw, row = self._retrieve_gsheet(item) diff --git a/src/auto_archiver/modules/gsheet_feeder/__init__.py b/src/auto_archiver/modules/gsheet_feeder/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py new file mode 100644 index 0000000..2af090c --- /dev/null +++ b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py @@ -0,0 +1,40 @@ +{ + "name": "Google Sheets Feeder", + "type": ["feeder"], + "requires_setup": True, + "external_dependencies": { + "python": ["loguru", "gspread", "python-slugify"], + }, + "configs": { + "allow_worksheets": { + "default": set(), + "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", + "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + }, + "block_worksheets": { + "default": set(), + "help": "(CSV) explicitly block some worksheets from being processed", + "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + }, + "use_sheet_names_in_stored_paths": { + "default": True, + "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", + } + }, + "description": """ + GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver. + + This reads data from Google Sheets and filters rows based on user-defined rules. 
+ The filtered rows are processed into `Metadata` objects. + + ### Features + - Validates the sheet structure and filters rows based on input configurations. + - Processes only worksheets allowed by the `allow_worksheets` and `block_worksheets` configurations. + - Ensures only rows with valid URLs and unprocessed statuses are included for archival. + - Supports organizing stored files into folder paths based on sheet and worksheet names. + + ### Notes + - Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`. + - Create the sheet using the template provided in the docs. + """ +} diff --git a/src/auto_archiver/feeders/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py similarity index 74% rename from src/auto_archiver/feeders/gsheet_feeder.py rename to src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index 1c4fc32..5c73bf6 100644 --- a/src/auto_archiver/feeders/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -14,9 +14,9 @@ from loguru import logger from slugify import slugify # from . import Enricher -from . 
import Feeder -from ..core import Metadata, ArchivingContext -from ..utils import Gsheets, GWorksheet +from auto_archiver.feeders import Feeder +from auto_archiver.core import Metadata, ArchivingContext +from auto_archiver.utils import Gsheets, GWorksheet class GsheetsFeeder(Gsheets, Feeder): @@ -27,26 +27,26 @@ class GsheetsFeeder(Gsheets, Feeder): super().__init__(config) self.gsheets_client = gspread.service_account(filename=self.service_account) - @staticmethod - def configs() -> dict: - return dict( - Gsheets.configs(), - ** { - "allow_worksheets": { - "default": set(), - "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", - "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) - }, - "block_worksheets": { - "default": set(), - "help": "(CSV) explicitly block some worksheets from being processed", - "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) - }, - "use_sheet_names_in_stored_paths": { - "default": True, - "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", - } - }) + # @staticmethod + # def configs() -> dict: + # return dict( + # Gsheets.configs(), + # ** { + # "allow_worksheets": { + # "default": set(), + # "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", + # "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + # }, + # "block_worksheets": { + # "default": set(), + # "help": "(CSV) explicitly block some worksheets from being processed", + # "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + # }, + # "use_sheet_names_in_stored_paths": { + # "default": True, + # "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", + # } + # }) def __iter__(self) -> Metadata: sh = self.open_sheet() diff --git a/src/auto_archiver/modules/hash_enricher/__init__.py 
b/src/auto_archiver/modules/hash_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/hash_enricher/__manifest__.py b/src/auto_archiver/modules/hash_enricher/__manifest__.py new file mode 100644 index 0000000..311ed6f --- /dev/null +++ b/src/auto_archiver/modules/hash_enricher/__manifest__.py @@ -0,0 +1,27 @@ +{ + "name": "Hash Enricher", + "type": ["enricher"], + "requires_setup": False, + "external_dependencies": { + "python": ["loguru"], + }, + "configs": { + "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}, + "chunksize": {"default": int(1.6e7), "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"}, + }, + "description": """ +Generates cryptographic hashes for media files to ensure data integrity and authenticity. + +### Features +- Calculates cryptographic hashes (SHA-256 or SHA3-512) for media files stored in `Metadata` objects. +- Ensures content authenticity, integrity validation, and duplicate identification. +- Efficiently processes large files by reading file bytes in configurable chunk sizes. +- Supports dynamic configuration of hash algorithms and chunk sizes. +- Updates media metadata with the computed hash value in the format `<algorithm>:<hash>`. + +### Notes +- Default hash algorithm is SHA-256, but SHA3-512 is also supported. +- Chunk size defaults to 16 MB but can be adjusted based on memory requirements. +- Useful for workflows requiring hash-based content validation or deduplication. 
+""", +} diff --git a/src/auto_archiver/enrichers/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py similarity index 85% rename from src/auto_archiver/enrichers/hash_enricher.py rename to src/auto_archiver/modules/hash_enricher/hash_enricher.py index 69973b7..355413a 100644 --- a/src/auto_archiver/enrichers/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -10,8 +10,8 @@ making it suitable for handling large files efficiently. import hashlib from loguru import logger -from . import Enricher -from ..core import Metadata, ArchivingContext +from auto_archiver.enrichers import Enricher +from auto_archiver.core import Metadata, ArchivingContext class HashEnricher(Enricher): @@ -45,13 +45,6 @@ class HashEnricher(Enricher): ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True) - @staticmethod - def configs() -> dict: - return { - "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}, - "chunksize": {"default": int(1.6e7), "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"}, - } - def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})") diff --git a/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py index fb12dc2..b2225fa 100644 --- a/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py +++ b/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py @@ -8,7 +8,7 @@ "retrying", "tqdm",], }, - "no_setup_required": False, + "requires_setup": True, "configs": { "access_token": {"default": None, "help": "a valid instagrapi-api token"}, "api_endpoint": {"default": None, "help": "API endpoint to use"}, @@ -25,5 +25,22 @@ "help": "if true, will remove empty values from the 
json output", }, }, - "description": "", + "description": """ +Archives various types of Instagram content using the Instagrapi API. + +### Features +- Connects to an Instagrapi API deployment to fetch Instagram profiles, posts, stories, highlights, reels, and tagged content. +- Supports advanced configuration options, including: + - Full profile download (all posts, stories, highlights, and tagged content). + - Limiting the number of posts to fetch for large profiles. + - Minimising JSON output to remove empty fields and redundant data. +- Provides robust error handling and retries for API calls. +- Ensures efficient media scraping, including handling nested or carousel media items. +- Adds downloaded media and metadata to the result for further processing. + +### Notes +- Requires a valid Instagrapi API token (`access_token`) and API endpoint (`api_endpoint`). +- Full-profile downloads can be limited by setting `full_profile_max_posts`. +- Designed to fetch content in batches for large profiles, minimising API load. 
+""", } diff --git a/src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py b/src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py index cc6e074..dc3f1ec 100644 --- a/src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py +++ b/src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py @@ -45,25 +45,6 @@ class InstagramAPIArchiver(Archiver): self.full_profile = bool(self.full_profile) self.minimize_json_output = bool(self.minimize_json_output) - @staticmethod - def configs() -> dict: - return { - "access_token": {"default": None, "help": "a valid instagrapi-api token"}, - "api_endpoint": {"default": None, "help": "API endpoint to use"}, - "full_profile": { - "default": False, - "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.", - }, - "full_profile_max_posts": { - "default": 0, - "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. 
limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights", - }, - "minimize_json_output": { - "default": True, - "help": "if true, will remove empty values from the json output", - }, - } - def download(self, item: Metadata) -> Metadata: url = item.get_url() diff --git a/src/auto_archiver/modules/instagram_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_archiver/__manifest__.py index bb143b3..44cd7bb 100644 --- a/src/auto_archiver/modules/instagram_archiver/__manifest__.py +++ b/src/auto_archiver/modules/instagram_archiver/__manifest__.py @@ -3,10 +3,12 @@ "type": ["extractor"], "entry_point": "instagram_archiver:InstagramArchiver", "external_dependencies": { - "python": ["instaloader", - "loguru",], + "python": [ + "instaloader", + "loguru", + ], }, - "no_setup_required": False, + "requires_setup": True, "configs": { "username": {"default": None, "help": "a valid Instagram username"}, "password": { diff --git a/src/auto_archiver/modules/instagram_archiver/instagram_archiver.py b/src/auto_archiver/modules/instagram_archiver/instagram_archiver.py index 4cf001d..7daf291 100644 --- a/src/auto_archiver/modules/instagram_archiver/instagram_archiver.py +++ b/src/auto_archiver/modules/instagram_archiver/instagram_archiver.py @@ -45,16 +45,7 @@ class InstagramArchiver(Archiver): except Exception as e2: logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}") - @staticmethod - def configs() -> dict: - return { - "username": {"default": None, "help": "a valid Instagram username"}, - "password": {"default": None, "help": "the corresponding Instagram account password"}, - "download_folder": {"default": "instaloader", "help": "name of a folder to temporarily download content to"}, - "session_file": {"default": "secrets/instaloader.session", "help": "path to the instagram session which saves session credentials"}, - #TODO: fine-grain - # "download_stories": {"default": True, "help": 
"if the link is to a user profile: whether to get stories information"}, - } + def download(self, item: Metadata) -> Metadata: url = item.get_url() diff --git a/src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py b/src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py index 9fdc208..3423010 100644 --- a/src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py +++ b/src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py @@ -34,15 +34,6 @@ class InstagramTbotArchiver(Archiver): self.assert_valid_string("api_hash") self.timeout = int(self.timeout) - @staticmethod - def configs() -> dict: - return { - "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"}, - "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"}, - "session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."}, - "timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."}, - } - def setup(self) -> None: """ 1. 
makes a copy of session_file that is removed in cleanup diff --git a/src/auto_archiver/modules/meta_enricher/__init__.py b/src/auto_archiver/modules/meta_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/meta_enricher/__manifest__.py b/src/auto_archiver/modules/meta_enricher/__manifest__.py new file mode 100644 index 0000000..10acf71 --- /dev/null +++ b/src/auto_archiver/modules/meta_enricher/__manifest__.py @@ -0,0 +1,22 @@ +{ + "name": "Archive Metadata Enricher", + "type": ["enricher"], + "requires_setup": False, + "external_dependencies": { + "python": ["loguru"], + }, + "description": """ + Adds metadata information about the archive operations, Adds metadata about archive operations, including file sizes and archive duration./ + To be included at the end of all enrichments. + + ### Features +- Calculates the total size of all archived media files, storing the result in human-readable and byte formats. +- Computes the duration of the archival process, storing the elapsed time in seconds. +- Ensures all enrichments are performed only if the `Metadata` object contains valid data. +- Adds detailed metadata to provide insights into file sizes and archival performance. + +### Notes +- Skips enrichment if no media or metadata is available in the `Metadata` object. +- File sizes are calculated using the `os.stat` module, ensuring accurate byte-level reporting. +""", +} diff --git a/src/auto_archiver/enrichers/meta_enricher.py b/src/auto_archiver/modules/meta_enricher/meta_enricher.py similarity index 93% rename from src/auto_archiver/enrichers/meta_enricher.py rename to src/auto_archiver/modules/meta_enricher/meta_enricher.py index b721bb5..ab0e73d 100644 --- a/src/auto_archiver/enrichers/meta_enricher.py +++ b/src/auto_archiver/modules/meta_enricher/meta_enricher.py @@ -2,8 +2,8 @@ import datetime import os from loguru import logger -from . 
import Enricher -from ..core import Metadata +from auto_archiver.enrichers import Enricher +from auto_archiver.core import Metadata class MetaEnricher(Enricher): @@ -17,10 +17,6 @@ class MetaEnricher(Enricher): # without this STEP.__init__ is not called super().__init__(config) - @staticmethod - def configs() -> dict: - return {} - def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() if to_enrich.is_empty(): @@ -28,7 +24,7 @@ class MetaEnricher(Enricher): return logger.debug(f"calculating archive metadata information for {url=}") - + self.enrich_file_sizes(to_enrich) self.enrich_archive_duration(to_enrich) @@ -40,10 +36,10 @@ class MetaEnricher(Enricher): media.set("bytes", file_stats.st_size) media.set("size", self.human_readable_bytes(file_stats.st_size)) total_size += file_stats.st_size - + to_enrich.set("total_bytes", total_size) to_enrich.set("total_size", self.human_readable_bytes(total_size)) - + def human_readable_bytes(self, size: int) -> str: # receives number of bytes and returns human readble size diff --git a/src/auto_archiver/modules/metadata_enricher/__init__.py b/src/auto_archiver/modules/metadata_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/metadata_enricher/__manifest__.py b/src/auto_archiver/modules/metadata_enricher/__manifest__.py new file mode 100644 index 0000000..bfc9b75 --- /dev/null +++ b/src/auto_archiver/modules/metadata_enricher/__manifest__.py @@ -0,0 +1,22 @@ +{ + "name": "Media Metadata Enricher", + "type": ["enricher"], + "requires_setup": False, + "external_dependencies": { + "python": ["loguru"], + "bin": ["exiftool"] + + }, + "description": """ + Extracts metadata information from files using ExifTool. + + ### Features + - Uses ExifTool to extract detailed metadata from media files. + - Processes file-specific data like camera settings, geolocation, timestamps, and other embedded metadata. 
+ - Adds extracted metadata to the corresponding `Media` object within the `Metadata`. + + ### Notes + - Requires ExifTool to be installed and accessible via the system's PATH. + - Skips enrichment for files where metadata extraction fails. + """ +} diff --git a/src/auto_archiver/enrichers/metadata_enricher.py b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py similarity index 92% rename from src/auto_archiver/enrichers/metadata_enricher.py rename to src/auto_archiver/modules/metadata_enricher/metadata_enricher.py index 9fe257e..5887d16 100644 --- a/src/auto_archiver/enrichers/metadata_enricher.py +++ b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py @@ -2,8 +2,8 @@ import subprocess import traceback from loguru import logger -from . import Enricher -from ..core import Metadata +from auto_archiver.enrichers import Enricher +from auto_archiver.core import Metadata class MetadataEnricher(Enricher): @@ -16,9 +16,6 @@ class MetadataEnricher(Enricher): # without this STEP.__init__ is not called super().__init__(config) - @staticmethod - def configs() -> dict: - return {} def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() diff --git a/src/auto_archiver/modules/pdq_hash_enricher/__init__.py b/src/auto_archiver/modules/pdq_hash_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py b/src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py new file mode 100644 index 0000000..7b418b1 --- /dev/null +++ b/src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py @@ -0,0 +1,21 @@ +{ + "name": "PDQ Hash Enricher", + "type": ["enricher"], + "requires_setup": False, + "external_dependencies": { + "python": ["loguru", "pdqhash", "numpy", "Pillow"], + }, + "description": """ + PDQ Hash Enricher for generating perceptual hashes of media files. + + ### Features + - Calculates perceptual hashes for image files using the PDQ hashing algorithm. 
+ - Enables detection of duplicate or near-duplicate visual content. + - Processes images stored in `Metadata` objects, adding computed hashes to the corresponding `Media` entries. + - Skips non-image media or files unsuitable for hashing (e.g., corrupted or unsupported formats). + + ### Notes + - Best used after enrichers like `thumbnail_enricher` or `screenshot_enricher` to ensure images are available. + - Uses the `pdqhash` library to compute 256-bit perceptual hashes, which are stored as hexadecimal strings. + """ +} diff --git a/src/auto_archiver/enrichers/pdq_hash_enricher.py b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py similarity index 95% rename from src/auto_archiver/enrichers/pdq_hash_enricher.py rename to src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py index 36f793d..e3e9d10 100644 --- a/src/auto_archiver/enrichers/pdq_hash_enricher.py +++ b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py @@ -16,8 +16,8 @@ import numpy as np from PIL import Image, UnidentifiedImageError from loguru import logger -from . 
import Enricher -from ..core import Metadata +from auto_archiver.enrichers import Enricher +from auto_archiver.core import Metadata class PdqHashEnricher(Enricher): @@ -31,10 +31,6 @@ class PdqHashEnricher(Enricher): # Without this STEP.__init__ is not called super().__init__(config) - @staticmethod - def configs() -> dict: - return {} - def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() logger.debug(f"calculating perceptual hashes for {url=}") diff --git a/src/auto_archiver/modules/screenshot_enricher/__init__.py b/src/auto_archiver/modules/screenshot_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/screenshot_enricher/__manifest__.py b/src/auto_archiver/modules/screenshot_enricher/__manifest__.py new file mode 100644 index 0000000..c1a30e7 --- /dev/null +++ b/src/auto_archiver/modules/screenshot_enricher/__manifest__.py @@ -0,0 +1,30 @@ +{ + "name": "Screenshot Enricher", + "type": ["enricher"], + "requires_setup": True, + "external_dependencies": { + "python": ["loguru", "selenium"], + "bin": ["chromedriver"] + }, + "configs": { + "width": {"default": 1280, "help": "width of the screenshots"}, + "height": {"default": 720, "help": "height of the screenshots"}, + "timeout": {"default": 60, "help": "timeout for taking the screenshot"}, + "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}, + "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"}, + "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"}, + "print_options": {"default": {}, "help": "options to pass to the pdf printer"} + }, + "description": """ + Captures screenshots and optionally saves web pages as PDFs using a WebDriver. 
+ + ### Features + - Takes screenshots of web pages, with configurable width, height, and timeout settings. + - Optionally saves pages as PDFs, with additional configuration for PDF printing options. + - Bypasses URLs detected as authentication walls. + - Integrates seamlessly with the metadata enrichment pipeline, adding screenshots and PDFs as media. + + ### Notes + - Requires a WebDriver (e.g., ChromeDriver) installed and accessible via the system's PATH. + """ +} diff --git a/src/auto_archiver/enrichers/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py similarity index 59% rename from src/auto_archiver/enrichers/screenshot_enricher.py rename to src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py index b2ef096..dd1d38a 100644 --- a/src/auto_archiver/enrichers/screenshot_enricher.py +++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py @@ -5,24 +5,30 @@ import base64 from selenium.common.exceptions import TimeoutException -from . import Enricher -from ..utils import Webdriver, UrlUtil, random_str -from ..core import Media, Metadata, ArchivingContext +from auto_archiver.enrichers import Enricher +from auto_archiver.utils import Webdriver, UrlUtil, random_str +from auto_archiver.core import Media, Metadata, ArchivingContext class ScreenshotEnricher(Enricher): name = "screenshot_enricher" - @staticmethod - def configs() -> dict: - return { - "width": {"default": 1280, "help": "width of the screenshots"}, - "height": {"default": 720, "help": "height of the screenshots"}, - "timeout": {"default": 60, "help": "timeout for taking the screenshot"}, - "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}, - "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"}, - "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. 
PDF saving options can be adjusted with the 'print_options' parameter"}, - "print_options": {"default": {}, "help": "options to pass to the pdf printer"} - } + def __init__(self, config: dict) -> None: + super().__init__(config) + # TODO? + + + + # @staticmethod + # def configs() -> dict: + # return { + # "width": {"default": 1280, "help": "width of the screenshots"}, + # "height": {"default": 720, "help": "height of the screenshots"}, + # "timeout": {"default": 60, "help": "timeout for taking the screenshot"}, + # "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}, + # "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"}, + # "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"}, + # "print_options": {"default": {}, "help": "options to pass to the pdf printer"} + # } def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() diff --git a/src/auto_archiver/modules/ssl_enricher/__init__.py b/src/auto_archiver/modules/ssl_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/ssl_enricher/__manifest__.py b/src/auto_archiver/modules/ssl_enricher/__manifest__.py new file mode 100644 index 0000000..f44fc94 --- /dev/null +++ b/src/auto_archiver/modules/ssl_enricher/__manifest__.py @@ -0,0 +1,22 @@ +{ + "name": "SSL Certificate Enricher", + "type": ["enricher"], + "requires_setup": False, + "external_dependencies": { + "python": ["loguru", "python-slugify"], + }, + "configs": { + "skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"}, + }, + "description": """ + Retrieves SSL certificate information for a domain and stores it as a file. 
+ + ### Features + - Fetches SSL certificates for domains using the HTTPS protocol. + - Stores certificates in PEM format and adds them as media to the metadata. + - Skips enrichment if no media has been archived, based on the `skip_when_nothing_archived` configuration. + + ### Notes + - Requires the target URL to use the HTTPS scheme; other schemes are not supported. + """ +} diff --git a/src/auto_archiver/enrichers/ssl_enricher.py b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py similarity index 73% rename from src/auto_archiver/enrichers/ssl_enricher.py rename to src/auto_archiver/modules/ssl_enricher/ssl_enricher.py index 396df2e..0474d8f 100644 --- a/src/auto_archiver/enrichers/ssl_enricher.py +++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py @@ -3,8 +3,8 @@ from slugify import slugify from urllib.parse import urlparse from loguru import logger -from . import Enricher -from ..core import Metadata, ArchivingContext, Media +from auto_archiver.enrichers import Enricher +from auto_archiver.core import Metadata, ArchivingContext, Media class SSLEnricher(Enricher): @@ -15,13 +15,7 @@ class SSLEnricher(Enricher): def __init__(self, config: dict) -> None: super().__init__(config) - self. 
skip_when_nothing_archived = bool(self.skip_when_nothing_archived) - - @staticmethod - def configs() -> dict: - return { - "skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"}, - } + self.skip_when_nothing_archived = bool(self.skip_when_nothing_archived) def enrich(self, to_enrich: Metadata) -> None: if not to_enrich.media and self.skip_when_nothing_archived: return diff --git a/src/auto_archiver/modules/telegram_archiver/telegram_archiver.py b/src/auto_archiver/modules/telegram_archiver/telegram_archiver.py index c793095..c5e5ef0 100644 --- a/src/auto_archiver/modules/telegram_archiver/telegram_archiver.py +++ b/src/auto_archiver/modules/telegram_archiver/telegram_archiver.py @@ -16,9 +16,6 @@ class TelegramArchiver(Archiver): def __init__(self, config: dict) -> None: super().__init__(config) - @staticmethod - def configs() -> dict: - return {} def download(self, item: Metadata) -> Metadata: url = item.get_url() diff --git a/src/auto_archiver/modules/telethon_archiver/__manifest__.py b/src/auto_archiver/modules/telethon_archiver/__manifest__.py index e7359d7..d44acf3 100644 --- a/src/auto_archiver/modules/telethon_archiver/__manifest__.py +++ b/src/auto_archiver/modules/telethon_archiver/__manifest__.py @@ -21,7 +21,7 @@ "default": {}, "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup", # TODO - #"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) + "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) } }, "description": """ diff --git a/src/auto_archiver/modules/telethon_archiver/telethon_archiver.py b/src/auto_archiver/modules/telethon_archiver/telethon_archiver.py index 89668f3..fc89c9e 100644 --- 
a/src/auto_archiver/modules/telethon_archiver/telethon_archiver.py +++ b/src/auto_archiver/modules/telethon_archiver/telethon_archiver.py @@ -23,20 +23,6 @@ class TelethonArchiver(Archiver): self.assert_valid_string("api_id") self.assert_valid_string("api_hash") - @staticmethod - def configs() -> dict: - return { - "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"}, - "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"}, - "bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"}, - "session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."}, - "join_channels": {"default": True, "help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"}, - "channel_invites": { - "default": {}, - "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup", - "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) - } - } def setup(self) -> None: """ diff --git a/src/auto_archiver/modules/thumbnail_enricher/__init__.py b/src/auto_archiver/modules/thumbnail_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/thumbnail_enricher/__manifest__.py b/src/auto_archiver/modules/thumbnail_enricher/__manifest__.py new file mode 100644 index 0000000..2b0f167 --- /dev/null +++ b/src/auto_archiver/modules/thumbnail_enricher/__manifest__.py @@ -0,0 +1,27 @@ +{ + "name": "Thumbnail Enricher", + "type": ["enricher"], + "requires_setup": False, + "external_dependencies": { + 
"python": ["loguru", "ffmpeg-python"], + "bin": ["ffmpeg"] + }, + "configs": { + "thumbnails_per_minute": {"default": 60, "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"}, + "max_thumbnails": {"default": 16, "help": "limit the number of thumbnails to generate per video, 0 means no limit"}, + }, + "description": """ + Generates thumbnails for video files to provide visual previews. + + ### Features + - Processes video files and generates evenly distributed thumbnails. + - Calculates the number of thumbnails based on video duration, `thumbnails_per_minute`, and `max_thumbnails`. + - Distributes thumbnails equally across the video's duration and stores them as media objects. + - Adds metadata for each thumbnail, including timestamps and IDs. + + ### Notes + - Requires `ffmpeg` to be installed and accessible via the system's PATH. + - Handles videos without pre-existing duration metadata by probing with `ffmpeg`. + - Skips enrichment for non-video media files. + """ +} diff --git a/src/auto_archiver/enrichers/thumbnail_enricher.py b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py similarity index 86% rename from src/auto_archiver/enrichers/thumbnail_enricher.py rename to src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py index 5d8bee2..3edd40c 100644 --- a/src/auto_archiver/enrichers/thumbnail_enricher.py +++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py @@ -9,9 +9,9 @@ and identify important moments without watching the entire video. import ffmpeg, os from loguru import logger -from . 
import Enricher -from ..core import Media, Metadata, ArchivingContext -from ..utils.misc import random_str +from auto_archiver.enrichers import Enricher +from auto_archiver.core import Media, Metadata, ArchivingContext +from auto_archiver.utils.misc import random_str class ThumbnailEnricher(Enricher): @@ -25,13 +25,6 @@ class ThumbnailEnricher(Enricher): super().__init__(config) self.thumbnails_per_second = int(self.thumbnails_per_minute) / 60 self.max_thumbnails = int(self.max_thumbnails) - - @staticmethod - def configs() -> dict: - return { - "thumbnails_per_minute": {"default": 60, "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"}, - "max_thumbnails": {"default": 16, "help": "limit the number of thumbnails to generate per video, 0 means no limit"}, - } def enrich(self, to_enrich: Metadata) -> None: """ diff --git a/src/auto_archiver/modules/timestamping_enricher/__init__.py b/src/auto_archiver/modules/timestamping_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py new file mode 100644 index 0000000..a66cc31 --- /dev/null +++ b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py @@ -0,0 +1,40 @@ +{ + "name": "Timestamping Enricher", + "type": ["enricher"], + "requires_setup": True, + "external_dependencies": { + "python": [ + "loguru", + "slugify", + "tsp_client", + "asn1crypto", + "certvalidator", + "certifi" + ], + }, + "configs": { + "tsa_urls": { + "default": [ + "http://timestamp.digicert.com", + "http://timestamp.identrust.com", + "http://timestamp.globalsign.com/tsa/r6advanced1", + "http://tss.accv.es:8318/tsa" + ], + "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.", + "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + } + }, + "description": """ + Generates 
RFC3161-compliant timestamp tokens using Time Stamp Authorities (TSA) for archived files. + + ### Features + - Creates timestamp tokens to prove the existence of files at a specific time, useful for legal and authenticity purposes. + - Aggregates file hashes into a text file and timestamps the concatenated data. + - Uses multiple Time Stamp Authorities (TSAs) to ensure reliability and redundancy. + - Validates timestamping certificates against trusted Certificate Authorities (CAs) using the `certifi` trust store. + + ### Notes + - Should be run after the `hash_enricher` to ensure file hashes are available. + - Requires internet access to interact with the configured TSAs. + """ +} diff --git a/src/auto_archiver/enrichers/timestamping_enricher.py b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py similarity index 72% rename from src/auto_archiver/enrichers/timestamping_enricher.py rename to src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py index dffa1a3..a9cf753 100644 --- a/src/auto_archiver/enrichers/timestamping_enricher.py +++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py @@ -8,9 +8,9 @@ from certvalidator import CertificateValidator, ValidationContext from asn1crypto import pem import certifi -from . 
import Enricher -from ..core import Metadata, ArchivingContext, Media -from ..archivers import Archiver +from auto_archiver.enrichers import Enricher +from auto_archiver.core import Metadata, ArchivingContext, Media +from auto_archiver.archivers import Archiver class TimestampingEnricher(Enricher): @@ -26,36 +26,36 @@ class TimestampingEnricher(Enricher): def __init__(self, config: dict) -> None: super().__init__(config) - @staticmethod - def configs() -> dict: - return { - "tsa_urls": { - "default": [ - # [Adobe Approved Trust List] and [Windows Cert Store] - "http://timestamp.digicert.com", - "http://timestamp.identrust.com", - # "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping - # "https://timestamp.sectigo.com", # wait 15 seconds between each request. - - # [Adobe: European Union Trusted Lists]. - # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request. - - # [Windows Cert Store] - "http://timestamp.globalsign.com/tsa/r6advanced1", - - # [Adobe: European Union Trusted Lists] and [Windows Cert Store] - # "http://ts.quovadisglobal.com/eu", # not valid for timestamping - # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain - # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain - # "http://tsa.sep.bg", # self-signed certificate in certificate chain - # "http://tsa.izenpe.com", #unable to get local issuer certificate - # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate - "http://tss.accv.es:8318/tsa", - ], - "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.", - "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) - } - } + # @staticmethod + # def configs() -> dict: + # return { + # "tsa_urls": { + # "default": [ + # # [Adobe Approved Trust List] and [Windows Cert Store] + # "http://timestamp.digicert.com", + # "http://timestamp.identrust.com", + # # 
"https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping + # # "https://timestamp.sectigo.com", # wait 15 seconds between each request. + # + # # [Adobe: European Union Trusted Lists]. + # # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request. + # + # # [Windows Cert Store] + # "http://timestamp.globalsign.com/tsa/r6advanced1", + # + # # [Adobe: European Union Trusted Lists] and [Windows Cert Store] + # # "http://ts.quovadisglobal.com/eu", # not valid for timestamping + # # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain + # # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain + # # "http://tsa.sep.bg", # self-signed certificate in certificate chain + # # "http://tsa.izenpe.com", #unable to get local issuer certificate + # # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate + # "http://tss.accv.es:8318/tsa", + # ], + # "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.", + # "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) + # } + # } def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() diff --git a/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py b/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py index 203eee9..5dc7364 100644 --- a/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py +++ b/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py @@ -12,7 +12,8 @@ }, "configs": { "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"}, - "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, 
access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line"}, + "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line", + "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))}, "consumer_key": {"default": None, "help": "twitter API consumer_key"}, "consumer_secret": {"default": None, "help": "twitter API consumer_secret"}, "access_token": {"default": None, "help": "twitter API access_token"}, diff --git a/src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py b/src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py index eb607cc..9c931ef 100644 --- a/src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py +++ b/src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py @@ -34,17 +34,6 @@ class TwitterApiArchiver(Archiver): access_token=self.access_token, access_secret=self.access_secret)) assert self.api_client is not None, "Missing Twitter API configurations, please provide either AND/OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver, you can provide both for better rate-limit results." - @staticmethod - def configs() -> dict: - return { - "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"}, - "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. 
CSV of bearer tokens if provided via the command line", "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))}, - "consumer_key": {"default": None, "help": "twitter API consumer_key"}, - "consumer_secret": {"default": None, "help": "twitter API consumer_secret"}, - "access_token": {"default": None, "help": "twitter API access_token"}, - "access_secret": {"default": None, "help": "twitter API access_secret"}, - } - @property # getter .mimetype def api_client(self) -> str: return self.apis[self.api_index] diff --git a/src/auto_archiver/modules/vk_archiver/vk_archiver.py b/src/auto_archiver/modules/vk_archiver/vk_archiver.py index 3cfb446..7ba7a68 100644 --- a/src/auto_archiver/modules/vk_archiver/vk_archiver.py +++ b/src/auto_archiver/modules/vk_archiver/vk_archiver.py @@ -19,14 +19,6 @@ class VkArchiver(Archiver): self.assert_valid_string("password") self.vks = VkScraper(self.username, self.password, session_file=self.session_file) - @staticmethod - def configs() -> dict: - return { - "username": {"default": None, "help": "valid VKontakte username"}, - "password": {"default": None, "help": "valid VKontakte password"}, - "session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"}, - } - def download(self, item: Metadata) -> Metadata: url = item.get_url() diff --git a/src/auto_archiver/modules/wacz_enricher/__init__.py b/src/auto_archiver/modules/wacz_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/wacz_enricher/__manifest__.py b/src/auto_archiver/modules/wacz_enricher/__manifest__.py new file mode 100644 index 0000000..07983d9 --- /dev/null +++ b/src/auto_archiver/modules/wacz_enricher/__manifest__.py @@ -0,0 +1,39 @@ +{ + "name": "WACZ Enricher", + "type": ["enricher", "archiver"], + "requires_setup": True, + "external_dependencies": { + "python": [ + "loguru", + "jsonlines", + "warcio" + ], + # TODO? 
+ "bin": [ + "docker" + ] + }, + "configs": { + "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."}, + "docker_commands": {"default": None, "help":"if a custom docker invocation is needed"}, + "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"}, + "extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."}, + "extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."}, + "socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"}, + "socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"}, + "proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"}, + }, + "description": """ + Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving. + + ### Features + - Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`. + - Supports custom profiles for archiving private or dynamic content. + - Extracts media (images, videos, audio) and screenshots from the archive, optionally adding them to the enrichment pipeline. + - Generates metadata from the archived page's content and structure (e.g., titles, text). + + ### Notes + - Requires Docker for running `browsertrix-crawler` unless explicitly disabled. + - Configurable via parameters for timeout, media extraction, screenshots, and proxy settings. 
+ """ +} diff --git a/src/auto_archiver/enrichers/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py similarity index 87% rename from src/auto_archiver/enrichers/wacz_enricher.py rename to src/auto_archiver/modules/wacz_enricher/wacz_enricher.py index dc38488..124382b 100644 --- a/src/auto_archiver/enrichers/wacz_enricher.py +++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py @@ -5,10 +5,10 @@ from zipfile import ZipFile from loguru import logger from warcio.archiveiterator import ArchiveIterator -from ..core import Media, Metadata, ArchivingContext -from . import Enricher -from ..archivers import Archiver -from ..utils import UrlUtil, random_str +from auto_archiver.core import Media, Metadata, ArchivingContext +from auto_archiver.enrichers import Enricher +from auto_archiver.archivers import Archiver +from auto_archiver.utils import UrlUtil, random_str class WaczArchiverEnricher(Enricher, Archiver): @@ -24,19 +24,6 @@ class WaczArchiverEnricher(Enricher, Archiver): # without this STEP.__init__ is not called super().__init__(config) - @staticmethod - def configs() -> dict: - return { - "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."}, - "docker_commands": {"default": None, "help":"if a custom docker invocation is needed"}, - "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"}, - "extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."}, - "extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. 
The .wacz file will be kept untouched."}, - "socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"}, - "socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"}, - "proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"}, - } - def setup(self) -> None: self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER') self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER') diff --git a/src/auto_archiver/modules/wayback_enricher/__init__.py b/src/auto_archiver/modules/wayback_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/wayback_enricher/__manifest__.py b/src/auto_archiver/modules/wayback_enricher/__manifest__.py new file mode 100644 index 0000000..b3af284 --- /dev/null +++ b/src/auto_archiver/modules/wayback_enricher/__manifest__.py @@ -0,0 +1,29 @@ +{ + "name": "Wayback Machine Enricher", + "type": ["enricher", "archiver"], + "requires_setup": True, + "external_dependencies": { + "python": ["loguru", "requests"], + }, + "configs": { + "timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."}, + "if_not_archived_within": {"default": None, "help": "only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA"}, + "key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"}, + "secret": {"default": None, "help": "wayback API secret. 
to get credentials visit https://archive.org/account/s3.php"}, + "proxy_http": {"default": None, "help": "http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port"}, + "proxy_https": {"default": None, "help": "https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port"}, + }, + "description": """ + Submits the current URL to the Wayback Machine for archiving and returns either a job ID or the completed archive URL. + + ### Features + - Archives URLs using the Internet Archive's Wayback Machine API. + - Supports conditional archiving based on the existence of prior archives within a specified time range. + - Provides proxies for HTTP and HTTPS requests. + - Fetches and confirms the archive URL or provides a job ID for later status checks. + + ### Notes + - Requires a valid Wayback Machine API key and secret. + - Handles rate-limiting by Wayback Machine and retries status checks with exponential backoff. + """ +} diff --git a/src/auto_archiver/enrichers/wayback_enricher.py b/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py similarity index 78% rename from src/auto_archiver/enrichers/wayback_enricher.py rename to src/auto_archiver/modules/wayback_enricher/wayback_enricher.py index 305bfcf..8ddec82 100644 --- a/src/auto_archiver/enrichers/wayback_enricher.py +++ b/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py @@ -2,10 +2,10 @@ import json from loguru import logger import time, requests -from . 
import Enricher -from ..archivers import Archiver -from ..utils import UrlUtil -from ..core import Metadata +from auto_archiver.enrichers import Enricher +from auto_archiver.archivers import Archiver +from auto_archiver.utils import UrlUtil +from auto_archiver.core import Metadata class WaybackArchiverEnricher(Enricher, Archiver): """ @@ -21,17 +21,6 @@ class WaybackArchiverEnricher(Enricher, Archiver): assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API key" assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API secret" - @staticmethod - def configs() -> dict: - return { - "timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."}, - "if_not_archived_within": {"default": None, "help": "only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA"}, - "key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"}, - "secret": {"default": None, "help": "wayback API secret. 
to get credentials visit https://archive.org/account/s3.php"}, - "proxy_http": {"default": None, "help": "http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port"}, - "proxy_https": {"default": None, "help": "https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port"}, - } - def download(self, item: Metadata) -> Metadata: # this new Metadata object is required to avoid duplication result = Metadata() diff --git a/src/auto_archiver/modules/whisper_enricher/__init__.py b/src/auto_archiver/modules/whisper_enricher/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/whisper_enricher/__manifest__.py b/src/auto_archiver/modules/whisper_enricher/__manifest__.py new file mode 100644 index 0000000..25eae25 --- /dev/null +++ b/src/auto_archiver/modules/whisper_enricher/__manifest__.py @@ -0,0 +1,30 @@ +{ + "name": "Whisper Enricher", + "type": ["enricher"], + "requires_setup": True, + "external_dependencies": { + "python": ["loguru", "requests"], + }, + "configs": { + "api_endpoint": {"default": None, "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."}, + "api_key": {"default": None, "help": "WhisperApi api key for authentication"}, + "include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."}, + "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."}, + "action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]}, + }, + "description": """ + Integrates with a Whisper API service to transcribe, translate, or detect the language of audio and video files. + + ### Features + - Submits audio or video files to a Whisper API deployment for processing. 
+ - Supports operations such as transcription, translation, and language detection. + - Optionally generates SRT subtitle files for video content. + - Integrates with S3-compatible storage systems to make files publicly accessible for processing. + - Handles job submission, status checking, artifact retrieval, and cleanup. + + ### Notes + - Requires a Whisper API endpoint and API key for authentication. + - Only compatible with S3-compatible storage systems for media file accessibility. + - Handles multiple jobs and retries for failed or incomplete processing. + """ +} diff --git a/src/auto_archiver/enrichers/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py similarity index 87% rename from src/auto_archiver/enrichers/whisper_enricher.py rename to src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index c0089a4..f6294f3 100644 --- a/src/auto_archiver/enrichers/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -2,9 +2,9 @@ import traceback import requests, time from loguru import logger -from . 
import Enricher -from ..core import Metadata, Media, ArchivingContext -from ..storages import S3Storage +from auto_archiver.enrichers import Enricher +from auto_archiver.core import Metadata, Media, ArchivingContext +from auto_archiver.storages import S3Storage class WhisperEnricher(Enricher): @@ -22,17 +22,6 @@ class WhisperEnricher(Enricher): assert type(self.api_key) == str and len(self.api_key) > 0, "please provide a value for the whisper_enricher api_key" self.timeout = int(self.timeout) - @staticmethod - def configs() -> dict: - return { - "api_endpoint": {"default": None, "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."}, - "api_key": {"default": None, "help": "WhisperApi api key for authentication"}, - "include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."}, - "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."}, - "action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]}, - - } - def enrich(self, to_enrich: Metadata) -> None: if not self._get_s3_storage(): logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.") diff --git a/tests/databases/test_csv_db.py b/tests/databases/test_csv_db.py index 4395ef0..989f1e9 100644 --- a/tests/databases/test_csv_db.py +++ b/tests/databases/test_csv_db.py @@ -1,5 +1,5 @@ -from auto_archiver.databases.csv_db import CSVDb +from auto_archiver.modules.csv_db import CSVDb from auto_archiver.core import Metadata diff --git a/tests/enrichers/test_hash_enricher.py b/tests/enrichers/test_hash_enricher.py index 99f8117..1477cde 100644 --- a/tests/enrichers/test_hash_enricher.py +++ 
b/tests/enrichers/test_hash_enricher.py @@ -1,6 +1,6 @@ import pytest -from auto_archiver.enrichers.hash_enricher import HashEnricher +from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.core import Metadata, Media @pytest.mark.parametrize("algorithm, filename, expected_hash", [