mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 03:18:28 +03:00
Set up feeder manifests (not merged by source yet)
This commit is contained in:
188
poetry.lock
generated
188
poetry.lock
generated
@@ -152,34 +152,34 @@ lxml = ["lxml"]
|
||||
|
||||
[[package]]
|
||||
name = "boto3"
|
||||
version = "1.35.99"
|
||||
version = "1.36.3"
|
||||
description = "The AWS SDK for Python"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "boto3-1.35.99-py3-none-any.whl", hash = "sha256:83e560faaec38a956dfb3d62e05e1703ee50432b45b788c09e25107c5058bd71"},
|
||||
{file = "boto3-1.35.99.tar.gz", hash = "sha256:e0abd794a7a591d90558e92e29a9f8837d25ece8e3c120e530526fe27eba5fca"},
|
||||
{file = "boto3-1.36.3-py3-none-any.whl", hash = "sha256:f9843a5d06f501d66ada06f5a5417f671823af2cf319e36ceefa1bafaaaaa953"},
|
||||
{file = "boto3-1.36.3.tar.gz", hash = "sha256:53a5307f6a3526ee2f8590e3c45efa504a3ea4532c1bfe4926c0c19bf188d141"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
botocore = ">=1.35.99,<1.36.0"
|
||||
botocore = ">=1.36.3,<1.37.0"
|
||||
jmespath = ">=0.7.1,<2.0.0"
|
||||
s3transfer = ">=0.10.0,<0.11.0"
|
||||
s3transfer = ">=0.11.0,<0.12.0"
|
||||
|
||||
[package.extras]
|
||||
crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
|
||||
|
||||
[[package]]
|
||||
name = "botocore"
|
||||
version = "1.35.99"
|
||||
version = "1.36.3"
|
||||
description = "Low-level, data-driven core of boto 3."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "botocore-1.35.99-py3-none-any.whl", hash = "sha256:b22d27b6b617fc2d7342090d6129000af2efd20174215948c0d7ae2da0fab445"},
|
||||
{file = "botocore-1.35.99.tar.gz", hash = "sha256:1eab44e969c39c5f3d9a3104a0836c24715579a455f12b3979a31d7cde51b3c3"},
|
||||
{file = "botocore-1.36.3-py3-none-any.whl", hash = "sha256:536ab828e6f90dbb000e3702ac45fd76642113ae2db1b7b1373ad24104e89255"},
|
||||
{file = "botocore-1.36.3.tar.gz", hash = "sha256:775b835e979da5c96548ed1a0b798101a145aec3cd46541d62e27dda5a94d7f8"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -188,7 +188,7 @@ python-dateutil = ">=2.1,<3.0.0"
|
||||
urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""}
|
||||
|
||||
[package.extras]
|
||||
crt = ["awscrt (==0.22.0)"]
|
||||
crt = ["awscrt (==0.23.4)"]
|
||||
|
||||
[[package]]
|
||||
name = "brotli"
|
||||
@@ -343,14 +343,14 @@ beautifulsoup4 = "*"
|
||||
|
||||
[[package]]
|
||||
name = "cachetools"
|
||||
version = "5.5.0"
|
||||
version = "5.5.1"
|
||||
description = "Extensible memoizing collections and decorators"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "cachetools-5.5.0-py3-none-any.whl", hash = "sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292"},
|
||||
{file = "cachetools-5.5.0.tar.gz", hash = "sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a"},
|
||||
{file = "cachetools-5.5.1-py3-none-any.whl", hash = "sha256:b76651fdc3b24ead3c648bbdeeb940c1b04d365b38b4af66788f9ec4a81d42bb"},
|
||||
{file = "cachetools-5.5.1.tar.gz", hash = "sha256:70f238fbba50383ef62e55c6aff6d9673175fe59f7c6782c7a0b9e38f4a9df95"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2083,32 +2083,32 @@ pyasn1 = ">=0.1.3"
|
||||
|
||||
[[package]]
|
||||
name = "s3transfer"
|
||||
version = "0.10.4"
|
||||
version = "0.11.1"
|
||||
description = "An Amazon S3 Transfer Manager"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "s3transfer-0.10.4-py3-none-any.whl", hash = "sha256:244a76a24355363a68164241438de1b72f8781664920260c48465896b712a41e"},
|
||||
{file = "s3transfer-0.10.4.tar.gz", hash = "sha256:29edc09801743c21eb5ecbc617a152df41d3c287f67b615f73e5f750583666a7"},
|
||||
{file = "s3transfer-0.11.1-py3-none-any.whl", hash = "sha256:8fa0aa48177be1f3425176dfe1ab85dcd3d962df603c3dbfc585e6bf857ef0ff"},
|
||||
{file = "s3transfer-0.11.1.tar.gz", hash = "sha256:3f25c900a367c8b7f7d8f9c34edc87e300bde424f779dc9f0a8ae4f9df9264f6"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
botocore = ">=1.33.2,<2.0a.0"
|
||||
botocore = ">=1.36.0,<2.0a.0"
|
||||
|
||||
[package.extras]
|
||||
crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"]
|
||||
crt = ["botocore[crt] (>=1.36.0,<2.0a.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "selenium"
|
||||
version = "4.27.1"
|
||||
version = "4.28.0"
|
||||
description = "Official Python bindings for Selenium WebDriver"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "selenium-4.27.1-py3-none-any.whl", hash = "sha256:b89b1f62b5cfe8025868556fe82360d6b649d464f75d2655cb966c8f8447ea18"},
|
||||
{file = "selenium-4.27.1.tar.gz", hash = "sha256:5296c425a75ff1b44d0d5199042b36a6d1ef76c04fb775b97b40be739a9caae2"},
|
||||
{file = "selenium-4.28.0-py3-none-any.whl", hash = "sha256:3d6a2e8e1b850a1078884ea19f4e011ecdc12263434d87a0b78769836fb82dd8"},
|
||||
{file = "selenium-4.28.0.tar.gz", hash = "sha256:a9fae6eef48d470a1b0c6e45185d96f0dafb025e8da4b346cc41e4da3ac54fa0"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -2617,15 +2617,15 @@ typing-extensions = ">=3.7.4"
|
||||
|
||||
[[package]]
|
||||
name = "tzdata"
|
||||
version = "2024.2"
|
||||
version = "2025.1"
|
||||
description = "Provider of IANA time zone data"
|
||||
optional = false
|
||||
python-versions = ">=2"
|
||||
groups = ["main"]
|
||||
markers = "platform_system == \"Windows\""
|
||||
files = [
|
||||
{file = "tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd"},
|
||||
{file = "tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc"},
|
||||
{file = "tzdata-2025.1-py2.py3-none-any.whl", hash = "sha256:7e127113816800496f027041c570f50bcd464a020098a3b6b199517772303639"},
|
||||
{file = "tzdata-2025.1.tar.gz", hash = "sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2868,81 +2868,81 @@ test = ["websockets"]
|
||||
|
||||
[[package]]
|
||||
name = "websockets"
|
||||
version = "14.1"
|
||||
version = "14.2"
|
||||
description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main", "docs"]
|
||||
files = [
|
||||
{file = "websockets-14.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a0adf84bc2e7c86e8a202537b4fd50e6f7f0e4a6b6bf64d7ccb96c4cd3330b29"},
|
||||
{file = "websockets-14.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90b5d9dfbb6d07a84ed3e696012610b6da074d97453bd01e0e30744b472c8179"},
|
||||
{file = "websockets-14.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2177ee3901075167f01c5e335a6685e71b162a54a89a56001f1c3e9e3d2ad250"},
|
||||
{file = "websockets-14.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f14a96a0034a27f9d47fd9788913924c89612225878f8078bb9d55f859272b0"},
|
||||
{file = "websockets-14.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f874ba705deea77bcf64a9da42c1f5fc2466d8f14daf410bc7d4ceae0a9fcb0"},
|
||||
{file = "websockets-14.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9607b9a442392e690a57909c362811184ea429585a71061cd5d3c2b98065c199"},
|
||||
{file = "websockets-14.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:bea45f19b7ca000380fbd4e02552be86343080120d074b87f25593ce1700ad58"},
|
||||
{file = "websockets-14.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:219c8187b3ceeadbf2afcf0f25a4918d02da7b944d703b97d12fb01510869078"},
|
||||
{file = "websockets-14.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ad2ab2547761d79926effe63de21479dfaf29834c50f98c4bf5b5480b5838434"},
|
||||
{file = "websockets-14.1-cp310-cp310-win32.whl", hash = "sha256:1288369a6a84e81b90da5dbed48610cd7e5d60af62df9851ed1d1d23a9069f10"},
|
||||
{file = "websockets-14.1-cp310-cp310-win_amd64.whl", hash = "sha256:e0744623852f1497d825a49a99bfbec9bea4f3f946df6eb9d8a2f0c37a2fec2e"},
|
||||
{file = "websockets-14.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:449d77d636f8d9c17952628cc7e3b8faf6e92a17ec581ec0c0256300717e1512"},
|
||||
{file = "websockets-14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a35f704be14768cea9790d921c2c1cc4fc52700410b1c10948511039be824aac"},
|
||||
{file = "websockets-14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b1f3628a0510bd58968c0f60447e7a692933589b791a6b572fcef374053ca280"},
|
||||
{file = "websockets-14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c3deac3748ec73ef24fc7be0b68220d14d47d6647d2f85b2771cb35ea847aa1"},
|
||||
{file = "websockets-14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7048eb4415d46368ef29d32133134c513f507fff7d953c18c91104738a68c3b3"},
|
||||
{file = "websockets-14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6cf0ad281c979306a6a34242b371e90e891bce504509fb6bb5246bbbf31e7b6"},
|
||||
{file = "websockets-14.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cc1fc87428c1d18b643479caa7b15db7d544652e5bf610513d4a3478dbe823d0"},
|
||||
{file = "websockets-14.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f95ba34d71e2fa0c5d225bde3b3bdb152e957150100e75c86bc7f3964c450d89"},
|
||||
{file = "websockets-14.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9481a6de29105d73cf4515f2bef8eb71e17ac184c19d0b9918a3701c6c9c4f23"},
|
||||
{file = "websockets-14.1-cp311-cp311-win32.whl", hash = "sha256:368a05465f49c5949e27afd6fbe0a77ce53082185bbb2ac096a3a8afaf4de52e"},
|
||||
{file = "websockets-14.1-cp311-cp311-win_amd64.whl", hash = "sha256:6d24fc337fc055c9e83414c94e1ee0dee902a486d19d2a7f0929e49d7d604b09"},
|
||||
{file = "websockets-14.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ed907449fe5e021933e46a3e65d651f641975a768d0649fee59f10c2985529ed"},
|
||||
{file = "websockets-14.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:87e31011b5c14a33b29f17eb48932e63e1dcd3fa31d72209848652310d3d1f0d"},
|
||||
{file = "websockets-14.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bc6ccf7d54c02ae47a48ddf9414c54d48af9c01076a2e1023e3b486b6e72c707"},
|
||||
{file = "websockets-14.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9777564c0a72a1d457f0848977a1cbe15cfa75fa2f67ce267441e465717dcf1a"},
|
||||
{file = "websockets-14.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a655bde548ca98f55b43711b0ceefd2a88a71af6350b0c168aa77562104f3f45"},
|
||||
{file = "websockets-14.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a3dfff83ca578cada2d19e665e9c8368e1598d4e787422a460ec70e531dbdd58"},
|
||||
{file = "websockets-14.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6a6c9bcf7cdc0fd41cc7b7944447982e8acfd9f0d560ea6d6845428ed0562058"},
|
||||
{file = "websockets-14.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4b6caec8576e760f2c7dd878ba817653144d5f369200b6ddf9771d64385b84d4"},
|
||||
{file = "websockets-14.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:eb6d38971c800ff02e4a6afd791bbe3b923a9a57ca9aeab7314c21c84bf9ff05"},
|
||||
{file = "websockets-14.1-cp312-cp312-win32.whl", hash = "sha256:1d045cbe1358d76b24d5e20e7b1878efe578d9897a25c24e6006eef788c0fdf0"},
|
||||
{file = "websockets-14.1-cp312-cp312-win_amd64.whl", hash = "sha256:90f4c7a069c733d95c308380aae314f2cb45bd8a904fb03eb36d1a4983a4993f"},
|
||||
{file = "websockets-14.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:3630b670d5057cd9e08b9c4dab6493670e8e762a24c2c94ef312783870736ab9"},
|
||||
{file = "websockets-14.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:36ebd71db3b89e1f7b1a5deaa341a654852c3518ea7a8ddfdf69cc66acc2db1b"},
|
||||
{file = "websockets-14.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5b918d288958dc3fa1c5a0b9aa3256cb2b2b84c54407f4813c45d52267600cd3"},
|
||||
{file = "websockets-14.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00fe5da3f037041da1ee0cf8e308374e236883f9842c7c465aa65098b1c9af59"},
|
||||
{file = "websockets-14.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8149a0f5a72ca36720981418eeffeb5c2729ea55fa179091c81a0910a114a5d2"},
|
||||
{file = "websockets-14.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:77569d19a13015e840b81550922056acabc25e3f52782625bc6843cfa034e1da"},
|
||||
{file = "websockets-14.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cf5201a04550136ef870aa60ad3d29d2a59e452a7f96b94193bee6d73b8ad9a9"},
|
||||
{file = "websockets-14.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:88cf9163ef674b5be5736a584c999e98daf3aabac6e536e43286eb74c126b9c7"},
|
||||
{file = "websockets-14.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:836bef7ae338a072e9d1863502026f01b14027250a4545672673057997d5c05a"},
|
||||
{file = "websockets-14.1-cp313-cp313-win32.whl", hash = "sha256:0d4290d559d68288da9f444089fd82490c8d2744309113fc26e2da6e48b65da6"},
|
||||
{file = "websockets-14.1-cp313-cp313-win_amd64.whl", hash = "sha256:8621a07991add373c3c5c2cf89e1d277e49dc82ed72c75e3afc74bd0acc446f0"},
|
||||
{file = "websockets-14.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:01bb2d4f0a6d04538d3c5dfd27c0643269656c28045a53439cbf1c004f90897a"},
|
||||
{file = "websockets-14.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:414ffe86f4d6f434a8c3b7913655a1a5383b617f9bf38720e7c0799fac3ab1c6"},
|
||||
{file = "websockets-14.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8fda642151d5affdee8a430bd85496f2e2517be3a2b9d2484d633d5712b15c56"},
|
||||
{file = "websockets-14.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd7c11968bc3860d5c78577f0dbc535257ccec41750675d58d8dc66aa47fe52c"},
|
||||
{file = "websockets-14.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a032855dc7db987dff813583d04f4950d14326665d7e714d584560b140ae6b8b"},
|
||||
{file = "websockets-14.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7e7ea2f782408c32d86b87a0d2c1fd8871b0399dd762364c731d86c86069a78"},
|
||||
{file = "websockets-14.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:39450e6215f7d9f6f7bc2a6da21d79374729f5d052333da4d5825af8a97e6735"},
|
||||
{file = "websockets-14.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ceada5be22fa5a5a4cdeec74e761c2ee7db287208f54c718f2df4b7e200b8d4a"},
|
||||
{file = "websockets-14.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:3fc753451d471cff90b8f467a1fc0ae64031cf2d81b7b34e1811b7e2691bc4bc"},
|
||||
{file = "websockets-14.1-cp39-cp39-win32.whl", hash = "sha256:14839f54786987ccd9d03ed7f334baec0f02272e7ec4f6e9d427ff584aeea8b4"},
|
||||
{file = "websockets-14.1-cp39-cp39-win_amd64.whl", hash = "sha256:d9fd19ecc3a4d5ae82ddbfb30962cf6d874ff943e56e0c81f5169be2fda62979"},
|
||||
{file = "websockets-14.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:e5dc25a9dbd1a7f61eca4b7cb04e74ae4b963d658f9e4f9aad9cd00b688692c8"},
|
||||
{file = "websockets-14.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:04a97aca96ca2acedf0d1f332c861c5a4486fdcba7bcef35873820f940c4231e"},
|
||||
{file = "websockets-14.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df174ece723b228d3e8734a6f2a6febbd413ddec39b3dc592f5a4aa0aff28098"},
|
||||
{file = "websockets-14.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:034feb9f4286476f273b9a245fb15f02c34d9586a5bc936aff108c3ba1b21beb"},
|
||||
{file = "websockets-14.1-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:660c308dabd2b380807ab64b62985eaccf923a78ebc572bd485375b9ca2b7dc7"},
|
||||
{file = "websockets-14.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5a42d3ecbb2db5080fc578314439b1d79eef71d323dc661aa616fb492436af5d"},
|
||||
{file = "websockets-14.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:ddaa4a390af911da6f680be8be4ff5aaf31c4c834c1a9147bc21cbcbca2d4370"},
|
||||
{file = "websockets-14.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a4c805c6034206143fbabd2d259ec5e757f8b29d0a2f0bf3d2fe5d1f60147a4a"},
|
||||
{file = "websockets-14.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:205f672a6c2c671a86d33f6d47c9b35781a998728d2c7c2a3e1cf3333fcb62b7"},
|
||||
{file = "websockets-14.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef440054124728cc49b01c33469de06755e5a7a4e83ef61934ad95fc327fbb0"},
|
||||
{file = "websockets-14.1-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7591d6f440af7f73c4bd9404f3772bfee064e639d2b6cc8c94076e71b2471c1"},
|
||||
{file = "websockets-14.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:25225cc79cfebc95ba1d24cd3ab86aaa35bcd315d12fa4358939bd55e9bd74a5"},
|
||||
{file = "websockets-14.1-py3-none-any.whl", hash = "sha256:4d4fc827a20abe6d544a119896f6b78ee13fe81cbfef416f3f2ddf09a03f0e2e"},
|
||||
{file = "websockets-14.1.tar.gz", hash = "sha256:398b10c77d471c0aab20a845e7a60076b6390bfdaac7a6d2edb0d2c59d75e8d8"},
|
||||
{file = "websockets-14.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e8179f95323b9ab1c11723e5d91a89403903f7b001828161b480a7810b334885"},
|
||||
{file = "websockets-14.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0d8c3e2cdb38f31d8bd7d9d28908005f6fa9def3324edb9bf336d7e4266fd397"},
|
||||
{file = "websockets-14.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:714a9b682deb4339d39ffa674f7b674230227d981a37d5d174a4a83e3978a610"},
|
||||
{file = "websockets-14.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2e53c72052f2596fb792a7acd9704cbc549bf70fcde8a99e899311455974ca3"},
|
||||
{file = "websockets-14.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3fbd68850c837e57373d95c8fe352203a512b6e49eaae4c2f4088ef8cf21980"},
|
||||
{file = "websockets-14.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b27ece32f63150c268593d5fdb82819584831a83a3f5809b7521df0685cd5d8"},
|
||||
{file = "websockets-14.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4daa0faea5424d8713142b33825fff03c736f781690d90652d2c8b053345b0e7"},
|
||||
{file = "websockets-14.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:bc63cee8596a6ec84d9753fd0fcfa0452ee12f317afe4beae6b157f0070c6c7f"},
|
||||
{file = "websockets-14.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7a570862c325af2111343cc9b0257b7119b904823c675b22d4ac547163088d0d"},
|
||||
{file = "websockets-14.2-cp310-cp310-win32.whl", hash = "sha256:75862126b3d2d505e895893e3deac0a9339ce750bd27b4ba515f008b5acf832d"},
|
||||
{file = "websockets-14.2-cp310-cp310-win_amd64.whl", hash = "sha256:cc45afb9c9b2dc0852d5c8b5321759cf825f82a31bfaf506b65bf4668c96f8b2"},
|
||||
{file = "websockets-14.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3bdc8c692c866ce5fefcaf07d2b55c91d6922ac397e031ef9b774e5b9ea42166"},
|
||||
{file = "websockets-14.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c93215fac5dadc63e51bcc6dceca72e72267c11def401d6668622b47675b097f"},
|
||||
{file = "websockets-14.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1c9b6535c0e2cf8a6bf938064fb754aaceb1e6a4a51a80d884cd5db569886910"},
|
||||
{file = "websockets-14.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a52a6d7cf6938e04e9dceb949d35fbdf58ac14deea26e685ab6368e73744e4c"},
|
||||
{file = "websockets-14.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9f05702e93203a6ff5226e21d9b40c037761b2cfb637187c9802c10f58e40473"},
|
||||
{file = "websockets-14.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22441c81a6748a53bfcb98951d58d1af0661ab47a536af08920d129b4d1c3473"},
|
||||
{file = "websockets-14.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:efd9b868d78b194790e6236d9cbc46d68aba4b75b22497eb4ab64fa640c3af56"},
|
||||
{file = "websockets-14.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1a5a20d5843886d34ff8c57424cc65a1deda4375729cbca4cb6b3353f3ce4142"},
|
||||
{file = "websockets-14.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:34277a29f5303d54ec6468fb525d99c99938607bc96b8d72d675dee2b9f5bf1d"},
|
||||
{file = "websockets-14.2-cp311-cp311-win32.whl", hash = "sha256:02687db35dbc7d25fd541a602b5f8e451a238ffa033030b172ff86a93cb5dc2a"},
|
||||
{file = "websockets-14.2-cp311-cp311-win_amd64.whl", hash = "sha256:862e9967b46c07d4dcd2532e9e8e3c2825e004ffbf91a5ef9dde519ee2effb0b"},
|
||||
{file = "websockets-14.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1f20522e624d7ffbdbe259c6b6a65d73c895045f76a93719aa10cd93b3de100c"},
|
||||
{file = "websockets-14.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:647b573f7d3ada919fd60e64d533409a79dcf1ea21daeb4542d1d996519ca967"},
|
||||
{file = "websockets-14.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6af99a38e49f66be5a64b1e890208ad026cda49355661549c507152113049990"},
|
||||
{file = "websockets-14.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:091ab63dfc8cea748cc22c1db2814eadb77ccbf82829bac6b2fbe3401d548eda"},
|
||||
{file = "websockets-14.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b374e8953ad477d17e4851cdc66d83fdc2db88d9e73abf755c94510ebddceb95"},
|
||||
{file = "websockets-14.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a39d7eceeea35db85b85e1169011bb4321c32e673920ae9c1b6e0978590012a3"},
|
||||
{file = "websockets-14.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0a6f3efd47ffd0d12080594f434faf1cd2549b31e54870b8470b28cc1d3817d9"},
|
||||
{file = "websockets-14.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:065ce275e7c4ffb42cb738dd6b20726ac26ac9ad0a2a48e33ca632351a737267"},
|
||||
{file = "websockets-14.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e9d0e53530ba7b8b5e389c02282f9d2aa47581514bd6049d3a7cffe1385cf5fe"},
|
||||
{file = "websockets-14.2-cp312-cp312-win32.whl", hash = "sha256:20e6dd0984d7ca3037afcb4494e48c74ffb51e8013cac71cf607fffe11df7205"},
|
||||
{file = "websockets-14.2-cp312-cp312-win_amd64.whl", hash = "sha256:44bba1a956c2c9d268bdcdf234d5e5ff4c9b6dc3e300545cbe99af59dda9dcce"},
|
||||
{file = "websockets-14.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6f1372e511c7409a542291bce92d6c83320e02c9cf392223272287ce55bc224e"},
|
||||
{file = "websockets-14.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4da98b72009836179bb596a92297b1a61bb5a830c0e483a7d0766d45070a08ad"},
|
||||
{file = "websockets-14.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8a86a269759026d2bde227652b87be79f8a734e582debf64c9d302faa1e9f03"},
|
||||
{file = "websockets-14.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86cf1aaeca909bf6815ea714d5c5736c8d6dd3a13770e885aafe062ecbd04f1f"},
|
||||
{file = "websockets-14.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9b0f6c3ba3b1240f602ebb3971d45b02cc12bd1845466dd783496b3b05783a5"},
|
||||
{file = "websockets-14.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:669c3e101c246aa85bc8534e495952e2ca208bd87994650b90a23d745902db9a"},
|
||||
{file = "websockets-14.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eabdb28b972f3729348e632ab08f2a7b616c7e53d5414c12108c29972e655b20"},
|
||||
{file = "websockets-14.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2066dc4cbcc19f32c12a5a0e8cc1b7ac734e5b64ac0a325ff8353451c4b15ef2"},
|
||||
{file = "websockets-14.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ab95d357cd471df61873dadf66dd05dd4709cae001dd6342edafc8dc6382f307"},
|
||||
{file = "websockets-14.2-cp313-cp313-win32.whl", hash = "sha256:a9e72fb63e5f3feacdcf5b4ff53199ec8c18d66e325c34ee4c551ca748623bbc"},
|
||||
{file = "websockets-14.2-cp313-cp313-win_amd64.whl", hash = "sha256:b439ea828c4ba99bb3176dc8d9b933392a2413c0f6b149fdcba48393f573377f"},
|
||||
{file = "websockets-14.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7cd5706caec1686c5d233bc76243ff64b1c0dc445339bd538f30547e787c11fe"},
|
||||
{file = "websockets-14.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ec607328ce95a2f12b595f7ae4c5d71bf502212bddcea528290b35c286932b12"},
|
||||
{file = "websockets-14.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:da85651270c6bfb630136423037dd4975199e5d4114cae6d3066641adcc9d1c7"},
|
||||
{file = "websockets-14.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3ecadc7ce90accf39903815697917643f5b7cfb73c96702318a096c00aa71f5"},
|
||||
{file = "websockets-14.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1979bee04af6a78608024bad6dfcc0cc930ce819f9e10342a29a05b5320355d0"},
|
||||
{file = "websockets-14.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dddacad58e2614a24938a50b85969d56f88e620e3f897b7d80ac0d8a5800258"},
|
||||
{file = "websockets-14.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:89a71173caaf75fa71a09a5f614f450ba3ec84ad9fca47cb2422a860676716f0"},
|
||||
{file = "websockets-14.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:6af6a4b26eea4fc06c6818a6b962a952441e0e39548b44773502761ded8cc1d4"},
|
||||
{file = "websockets-14.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:80c8efa38957f20bba0117b48737993643204645e9ec45512579132508477cfc"},
|
||||
{file = "websockets-14.2-cp39-cp39-win32.whl", hash = "sha256:2e20c5f517e2163d76e2729104abc42639c41cf91f7b1839295be43302713661"},
|
||||
{file = "websockets-14.2-cp39-cp39-win_amd64.whl", hash = "sha256:b4c8cef610e8d7c70dea92e62b6814a8cd24fbd01d7103cc89308d2bfe1659ef"},
|
||||
{file = "websockets-14.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:d7d9cafbccba46e768be8a8ad4635fa3eae1ffac4c6e7cb4eb276ba41297ed29"},
|
||||
{file = "websockets-14.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:c76193c1c044bd1e9b3316dcc34b174bbf9664598791e6fb606d8d29000e070c"},
|
||||
{file = "websockets-14.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd475a974d5352390baf865309fe37dec6831aafc3014ffac1eea99e84e83fc2"},
|
||||
{file = "websockets-14.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2c6c0097a41968b2e2b54ed3424739aab0b762ca92af2379f152c1aef0187e1c"},
|
||||
{file = "websockets-14.2-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d7ff794c8b36bc402f2e07c0b2ceb4a2424147ed4785ff03e2a7af03711d60a"},
|
||||
{file = "websockets-14.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:dec254fcabc7bd488dab64846f588fc5b6fe0d78f641180030f8ea27b76d72c3"},
|
||||
{file = "websockets-14.2-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:bbe03eb853e17fd5b15448328b4ec7fb2407d45fb0245036d06a3af251f8e48f"},
|
||||
{file = "websockets-14.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a3c4aa3428b904d5404a0ed85f3644d37e2cb25996b7f096d77caeb0e96a3b42"},
|
||||
{file = "websockets-14.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:577a4cebf1ceaf0b65ffc42c54856214165fb8ceeba3935852fc33f6b0c55e7f"},
|
||||
{file = "websockets-14.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ad1c1d02357b7665e700eca43a31d52814ad9ad9b89b58118bdabc365454b574"},
|
||||
{file = "websockets-14.2-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f390024a47d904613577df83ba700bd189eedc09c57af0a904e5c39624621270"},
|
||||
{file = "websockets-14.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3c1426c021c38cf92b453cdf371228d3430acd775edee6bac5a4d577efc72365"},
|
||||
{file = "websockets-14.2-py3-none-any.whl", hash = "sha256:7a6ceec4ea84469f15cf15807a747e9efe57e369c384fa86e022b3bea679b79b"},
|
||||
{file = "websockets-14.2.tar.gz", hash = "sha256:5059ed9c54945efb321f097084b4c7e52c246f2c869815876a69d1efc4ad6eb5"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||
@@ -4,4 +4,9 @@
|
||||
|
||||
# cannot import ArchivingOrchestrator/Config to avoid circular dep
|
||||
# from .orchestrator import ArchivingOrchestrator
|
||||
# from .config import Config
|
||||
# from .config import Config
|
||||
|
||||
from .media import Media
|
||||
from .step import Step
|
||||
from .context import ArchivingContext
|
||||
from .metadata import Metadata
|
||||
|
||||
@@ -190,7 +190,6 @@ class ArchivingOrchestrator:
|
||||
|
||||
yaml_config = read_yaml(basic_config.config_file)
|
||||
|
||||
breakpoint()
|
||||
self.setup_complete_parser(basic_config, yaml_config, unused_args)
|
||||
|
||||
self.install_modules()
|
||||
|
||||
@@ -3,8 +3,3 @@
|
||||
|
||||
"""
|
||||
from .database import Database
|
||||
from .gsheet_db.gsheet_db import GsheetsDb
|
||||
from .console_db.console_db import ConsoleDb
|
||||
from .csv_db.csv_db import CSVDb
|
||||
from .api_db.api_db import AAApiDb
|
||||
from .atlos_db.atlos_db import AtlosDb
|
||||
@@ -1,70 +0,0 @@
|
||||
from typing import Union
|
||||
import requests, os
|
||||
from loguru import logger
|
||||
|
||||
from .. import Database
|
||||
from ...core import Metadata
|
||||
|
||||
|
||||
class AAApiDb(Database):
    """
    Connects to auto-archiver-api instance

    Before archiving, `fetch` can ask the API whether the URL was already
    archived; after archiving, `done` can submit the result to the API.
    Both calls authenticate with the configured bearer token.
    """
    name = "auto_archiver_api_db"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
        self.allow_rearchive = bool(self.allow_rearchive)
        self.store_results = bool(self.store_results)
        self.assert_valid_string("api_endpoint")

    @staticmethod
    def configs() -> dict:
        """Return the configuration options of this database and their defaults."""
        return {
            "api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
            "api_token": {"default": None, "help": "API Bearer token."},
            "public": {"default": False, "help": "whether the URL should be publicly available via the API"},
            "author_id": {"default": None, "help": "which email to assign as author"},
            "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
            "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"},
            "store_results": {"default": True, "help": "when set, will send the results to the API database."},
            "tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
        }

    def _endpoint_url(self, path: str) -> str:
        """Join `path` onto the configured API endpoint with exactly one '/'."""
        # os.path.join is for filesystem paths and would insert a backslash
        # on Windows, so the URL is assembled explicitly instead.
        return f"{self.api_endpoint.rstrip('/')}/{path}"

    @staticmethod
    def _response_body(response):
        """Return the decoded JSON body, or the raw text if it is not JSON."""
        try:
            return response.json()
        except ValueError:
            # error responses are not guaranteed to carry a JSON payload;
            # logging must not raise because of that
            return response.text

    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
        """ query the database for the existence of this item.
            Helps avoid re-archiving the same URL multiple times.

            Returns the most complete previously archived Metadata when the
            API reports at least one match, False when there is no match or
            the request failed, and None when the lookup is skipped because
            re-archiving is allowed.
        """
        # The lookup only matters when re-archiving is NOT allowed (see the
        # "allow_rearchive" help text); the previous guard was inverted.
        if self.allow_rearchive:
            return None

        params = {"url": item.get_url(), "limit": 15}
        headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
        response = requests.get(self._endpoint_url("tasks/search-url"), params=params, headers=headers)

        if response.status_code == 200:
            # decode the body once and reuse it
            results = response.json()
            if len(results):
                logger.success(f"API returned {len(results)} previously archived instance(s)")
                fetched_metadata = [Metadata.from_dict(r["result"]) for r in results]
                return Metadata.choose_most_complete(fetched_metadata)
        else:
            logger.error(f"AA API FAIL ({response.status_code}): {self._response_body(response)}")
        return False

    def done(self, item: Metadata, cached: bool=False) -> None:
        """archival result ready - should be saved to DB

        Nothing is sent when `store_results` is disabled or when the result
        came from a cache.
        """
        if not self.store_results:
            return
        if cached:
            logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
            return
        logger.debug(f"saving archive of {item.get_url()} to the AA API.")

        # tags may be a set (see the "cli_set" handler), which is not JSON serializable
        payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
        headers = {"Authorization": f"Bearer {self.api_token}"}
        response = requests.post(self._endpoint_url("submit-archive"), json=payload, headers=headers)

        if response.status_code == 200:
            logger.success(f"AA API: {self._response_body(response)}")
        else:
            logger.error(f"AA API FAIL ({response.status_code}): {self._response_body(response)}")
|
||||
|
||||
@@ -1,79 +0,0 @@
|
||||
import os
|
||||
from typing import Union
|
||||
from loguru import logger
|
||||
from csv import DictWriter
|
||||
from dataclasses import asdict
|
||||
import requests
|
||||
|
||||
from .. import Database
|
||||
from ...core import Metadata
|
||||
from ...utils import get_atlos_config_options
|
||||
|
||||
|
||||
class AtlosDb(Database):
    """
    Outputs results to Atlos

    Success and failure are reported by POSTing to the source material's
    `auto_archiver` metadata endpoint; items without an `atlos_id` in their
    metadata are skipped.
    """

    name = "atlos_db"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """Return the shared Atlos configuration options."""
        return get_atlos_config_options()

    def failed(self, item: Metadata, reason: str) -> None:
        """Report an archiving failure, with its reason, to Atlos."""
        atlos_id = item.metadata.get("atlos_id")
        # If the item has no Atlos ID, there's nothing for us to do
        if not atlos_id:
            logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
            return

        endpoint = f"{self.atlos_url}/api/v2/source_material/metadata/{atlos_id}/auto_archiver"
        auth_headers = {"Authorization": f"Bearer {self.api_token}"}
        body = {"metadata": {"processed": True, "status": "error", "error": reason}}

        response = requests.post(endpoint, headers=auth_headers, json=body)
        # surface HTTP errors instead of silently logging a success
        response.raise_for_status()
        logger.info(
            f"Stored failure for {item.get_url()} (ID {atlos_id}) on Atlos: {reason}"
        )

    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
        """Always report that the item was not archived before (returns False)."""
        return False

    def _process_metadata(self, item: Metadata) -> dict:
        """Return a copy of the item's metadata for storage on Atlos.

        Any value exposing `isoformat` (e.g. datetime objects) is replaced by
        its ISO-format string; every other value is passed through unchanged.
        """
        processed = {}
        for key, value in item.metadata.items():
            processed[key] = value.isoformat() if hasattr(value, "isoformat") else value
        return processed

    def done(self, item: Metadata, cached: bool = False) -> None:
        """Report a successful archive, including its metadata, to Atlos."""
        atlos_id = item.metadata.get("atlos_id")
        if not atlos_id:
            logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
            return

        endpoint = f"{self.atlos_url}/api/v2/source_material/metadata/{atlos_id}/auto_archiver"
        auth_headers = {"Authorization": f"Bearer {self.api_token}"}
        body = {
            "metadata": {
                "processed": True,
                "status": "success",
                "results": self._process_metadata(item),
            }
        }

        response = requests.post(endpoint, headers=auth_headers, json=body)
        response.raise_for_status()

        logger.info(
            f"Stored success for {item.get_url()} (ID {atlos_id}) on Atlos"
        )
|
||||
@@ -1,32 +0,0 @@
|
||||
from loguru import logger
|
||||
|
||||
from .. import Database
|
||||
from ...core import Metadata
|
||||
|
||||
|
||||
class ConsoleDb(Database):
    """
    Outputs results to the console

    Each lifecycle hook emits one log line through loguru.
    """
    name = "console_db"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """This database has no configuration options."""
        return dict()

    def started(self, item: Metadata) -> None:
        """Log that archiving of the item has started."""
        message = f"STARTED {item}"
        logger.warning(message)

    def failed(self, item: Metadata, reason:str) -> None:
        """Log that archiving of the item failed, together with the reason."""
        message = f"FAILED {item}: {reason}"
        logger.error(message)

    def aborted(self, item: Metadata) -> None:
        """Log that archiving of the item was aborted."""
        message = f"ABORTED {item}"
        logger.warning(message)

    def done(self, item: Metadata, cached: bool=False) -> None:
        """archival result ready - log it (the `cached` flag is not used here)"""
        message = f"DONE {item}"
        logger.success(message)
|
||||
@@ -1,34 +0,0 @@
|
||||
import os
|
||||
from loguru import logger
|
||||
from csv import DictWriter
|
||||
from dataclasses import asdict
|
||||
|
||||
from .. import Database
|
||||
from ...core import Metadata
|
||||
|
||||
|
||||
class CSVDb(Database):
    """
    Outputs results to a CSV file

    Every finished item is appended as one row; the header row is written
    only when the target file does not exist yet or is empty.
    """
    name = "csv_db"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
        self.assert_valid_string("csv_file")

    @staticmethod
    def configs() -> dict:
        """Return the configuration options accepted by this database."""
        return {
            "csv_file": {"default": "db.csv", "help": "CSV file name"}
        }

    def done(self, item: Metadata, cached: bool=False) -> None:
        """archival result ready - should be saved to DB

        Appends the item (as a dict of its dataclass fields) to `csv_file`.
        """
        logger.success(f"DONE {item}")
        # a header is needed only for a brand-new or empty file
        is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0
        # newline="" is required by the csv module: without it the writer's own
        # "\r\n" terminators get translated again and produce blank rows on Windows
        with open(self.csv_file, "a", encoding="utf-8", newline="") as outf:
            # the column names are the field names of an empty Metadata instance
            writer = DictWriter(outf, fieldnames=list(asdict(Metadata())))
            if is_empty:
                writer.writeheader()
            writer.writerow(asdict(item))
|
||||
@@ -1,112 +0,0 @@
|
||||
from typing import Union, Tuple
|
||||
import datetime
|
||||
from urllib.parse import quote
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from .. import Database
|
||||
from ...core import Metadata, Media, ArchivingContext
|
||||
from ...utils import GWorksheet
|
||||
|
||||
|
||||
class GsheetsDb(Database):
    """
    NB: only works if GsheetFeeder is used.
    could be updated in the future to support non-GsheetFeeder metadata

    The worksheet and row to update are read from the "gsheet" entry of the
    ArchivingContext; status and result cells are written back to that row.
    """
    name = "gsheet_db"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """This database has no configuration options of its own."""
        return {}

    def started(self, item: Metadata) -> None:
        """Mark the item's row as 'Archive in progress'."""
        logger.warning(f"STARTED {item}")
        gw, row = self._retrieve_gsheet(item)
        gw.set_cell(row, 'status', 'Archive in progress')

    def failed(self, item: Metadata, reason:str) -> None:
        """Write the failure reason into the item's status cell (best effort)."""
        # include the reason in the log, as ConsoleDb does
        logger.error(f"FAILED {item}: {reason}")
        self._safe_status_update(item, f'Archive failed {reason}')

    def aborted(self, item: Metadata) -> None:
        """Clear the item's status cell (best effort)."""
        logger.warning(f"ABORTED {item}")
        self._safe_status_update(item, '')

    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
        """check if the given item has been archived already"""
        return False

    def done(self, item: Metadata, cached: bool=False) -> None:
        """archival result ready - should be saved to DB

        Collects all cell updates for the item's row and writes them in a
        single batch. Apart from the status, a cell is only filled in when
        its column exists and the cell is currently empty.
        """
        logger.success(f"DONE {item.get_url()}")
        gw, row = self._retrieve_gsheet(item)

        cell_updates = []
        row_values = gw.get_row(row)

        def batch_if_valid(col, val, final_value=None):
            # `val` decides whether to write; `final_value` (defaulting to
            # `val`) is what actually gets written
            final_value = final_value or val
            try:
                if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
                    cell_updates.append((row, col, final_value))
            except Exception as e:
                logger.error(f"Unable to batch {col}={final_value} due to {e}")

        status_message = item.status
        if cached:
            status_message = f"[cached] {status_message}"
        # the status is always overwritten, even when the cell is not empty
        cell_updates.append((row, 'status', status_message))

        media: Media = item.get_final_media()
        if hasattr(media, "urls"):
            batch_if_valid('archive', "\n".join(media.urls))
        # now(timezone.utc) is already timezone-aware, no tzinfo replace needed
        batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).isoformat())
        batch_if_valid('title', item.get_title())
        batch_if_valid('text', item.get("content", ""))
        batch_if_valid('timestamp', item.get_timestamp())
        if media:
            batch_if_valid('hash', media.get("hash", "not-calculated"))

        # merge all pdq hashes into a single string, if present
        pdq_hashes = [pdq for m in item.get_all_media() if (pdq := m.get("pdq_hash"))]
        if pdq_hashes:
            batch_if_valid('pdq_hash', ",".join(pdq_hashes))

        if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
            batch_if_valid('screenshot', "\n".join(screenshot.urls))

        if (thumbnail := item.get_first_image("thumbnail")):
            if hasattr(thumbnail, "urls"):
                batch_if_valid('thumbnail', f'=IMAGE("{thumbnail.urls[0]}")')

        if (browsertrix := item.get_media_by_id("browsertrix")):
            batch_if_valid('wacz', "\n".join(browsertrix.urls))
            batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls]))

        gw.batch_set_cell(cell_updates)

    def _safe_status_update(self, item: Metadata, new_status: str) -> None:
        """Set the status cell, logging instead of raising on any failure."""
        try:
            gw, row = self._retrieve_gsheet(item)
            gw.set_cell(row, 'status', new_status)
        except Exception as e:
            logger.debug(f"Unable to update sheet: {e}")

    def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
        """Return the worksheet and row number stored in the ArchivingContext.

        Raises:
            ValueError: if the context holds no "gsheet" entry, i.e. the item
                was not produced by the Google Sheets feeder.
        """
        # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
        gsheet = ArchivingContext.get("gsheet")
        if not gsheet:
            # fail with an explicit message rather than returning unbound locals
            raise ValueError(
                "No 'gsheet' entry found in the ArchivingContext: "
                "gsheet_db currently only works together with the gsheet feeder"
            )
        gw: GWorksheet = gsheet.get("worksheet")
        row: int = gsheet.get("row")
        return gw, row
|
||||
@@ -2,8 +2,8 @@ from typing import Union
|
||||
import requests, os
|
||||
from loguru import logger
|
||||
|
||||
from . import Database
|
||||
from ..core import Metadata
|
||||
from auto_archiver.databases import Database
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
class AAApiDb(Database):
|
||||
@@ -19,18 +19,7 @@ class AAApiDb(Database):
|
||||
self.store_results = bool(self.store_results)
|
||||
self.assert_valid_string("api_endpoint")
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
|
||||
"api_token": {"default": None, "help": "API Bearer token."},
|
||||
"public": {"default": False, "help": "whether the URL should be publicly available via the API"},
|
||||
"author_id": {"default": None, "help": "which email to assign as author"},
|
||||
"group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
|
||||
"allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"},
|
||||
"store_results": {"default": True, "help": "when set, will send the results to the API database."},
|
||||
"tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
|
||||
}
|
||||
|
||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
||||
""" query the database for the existence of this item.
|
||||
Helps avoid re-archiving the same URL multiple times.
|
||||
@@ -7,7 +7,18 @@
|
||||
{"python": ["loguru",
|
||||
""],
|
||||
"bin": [""]},
|
||||
"configs": {},
|
||||
"configs": {
|
||||
"api_token": {
|
||||
"default": None,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"cli_set": lambda cli_val, _: cli_val
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"cli_set": lambda cli_val, _: cli_val
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Handles integration with the Atlos platform for managing archival results.
|
||||
|
||||
@@ -5,9 +5,9 @@ from csv import DictWriter
|
||||
from dataclasses import asdict
|
||||
import requests
|
||||
|
||||
from . import Database
|
||||
from ..core import Metadata
|
||||
from ..utils import get_atlos_config_options
|
||||
from auto_archiver.databases import Database
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.utils import get_atlos_config_options
|
||||
|
||||
|
||||
class AtlosDb(Database):
|
||||
@@ -21,6 +21,7 @@ class AtlosDb(Database):
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
|
||||
# TODO
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return get_atlos_config_options()
|
||||
34
src/auto_archiver/modules/atlos_feeder/__manifest__.py
Normal file
34
src/auto_archiver/modules/atlos_feeder/__manifest__.py
Normal file
@@ -0,0 +1,34 @@
|
||||
{
|
||||
"name": "Atlos Feeder",
|
||||
"type": ["feeder"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "requests"],
|
||||
},
|
||||
"configs": {
|
||||
"api_token": {
|
||||
"default": None,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"cli_set": lambda cli_val, _: cli_val
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"cli_set": lambda cli_val, _: cli_val
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
AtlosFeeder: A feeder module that integrates with the Atlos API to fetch source material URLs for archival.
|
||||
|
||||
### Features
|
||||
- Connects to the Atlos API to retrieve a list of source material URLs.
|
||||
- Filters source materials based on visibility, processing status, and metadata.
|
||||
- Converts filtered source materials into `Metadata` objects with the relevant `atlos_id` and URL.
|
||||
- Iterates through paginated results using a cursor for efficient API interaction.
|
||||
|
||||
### Notes
|
||||
- Requires an Atlos API endpoint and a valid API token for authentication.
|
||||
- Ensures only unprocessed, visible, and ready-to-archive URLs are returned.
|
||||
- Handles pagination transparently when retrieving data from the Atlos API.
|
||||
"""
|
||||
}
|
||||
@@ -1,9 +1,9 @@
|
||||
from loguru import logger
|
||||
import requests
|
||||
|
||||
from . import Feeder
|
||||
from ..core import Metadata, ArchivingContext
|
||||
from ..utils import get_atlos_config_options
|
||||
from auto_archiver.feeders import Feeder
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
from auto_archiver.utils import get_atlos_config_options
|
||||
|
||||
|
||||
class AtlosFeeder(Feeder):
|
||||
@@ -15,6 +15,7 @@ class AtlosFeeder(Feeder):
|
||||
if type(self.api_token) != str:
|
||||
raise Exception("Atlos Feeder did not receive an Atlos API token")
|
||||
|
||||
# TODO
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return get_atlos_config_options()
|
||||
24
src/auto_archiver/modules/cli_feeder/__manifest__.py
Normal file
24
src/auto_archiver/modules/cli_feeder/__manifest__.py
Normal file
@@ -0,0 +1,24 @@
|
||||
{
|
||||
"name": "CLI Feeder",
|
||||
"type": ["feeder"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru"],
|
||||
},
|
||||
"configs": {
|
||||
"urls": {
|
||||
"default": None,
|
||||
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
|
||||
"cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Processes URLs to archive passed via the command line and feeds them into the archiving pipeline.
|
||||
|
||||
### Features
|
||||
- Takes a single URL or a list of URLs provided via the command line.
|
||||
- Converts each URL into a `Metadata` object and yields it for processing.
|
||||
- Ensures URLs are processed only if they are explicitly provided.
|
||||
|
||||
"""
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
from loguru import logger
|
||||
|
||||
from . import Feeder
|
||||
from ..core import Metadata, ArchivingContext
|
||||
from auto_archiver.feeders import Feeder
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
|
||||
|
||||
class CLIFeeder(Feeder):
|
||||
@@ -13,15 +13,15 @@ class CLIFeeder(Feeder):
|
||||
if type(self.urls) != list or len(self.urls) == 0:
|
||||
raise Exception("CLI Feeder did not receive any URL to process")
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"urls": {
|
||||
"default": None,
|
||||
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
|
||||
"cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
|
||||
},
|
||||
}
|
||||
# @staticmethod
|
||||
# def configs() -> dict:
|
||||
# return {
|
||||
# "urls": {
|
||||
# "default": None,
|
||||
# "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
|
||||
# "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
|
||||
# },
|
||||
# }
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
for url in self.urls:
|
||||
@@ -1,7 +1,7 @@
|
||||
from loguru import logger
|
||||
|
||||
from . import Database
|
||||
from ..core import Metadata
|
||||
from auto_archiver.databases import Database
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
class ConsoleDb(Database):
|
||||
@@ -14,10 +14,6 @@ class ConsoleDb(Database):
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {}
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
logger.warning(f"STARTED {item}")
|
||||
|
||||
0
src/auto_archiver/modules/csv_db/__init__.py
Normal file
0
src/auto_archiver/modules/csv_db/__init__.py
Normal file
@@ -3,8 +3,8 @@ from loguru import logger
|
||||
from csv import DictWriter
|
||||
from dataclasses import asdict
|
||||
|
||||
from . import Database
|
||||
from ..core import Metadata
|
||||
from auto_archiver.databases import Database
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
class CSVDb(Database):
|
||||
@@ -18,11 +18,6 @@ class CSVDb(Database):
|
||||
super().__init__(config)
|
||||
self.assert_valid_string("csv_file")
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"csv_file": {"default": "db.csv", "help": "CSV file name"}
|
||||
}
|
||||
|
||||
def done(self, item: Metadata, cached: bool=False) -> None:
|
||||
"""archival result ready - should be saved to DB"""
|
||||
0
src/auto_archiver/modules/csv_feeder/__init__.py
Normal file
0
src/auto_archiver/modules/csv_feeder/__init__.py
Normal file
33
src/auto_archiver/modules/csv_feeder/__manifest__.py
Normal file
33
src/auto_archiver/modules/csv_feeder/__manifest__.py
Normal file
@@ -0,0 +1,33 @@
|
||||
{
|
||||
"name": "CSV Feeder",
|
||||
"type": ["feeder"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru"],
|
||||
"bin": [""]
|
||||
},
|
||||
"configs": {
|
||||
"files": {
|
||||
"default": None,
|
||||
"help": "Path to the input file(s) to read the URLs from, comma separated. \
|
||||
Input files should be formatted with one URL per line",
|
||||
"cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
|
||||
},
|
||||
"column": {
|
||||
"default": None,
|
||||
"help": "Column number or name to read the URLs from, 0-indexed",
|
||||
}
|
||||
},
|
||||
"description": """
|
||||
Reads URLs from CSV files and feeds them into the archiving process.
|
||||
|
||||
### Features
|
||||
- Supports reading URLs from multiple input files, specified as a comma-separated list.
|
||||
- Allows specifying the column number or name to extract URLs from.
|
||||
- Skips header rows if the first value is not a valid URL.
|
||||
- Integrates with the `ArchivingContext` to manage URL feeding.
|
||||
|
||||
### Setup
|
||||
- Input files should be formatted with one URL per line.
|
||||
"""
|
||||
}
|
||||
@@ -1,12 +1,15 @@
|
||||
from loguru import logger
|
||||
import csv
|
||||
|
||||
from . import Feeder
|
||||
from ..core import Metadata, ArchivingContext
|
||||
from ..utils import url_or_none
|
||||
from auto_archiver.feeders import Feeder
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
from auto_archiver.utils import url_or_none
|
||||
|
||||
class CSVFeeder(Feeder):
|
||||
|
||||
name = "csv_feeder"
|
||||
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
0
src/auto_archiver/modules/gsheet_db/__init__.py
Normal file
0
src/auto_archiver/modules/gsheet_db/__init__.py
Normal file
@@ -4,9 +4,9 @@ from urllib.parse import quote
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from . import Database
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
from ..utils import GWorksheet
|
||||
from auto_archiver.databases import Database
|
||||
from auto_archiver.core import Metadata, Media, ArchivingContext
|
||||
from auto_archiver.utils import GWorksheet
|
||||
|
||||
|
||||
class GsheetsDb(Database):
|
||||
@@ -20,10 +20,6 @@ class GsheetsDb(Database):
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {}
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
logger.warning(f"STARTED {item}")
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
0
src/auto_archiver/modules/gsheet_feeder/__init__.py
Normal file
0
src/auto_archiver/modules/gsheet_feeder/__init__.py
Normal file
40
src/auto_archiver/modules/gsheet_feeder/__manifest__.py
Normal file
40
src/auto_archiver/modules/gsheet_feeder/__manifest__.py
Normal file
@@ -0,0 +1,40 @@
|
||||
{
|
||||
"name": "Google Sheets Feeder",
|
||||
"type": ["feeder"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "gspread", "python-slugify"],
|
||||
},
|
||||
"configs": {
|
||||
"allow_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
|
||||
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) explicitly block some worksheets from being processed",
|
||||
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": True,
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
}
|
||||
},
|
||||
"description": """
|
||||
GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver.
|
||||
|
||||
This reads data from Google Sheets and filters rows based on user-defined rules.
|
||||
The filtered rows are processed into `Metadata` objects.
|
||||
|
||||
### Features
|
||||
- Validates the sheet structure and filters rows based on input configurations.
|
||||
- Processes only worksheets allowed by the `allow_worksheets` and `block_worksheets` configurations.
|
||||
- Ensures only rows with valid URLs and unprocessed statuses are included for archival.
|
||||
- Supports organizing stored files into folder paths based on sheet and worksheet names.
|
||||
|
||||
### Notes
|
||||
- Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`.
|
||||
- Create the sheet using the template provided in the docs.
|
||||
"""
|
||||
}
|
||||
@@ -14,9 +14,9 @@ from loguru import logger
|
||||
from slugify import slugify
|
||||
|
||||
# from . import Enricher
|
||||
from . import Feeder
|
||||
from ..core import Metadata, ArchivingContext
|
||||
from ..utils import Gsheets, GWorksheet
|
||||
from auto_archiver.feeders import Feeder
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
from auto_archiver.utils import Gsheets, GWorksheet
|
||||
|
||||
|
||||
class GsheetsFeeder(Gsheets, Feeder):
|
||||
@@ -27,26 +27,26 @@ class GsheetsFeeder(Gsheets, Feeder):
|
||||
super().__init__(config)
|
||||
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return dict(
|
||||
Gsheets.configs(),
|
||||
** {
|
||||
"allow_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
|
||||
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) explicitly block some worksheets from being processed",
|
||||
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": True,
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
}
|
||||
})
|
||||
# @staticmethod
|
||||
# def configs() -> dict:
|
||||
# return dict(
|
||||
# Gsheets.configs(),
|
||||
# ** {
|
||||
# "allow_worksheets": {
|
||||
# "default": set(),
|
||||
# "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
|
||||
# "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
# },
|
||||
# "block_worksheets": {
|
||||
# "default": set(),
|
||||
# "help": "(CSV) explicitly block some worksheets from being processed",
|
||||
# "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
# },
|
||||
# "use_sheet_names_in_stored_paths": {
|
||||
# "default": True,
|
||||
# "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
# }
|
||||
# })
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
sh = self.open_sheet()
|
||||
0
src/auto_archiver/modules/hash_enricher/__init__.py
Normal file
0
src/auto_archiver/modules/hash_enricher/__init__.py
Normal file
27
src/auto_archiver/modules/hash_enricher/__manifest__.py
Normal file
27
src/auto_archiver/modules/hash_enricher/__manifest__.py
Normal file
@@ -0,0 +1,27 @@
|
||||
{
|
||||
"name": "Hash Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru"],
|
||||
},
|
||||
"configs": {
|
||||
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
|
||||
"chunksize": {"default": int(1.6e7), "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
|
||||
},
|
||||
"description": """
|
||||
Generates cryptographic hashes for media files to ensure data integrity and authenticity.
|
||||
|
||||
### Features
|
||||
- Calculates cryptographic hashes (SHA-256 or SHA3-512) for media files stored in `Metadata` objects.
|
||||
- Ensures content authenticity, integrity validation, and duplicate identification.
|
||||
- Efficiently processes large files by reading file bytes in configurable chunk sizes.
|
||||
- Supports dynamic configuration of hash algorithms and chunk sizes.
|
||||
- Updates media metadata with the computed hash value in the format `<algorithm>:<hash>`.
|
||||
|
||||
### Notes
|
||||
- Default hash algorithm is SHA-256, but SHA3-512 is also supported.
|
||||
- Chunk size defaults to 16 MB but can be adjusted based on memory requirements.
|
||||
- Useful for workflows requiring hash-based content validation or deduplication.
|
||||
""",
|
||||
}
|
||||
@@ -10,8 +10,8 @@ making it suitable for handling large files efficiently.
|
||||
import hashlib
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata, ArchivingContext
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
|
||||
|
||||
class HashEnricher(Enricher):
|
||||
@@ -45,13 +45,6 @@ class HashEnricher(Enricher):
|
||||
|
||||
ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
|
||||
"chunksize": {"default": int(1.6e7), "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
|
||||
}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
|
||||
@@ -8,7 +8,7 @@
|
||||
"retrying",
|
||||
"tqdm",],
|
||||
},
|
||||
"no_setup_required": False,
|
||||
"requires_setup": True,
|
||||
"configs": {
|
||||
"access_token": {"default": None, "help": "a valid instagrapi-api token"},
|
||||
"api_endpoint": {"default": None, "help": "API endpoint to use"},
|
||||
@@ -25,5 +25,22 @@
|
||||
"help": "if true, will remove empty values from the json output",
|
||||
},
|
||||
},
|
||||
"description": "",
|
||||
"description": """
|
||||
Archives various types of Instagram content using the Instagrapi API.
|
||||
|
||||
### Features
|
||||
- Connects to an Instagrapi API deployment to fetch Instagram profiles, posts, stories, highlights, reels, and tagged content.
|
||||
- Supports advanced configuration options, including:
|
||||
- Full profile download (all posts, stories, highlights, and tagged content).
|
||||
- Limiting the number of posts to fetch for large profiles.
|
||||
- Minimising JSON output to remove empty fields and redundant data.
|
||||
- Provides robust error handling and retries for API calls.
|
||||
- Ensures efficient media scraping, including handling nested or carousel media items.
|
||||
- Adds downloaded media and metadata to the result for further processing.
|
||||
|
||||
### Notes
|
||||
- Requires a valid Instagrapi API token (`access_token`) and API endpoint (`api_endpoint`).
|
||||
- Full-profile downloads can be limited by setting `full_profile_max_posts`.
|
||||
- Designed to fetch content in batches for large profiles, minimising API load.
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -45,25 +45,6 @@ class InstagramAPIArchiver(Archiver):
|
||||
self.full_profile = bool(self.full_profile)
|
||||
self.minimize_json_output = bool(self.minimize_json_output)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"access_token": {"default": None, "help": "a valid instagrapi-api token"},
|
||||
"api_endpoint": {"default": None, "help": "API endpoint to use"},
|
||||
"full_profile": {
|
||||
"default": False,
|
||||
"help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.",
|
||||
},
|
||||
"full_profile_max_posts": {
|
||||
"default": 0,
|
||||
"help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights",
|
||||
},
|
||||
"minimize_json_output": {
|
||||
"default": True,
|
||||
"help": "if true, will remove empty values from the json output",
|
||||
},
|
||||
}
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
|
||||
@@ -3,10 +3,12 @@
|
||||
"type": ["extractor"],
|
||||
"entry_point": "instagram_archiver:InstagramArchiver",
|
||||
"external_dependencies": {
|
||||
"python": ["instaloader",
|
||||
"loguru",],
|
||||
"python": [
|
||||
"instaloader",
|
||||
"loguru",
|
||||
],
|
||||
},
|
||||
"no_setup_required": False,
|
||||
"requires_setup": True,
|
||||
"configs": {
|
||||
"username": {"default": None, "help": "a valid Instagram username"},
|
||||
"password": {
|
||||
|
||||
@@ -45,16 +45,7 @@ class InstagramArchiver(Archiver):
|
||||
except Exception as e2:
|
||||
logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}")
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"username": {"default": None, "help": "a valid Instagram username"},
|
||||
"password": {"default": None, "help": "the corresponding Instagram account password"},
|
||||
"download_folder": {"default": "instaloader", "help": "name of a folder to temporarily download content to"},
|
||||
"session_file": {"default": "secrets/instaloader.session", "help": "path to the instagram session which saves session credentials"},
|
||||
#TODO: fine-grain
|
||||
# "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
|
||||
}
|
||||
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
@@ -34,15 +34,6 @@ class InstagramTbotArchiver(Archiver):
|
||||
self.assert_valid_string("api_hash")
|
||||
self.timeout = int(self.timeout)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||
"session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
||||
"timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
|
||||
}
|
||||
|
||||
def setup(self) -> None:
|
||||
"""
|
||||
1. makes a copy of session_file that is removed in cleanup
|
||||
|
||||
0
src/auto_archiver/modules/meta_enricher/__init__.py
Normal file
0
src/auto_archiver/modules/meta_enricher/__init__.py
Normal file
22
src/auto_archiver/modules/meta_enricher/__manifest__.py
Normal file
22
src/auto_archiver/modules/meta_enricher/__manifest__.py
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"name": "Archive Metadata Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru"],
|
||||
},
|
||||
"description": """
|
||||
Adds metadata about archive operations, including file sizes and archive duration.
|
||||
To be included at the end of all enrichments.
|
||||
|
||||
### Features
|
||||
- Calculates the total size of all archived media files, storing the result in human-readable and byte formats.
|
||||
- Computes the duration of the archival process, storing the elapsed time in seconds.
|
||||
- Ensures all enrichments are performed only if the `Metadata` object contains valid data.
|
||||
- Adds detailed metadata to provide insights into file sizes and archival performance.
|
||||
|
||||
### Notes
|
||||
- Skips enrichment if no media or metadata is available in the `Metadata` object.
|
||||
- File sizes are calculated using the `os.stat` module, ensuring accurate byte-level reporting.
|
||||
""",
|
||||
}
|
||||
@@ -2,8 +2,8 @@ import datetime
|
||||
import os
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
class MetaEnricher(Enricher):
|
||||
@@ -17,10 +17,6 @@ class MetaEnricher(Enricher):
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
if to_enrich.is_empty():
|
||||
@@ -28,7 +24,7 @@ class MetaEnricher(Enricher):
|
||||
return
|
||||
|
||||
logger.debug(f"calculating archive metadata information for {url=}")
|
||||
|
||||
|
||||
self.enrich_file_sizes(to_enrich)
|
||||
self.enrich_archive_duration(to_enrich)
|
||||
|
||||
@@ -40,10 +36,10 @@ class MetaEnricher(Enricher):
|
||||
media.set("bytes", file_stats.st_size)
|
||||
media.set("size", self.human_readable_bytes(file_stats.st_size))
|
||||
total_size += file_stats.st_size
|
||||
|
||||
|
||||
to_enrich.set("total_bytes", total_size)
|
||||
to_enrich.set("total_size", self.human_readable_bytes(total_size))
|
||||
|
||||
|
||||
|
||||
def human_readable_bytes(self, size: int) -> str:
|
||||
# receives number of bytes and returns human-readable size
|
||||
22
src/auto_archiver/modules/metadata_enricher/__manifest__.py
Normal file
22
src/auto_archiver/modules/metadata_enricher/__manifest__.py
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"name": "Media Metadata Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru"],
|
||||
"bin": ["exiftool"]
|
||||
|
||||
},
|
||||
"description": """
|
||||
Extracts metadata information from files using ExifTool.
|
||||
|
||||
### Features
|
||||
- Uses ExifTool to extract detailed metadata from media files.
|
||||
- Processes file-specific data like camera settings, geolocation, timestamps, and other embedded metadata.
|
||||
- Adds extracted metadata to the corresponding `Media` object within the `Metadata`.
|
||||
|
||||
### Notes
|
||||
- Requires ExifTool to be installed and accessible via the system's PATH.
|
||||
- Skips enrichment for files where metadata extraction fails.
|
||||
"""
|
||||
}
|
||||
@@ -2,8 +2,8 @@ import subprocess
|
||||
import traceback
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
class MetadataEnricher(Enricher):
|
||||
@@ -16,9 +16,6 @@ class MetadataEnricher(Enricher):
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
21
src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py
Normal file
21
src/auto_archiver/modules/pdq_hash_enricher/__manifest__.py
Normal file
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"name": "PDQ Hash Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "pdqhash", "numpy", "Pillow"],
|
||||
},
|
||||
"description": """
|
||||
PDQ Hash Enricher for generating perceptual hashes of media files.
|
||||
|
||||
### Features
|
||||
- Calculates perceptual hashes for image files using the PDQ hashing algorithm.
|
||||
- Enables detection of duplicate or near-duplicate visual content.
|
||||
- Processes images stored in `Metadata` objects, adding computed hashes to the corresponding `Media` entries.
|
||||
- Skips non-image media or files unsuitable for hashing (e.g., corrupted or unsupported formats).
|
||||
|
||||
### Notes
|
||||
- Best used after enrichers like `thumbnail_enricher` or `screenshot_enricher` to ensure images are available.
|
||||
- Uses the `pdqhash` library to compute 256-bit perceptual hashes, which are stored as hexadecimal strings.
|
||||
"""
|
||||
}
|
||||
@@ -16,8 +16,8 @@ import numpy as np
|
||||
from PIL import Image, UnidentifiedImageError
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
class PdqHashEnricher(Enricher):
|
||||
@@ -31,10 +31,6 @@ class PdqHashEnricher(Enricher):
|
||||
# Without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
logger.debug(f"calculating perceptual hashes for {url=}")
|
||||
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"name": "Screenshot Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "selenium"],
|
||||
"bin": ["chromedriver"]
|
||||
},
|
||||
"configs": {
|
||||
"width": {"default": 1280, "help": "width of the screenshots"},
|
||||
"height": {"default": 720, "help": "height of the screenshots"},
|
||||
"timeout": {"default": 60, "help": "timeout for taking the screenshot"},
|
||||
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
|
||||
"http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
|
||||
"save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
|
||||
"print_options": {"default": {}, "help": "options to pass to the pdf printer"}
|
||||
},
|
||||
"description": """
|
||||
Captures screenshots and optionally saves web pages as PDFs using a WebDriver.
|
||||
|
||||
### Features
|
||||
- Takes screenshots of web pages, with configurable width, height, and timeout settings.
|
||||
- Optionally saves pages as PDFs, with additional configuration for PDF printing options.
|
||||
- Bypasses URLs detected as authentication walls.
|
||||
- Integrates seamlessly with the metadata enrichment pipeline, adding screenshots and PDFs as media.
|
||||
|
||||
### Notes
|
||||
- Requires a WebDriver (e.g., ChromeDriver) installed and accessible via the system's PATH.
|
||||
"""
|
||||
}
|
||||
@@ -5,24 +5,30 @@ import base64
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
|
||||
|
||||
from . import Enricher
|
||||
from ..utils import Webdriver, UrlUtil, random_str
|
||||
from ..core import Media, Metadata, ArchivingContext
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.utils import Webdriver, UrlUtil, random_str
|
||||
from auto_archiver.core import Media, Metadata, ArchivingContext
|
||||
|
||||
class ScreenshotEnricher(Enricher):
|
||||
name = "screenshot_enricher"
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"width": {"default": 1280, "help": "width of the screenshots"},
|
||||
"height": {"default": 720, "help": "height of the screenshots"},
|
||||
"timeout": {"default": 60, "help": "timeout for taking the screenshot"},
|
||||
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
|
||||
"http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
|
||||
"save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
|
||||
"print_options": {"default": {}, "help": "options to pass to the pdf printer"}
|
||||
}
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
# TODO?
|
||||
|
||||
|
||||
|
||||
# @staticmethod
|
||||
# def configs() -> dict:
|
||||
# return {
|
||||
# "width": {"default": 1280, "help": "width of the screenshots"},
|
||||
# "height": {"default": 720, "help": "height of the screenshots"},
|
||||
# "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
|
||||
# "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
|
||||
# "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
|
||||
# "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
|
||||
# "print_options": {"default": {}, "help": "options to pass to the pdf printer"}
|
||||
# }
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
0
src/auto_archiver/modules/ssl_enricher/__init__.py
Normal file
0
src/auto_archiver/modules/ssl_enricher/__init__.py
Normal file
22
src/auto_archiver/modules/ssl_enricher/__manifest__.py
Normal file
22
src/auto_archiver/modules/ssl_enricher/__manifest__.py
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"name": "SSL Certificate Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "python-slugify"],
|
||||
},
|
||||
"configs": {
|
||||
"skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"},
|
||||
},
|
||||
"description": """
|
||||
Retrieves SSL certificate information for a domain and stores it as a file.
|
||||
|
||||
### Features
|
||||
- Fetches SSL certificates for domains using the HTTPS protocol.
|
||||
- Stores certificates in PEM format and adds them as media to the metadata.
|
||||
- Skips enrichment if no media has been archived, based on the `skip_when_nothing_archived` configuration.
|
||||
|
||||
### Notes
|
||||
- Requires the target URL to use the HTTPS scheme; other schemes are not supported.
|
||||
"""
|
||||
}
|
||||
@@ -3,8 +3,8 @@ from slugify import slugify
|
||||
from urllib.parse import urlparse
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata, ArchivingContext, Media
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.core import Metadata, ArchivingContext, Media
|
||||
|
||||
|
||||
class SSLEnricher(Enricher):
|
||||
@@ -15,13 +15,7 @@ class SSLEnricher(Enricher):
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
self. skip_when_nothing_archived = bool(self.skip_when_nothing_archived)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"},
|
||||
}
|
||||
self.skip_when_nothing_archived = bool(self.skip_when_nothing_archived)
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
if not to_enrich.media and self.skip_when_nothing_archived: return
|
||||
@@ -16,9 +16,6 @@ class TelegramArchiver(Archiver):
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {}
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
"default": {},
|
||||
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
|
||||
# TODO
|
||||
#"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
|
||||
"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
|
||||
}
|
||||
},
|
||||
"description": """
|
||||
|
||||
@@ -23,20 +23,6 @@ class TelethonArchiver(Archiver):
|
||||
self.assert_valid_string("api_id")
|
||||
self.assert_valid_string("api_hash")
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||
"bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"},
|
||||
"session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
||||
"join_channels": {"default": True, "help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
|
||||
"channel_invites": {
|
||||
"default": {},
|
||||
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
|
||||
"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
|
||||
}
|
||||
}
|
||||
|
||||
def setup(self) -> None:
|
||||
"""
|
||||
|
||||
27
src/auto_archiver/modules/thumbnail_enricher/__manifest__.py
Normal file
27
src/auto_archiver/modules/thumbnail_enricher/__manifest__.py
Normal file
@@ -0,0 +1,27 @@
|
||||
{
|
||||
"name": "Thumbnail Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": False,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "ffmpeg-python"],
|
||||
"bin": ["ffmpeg"]
|
||||
},
|
||||
"configs": {
|
||||
"thumbnails_per_minute": {"default": 60, "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
|
||||
"max_thumbnails": {"default": 16, "help": "limit the number of thumbnails to generate per video, 0 means no limit"},
|
||||
},
|
||||
"description": """
|
||||
Generates thumbnails for video files to provide visual previews.
|
||||
|
||||
### Features
|
||||
- Processes video files and generates evenly distributed thumbnails.
|
||||
- Calculates the number of thumbnails based on video duration, `thumbnails_per_minute`, and `max_thumbnails`.
|
||||
- Distributes thumbnails equally across the video's duration and stores them as media objects.
|
||||
- Adds metadata for each thumbnail, including timestamps and IDs.
|
||||
|
||||
### Notes
|
||||
- Requires `ffmpeg` to be installed and accessible via the system's PATH.
|
||||
- Handles videos without pre-existing duration metadata by probing with `ffmpeg`.
|
||||
- Skips enrichment for non-video media files.
|
||||
"""
|
||||
}
|
||||
@@ -9,9 +9,9 @@ and identify important moments without watching the entire video.
|
||||
import ffmpeg, os
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Media, Metadata, ArchivingContext
|
||||
from ..utils.misc import random_str
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.core import Media, Metadata, ArchivingContext
|
||||
from auto_archiver.utils.misc import random_str
|
||||
|
||||
|
||||
class ThumbnailEnricher(Enricher):
|
||||
@@ -25,13 +25,6 @@ class ThumbnailEnricher(Enricher):
|
||||
super().__init__(config)
|
||||
self.thumbnails_per_second = int(self.thumbnails_per_minute) / 60
|
||||
self.max_thumbnails = int(self.max_thumbnails)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"thumbnails_per_minute": {"default": 60, "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
|
||||
"max_thumbnails": {"default": 16, "help": "limit the number of thumbnails to generate per video, 0 means no limit"},
|
||||
}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
"""
|
||||
@@ -0,0 +1,40 @@
|
||||
{
|
||||
"name": "Timestamping Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
"slugify",
|
||||
"tsp_client",
|
||||
"asn1crypto",
|
||||
"certvalidator",
|
||||
"certifi"
|
||||
],
|
||||
},
|
||||
"configs": {
|
||||
"tsa_urls": {
|
||||
"default": [
|
||||
"http://timestamp.digicert.com",
|
||||
"http://timestamp.identrust.com",
|
||||
"http://timestamp.globalsign.com/tsa/r6advanced1",
|
||||
"http://tss.accv.es:8318/tsa"
|
||||
],
|
||||
"help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
|
||||
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
}
|
||||
},
|
||||
"description": """
|
||||
Generates RFC3161-compliant timestamp tokens using Time Stamp Authorities (TSA) for archived files.
|
||||
|
||||
### Features
|
||||
- Creates timestamp tokens to prove the existence of files at a specific time, useful for legal and authenticity purposes.
|
||||
- Aggregates file hashes into a text file and timestamps the concatenated data.
|
||||
- Uses multiple Time Stamp Authorities (TSAs) to ensure reliability and redundancy.
|
||||
- Validates timestamping certificates against trusted Certificate Authorities (CAs) using the `certifi` trust store.
|
||||
|
||||
### Notes
|
||||
- Should be run after the `hash_enricher` to ensure file hashes are available.
|
||||
- Requires internet access to interact with the configured TSAs.
|
||||
"""
|
||||
}
|
||||
@@ -8,9 +8,9 @@ from certvalidator import CertificateValidator, ValidationContext
|
||||
from asn1crypto import pem
|
||||
import certifi
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata, ArchivingContext, Media
|
||||
from ..archivers import Archiver
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.core import Metadata, ArchivingContext, Media
|
||||
from auto_archiver.archivers import Archiver
|
||||
|
||||
|
||||
class TimestampingEnricher(Enricher):
|
||||
@@ -26,36 +26,36 @@ class TimestampingEnricher(Enricher):
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"tsa_urls": {
|
||||
"default": [
|
||||
# [Adobe Approved Trust List] and [Windows Cert Store]
|
||||
"http://timestamp.digicert.com",
|
||||
"http://timestamp.identrust.com",
|
||||
# "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping
|
||||
# "https://timestamp.sectigo.com", # wait 15 seconds between each request.
|
||||
|
||||
# [Adobe: European Union Trusted Lists].
|
||||
# "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request.
|
||||
|
||||
# [Windows Cert Store]
|
||||
"http://timestamp.globalsign.com/tsa/r6advanced1",
|
||||
|
||||
# [Adobe: European Union Trusted Lists] and [Windows Cert Store]
|
||||
# "http://ts.quovadisglobal.com/eu", # not valid for timestamping
|
||||
# "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain
|
||||
# "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain
|
||||
# "http://tsa.sep.bg", # self-signed certificate in certificate chain
|
||||
# "http://tsa.izenpe.com", #unable to get local issuer certificate
|
||||
# "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate
|
||||
"http://tss.accv.es:8318/tsa",
|
||||
],
|
||||
"help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
|
||||
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
}
|
||||
}
|
||||
# @staticmethod
|
||||
# def configs() -> dict:
|
||||
# return {
|
||||
# "tsa_urls": {
|
||||
# "default": [
|
||||
# # [Adobe Approved Trust List] and [Windows Cert Store]
|
||||
# "http://timestamp.digicert.com",
|
||||
# "http://timestamp.identrust.com",
|
||||
# # "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping
|
||||
# # "https://timestamp.sectigo.com", # wait 15 seconds between each request.
|
||||
#
|
||||
# # [Adobe: European Union Trusted Lists].
|
||||
# # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request.
|
||||
#
|
||||
# # [Windows Cert Store]
|
||||
# "http://timestamp.globalsign.com/tsa/r6advanced1",
|
||||
#
|
||||
# # [Adobe: European Union Trusted Lists] and [Windows Cert Store]
|
||||
# # "http://ts.quovadisglobal.com/eu", # not valid for timestamping
|
||||
# # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain
|
||||
# # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain
|
||||
# # "http://tsa.sep.bg", # self-signed certificate in certificate chain
|
||||
# # "http://tsa.izenpe.com", #unable to get local issuer certificate
|
||||
# # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate
|
||||
# "http://tss.accv.es:8318/tsa",
|
||||
# ],
|
||||
# "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
|
||||
# "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
# }
|
||||
# }
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
@@ -12,7 +12,8 @@
|
||||
},
|
||||
"configs": {
|
||||
"bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
|
||||
"bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line"},
|
||||
"bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line",
|
||||
"cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))},
|
||||
"consumer_key": {"default": None, "help": "twitter API consumer_key"},
|
||||
"consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
|
||||
"access_token": {"default": None, "help": "twitter API access_token"},
|
||||
|
||||
@@ -34,17 +34,6 @@ class TwitterApiArchiver(Archiver):
|
||||
access_token=self.access_token, access_secret=self.access_secret))
|
||||
assert self.api_client is not None, "Missing Twitter API configurations, please provide either AND/OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver, you can provide both for better rate-limit results."
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
|
||||
"bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line", "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))},
|
||||
"consumer_key": {"default": None, "help": "twitter API consumer_key"},
|
||||
"consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
|
||||
"access_token": {"default": None, "help": "twitter API access_token"},
|
||||
"access_secret": {"default": None, "help": "twitter API access_secret"},
|
||||
}
|
||||
|
||||
@property # getter .api_client
|
||||
def api_client(self) -> str:
|
||||
return self.apis[self.api_index]
|
||||
|
||||
@@ -19,14 +19,6 @@ class VkArchiver(Archiver):
|
||||
self.assert_valid_string("password")
|
||||
self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"username": {"default": None, "help": "valid VKontakte username"},
|
||||
"password": {"default": None, "help": "valid VKontakte password"},
|
||||
"session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
|
||||
}
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
|
||||
0
src/auto_archiver/modules/wacz_enricher/__init__.py
Normal file
0
src/auto_archiver/modules/wacz_enricher/__init__.py
Normal file
39
src/auto_archiver/modules/wacz_enricher/__manifest__.py
Normal file
39
src/auto_archiver/modules/wacz_enricher/__manifest__.py
Normal file
@@ -0,0 +1,39 @@
|
||||
{
|
||||
"name": "WACZ Enricher",
|
||||
"type": ["enricher", "archiver"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": [
|
||||
"loguru",
|
||||
"jsonlines",
|
||||
"warcio"
|
||||
],
|
||||
# TODO?
|
||||
"bin": [
|
||||
"docker"
|
||||
]
|
||||
},
|
||||
"configs": {
|
||||
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
|
||||
"docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
|
||||
"timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
|
||||
"extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
|
||||
"extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
|
||||
"socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
|
||||
"socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
|
||||
"proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
|
||||
},
|
||||
"description": """
|
||||
Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving.
|
||||
|
||||
### Features
|
||||
- Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`.
|
||||
- Supports custom profiles for archiving private or dynamic content.
|
||||
- Extracts media (images, videos, audio) and screenshots from the archive, optionally adding them to the enrichment pipeline.
|
||||
- Generates metadata from the archived page's content and structure (e.g., titles, text).
|
||||
|
||||
### Notes
|
||||
- Requires Docker for running `browsertrix-crawler` unless explicitly disabled.
|
||||
- Configurable via parameters for timeout, media extraction, screenshots, and proxy settings.
|
||||
"""
|
||||
}
|
||||
@@ -5,10 +5,10 @@ from zipfile import ZipFile
|
||||
from loguru import logger
|
||||
from warcio.archiveiterator import ArchiveIterator
|
||||
|
||||
from ..core import Media, Metadata, ArchivingContext
|
||||
from . import Enricher
|
||||
from ..archivers import Archiver
|
||||
from ..utils import UrlUtil, random_str
|
||||
from auto_archiver.core import Media, Metadata, ArchivingContext
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.archivers import Archiver
|
||||
from auto_archiver.utils import UrlUtil, random_str
|
||||
|
||||
|
||||
class WaczArchiverEnricher(Enricher, Archiver):
|
||||
@@ -24,19 +24,6 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
|
||||
"docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
|
||||
"timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
|
||||
"extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
|
||||
"extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
|
||||
"socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
|
||||
"socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
|
||||
"proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
|
||||
}
|
||||
|
||||
def setup(self) -> None:
|
||||
self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
|
||||
self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER')
|
||||
29
src/auto_archiver/modules/wayback_enricher/__manifest__.py
Normal file
29
src/auto_archiver/modules/wayback_enricher/__manifest__.py
Normal file
@@ -0,0 +1,29 @@
|
||||
{
|
||||
"name": "Wayback Machine Enricher",
|
||||
"type": ["enricher", "archiver"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "requests"],
|
||||
},
|
||||
"configs": {
|
||||
"timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback; if more time than this passes, the result contains the job_id so the status can be checked manually later."},
|
||||
"if_not_archived_within": {"default": None, "help": "only tell wayback to archive if no archive has been made within the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA"},
|
||||
"key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
|
||||
"secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"},
|
||||
"proxy_http": {"default": None, "help": "http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port"},
|
||||
"proxy_https": {"default": None, "help": "https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port"},
|
||||
},
|
||||
"description": """
|
||||
Submits the current URL to the Wayback Machine for archiving and returns either a job ID or the completed archive URL.
|
||||
|
||||
### Features
|
||||
- Archives URLs using the Internet Archive's Wayback Machine API.
|
||||
- Supports conditional archiving based on the existence of prior archives within a specified time range.
|
||||
- Supports routing Wayback requests through configurable HTTP and HTTPS proxies.
|
||||
- Fetches and confirms the archive URL or provides a job ID for later status checks.
|
||||
|
||||
### Notes
|
||||
- Requires a valid Wayback Machine API key and secret.
|
||||
- Handles rate-limiting by Wayback Machine and retries status checks with exponential backoff.
|
||||
"""
|
||||
}
|
||||
@@ -2,10 +2,10 @@ import json
|
||||
from loguru import logger
|
||||
import time, requests
|
||||
|
||||
from . import Enricher
|
||||
from ..archivers import Archiver
|
||||
from ..utils import UrlUtil
|
||||
from ..core import Metadata
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.archivers import Archiver
|
||||
from auto_archiver.utils import UrlUtil
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
class WaybackArchiverEnricher(Enricher, Archiver):
|
||||
"""
|
||||
@@ -21,17 +21,6 @@ class WaybackArchiverEnricher(Enricher, Archiver):
|
||||
assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API key"
|
||||
assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API secret"
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."},
|
||||
"if_not_archived_within": {"default": None, "help": "only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA"},
|
||||
"key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
|
||||
"secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"},
|
||||
"proxy_http": {"default": None, "help": "http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port"},
|
||||
"proxy_https": {"default": None, "help": "https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port"},
|
||||
}
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
# this new Metadata object is required to avoid duplication
|
||||
result = Metadata()
|
||||
30
src/auto_archiver/modules/whisper_enricher/__manifest__.py
Normal file
30
src/auto_archiver/modules/whisper_enricher/__manifest__.py
Normal file
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"name": "Whisper Enricher",
|
||||
"type": ["enricher"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "requests"],
|
||||
},
|
||||
"configs": {
|
||||
"api_endpoint": {"default": None, "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
|
||||
"api_key": {"default": None, "help": "WhisperApi api key for authentication"},
|
||||
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
|
||||
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
|
||||
"action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]},
|
||||
},
|
||||
"description": """
|
||||
Integrates with a Whisper API service to transcribe, translate, or detect the language of audio and video files.
|
||||
|
||||
### Features
|
||||
- Submits audio or video files to a Whisper API deployment for processing.
|
||||
- Supports operations such as transcription, translation, and language detection.
|
||||
- Optionally generates SRT subtitle files for video content.
|
||||
- Integrates with S3-compatible storage systems to make files publicly accessible for processing.
|
||||
- Handles job submission, status checking, artifact retrieval, and cleanup.
|
||||
|
||||
### Notes
|
||||
- Requires a Whisper API endpoint and API key for authentication.
|
||||
- Requires S3 storage so that media files are publicly accessible to the Whisper service being called.
|
||||
- Handles multiple jobs and retries for failed or incomplete processing.
|
||||
"""
|
||||
}
|
||||
@@ -2,9 +2,9 @@ import traceback
|
||||
import requests, time
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
from ..storages import S3Storage
|
||||
from auto_archiver.enrichers import Enricher
|
||||
from auto_archiver.core import Metadata, Media, ArchivingContext
|
||||
from auto_archiver.storages import S3Storage
|
||||
|
||||
|
||||
class WhisperEnricher(Enricher):
|
||||
@@ -22,17 +22,6 @@ class WhisperEnricher(Enricher):
|
||||
assert type(self.api_key) == str and len(self.api_key) > 0, "please provide a value for the whisper_enricher api_key"
|
||||
self.timeout = int(self.timeout)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"api_endpoint": {"default": None, "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
|
||||
"api_key": {"default": None, "help": "WhisperApi api key for authentication"},
|
||||
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
|
||||
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
|
||||
"action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]},
|
||||
|
||||
}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
if not self._get_s3_storage():
|
||||
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
|
||||
@@ -1,5 +1,5 @@
|
||||
|
||||
from auto_archiver.databases.csv_db import CSVDb
|
||||
from auto_archiver.modules.csv_db import CSVDb
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import pytest
|
||||
|
||||
from auto_archiver.enrichers.hash_enricher import HashEnricher
|
||||
from auto_archiver.modules.hash_enricher import HashEnricher
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
@pytest.mark.parametrize("algorithm, filename, expected_hash", [
|
||||
|
||||
Reference in New Issue
Block a user