From 282f33eff33166a8100180293d51d26eadc3ef0f Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Fri, 1 Apr 2022 01:30:49 -0500 Subject: [PATCH] implemented deferred media archiving for all scrapers, and implemented tests for them. Refactored archiving methods of Instagram and Gettr scrapers to be able to use default archiving method --- Pipfile | 2 +- Pipfile.lock | 254 +++++++++++-------------- cisticola/scraper/base.py | 19 +- cisticola/scraper/bitchute.py | 10 +- cisticola/scraper/gab.py | 27 ++- cisticola/scraper/gettr.py | 33 ++-- cisticola/scraper/instagram.py | 51 +++-- cisticola/scraper/odysee.py | 28 ++- cisticola/scraper/rumble.py | 28 ++- cisticola/scraper/telegram_snscrape.py | 4 +- cisticola/scraper/twitter.py | 12 +- cisticola/scraper/vkontakte.py | 60 ++++-- cisticola/scraper/youtube.py | 58 +++++- pytest.ini | 7 +- tests/conftest.py | 8 +- tests/scraper/bitchute.py | 7 + tests/scraper/gab.py | 7 + tests/scraper/gettr.py | 7 + tests/scraper/instagram.py | 7 + tests/scraper/odysee.py | 7 + tests/scraper/rumble.py | 7 + tests/scraper/telegram_snscrape.py | 7 + tests/scraper/telegram_telethon.py | 7 + tests/scraper/twitter.py | 7 + tests/scraper/vkontakte.py | 7 + tests/scraper/youtube.py | 7 + 26 files changed, 417 insertions(+), 261 deletions(-) diff --git a/Pipfile b/Pipfile index df21bef..c913ecb 100644 --- a/Pipfile +++ b/Pipfile @@ -36,7 +36,7 @@ sphinx = "*" sphinx_rtd_theme = "*" [requires] -python_version = "3.8" +python_version = "3.9" [pipenv] allow_prereleases = true diff --git a/Pipfile.lock b/Pipfile.lock index ea45b5e..2d95b71 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,11 +1,11 @@ { "_meta": { "hash": { - "sha256": "e57f79178ac0e05f9753a29f97e08d2ae96b7775044bb4c6ba616baae1d21183" + "sha256": "b9fc02f3ecaa2199480c4fcba30f02780860dfbc2e10c026889c78f639709fb4" }, "pipfile-spec": 6, "requires": { - "python_version": "3.8" + "python_version": "3.9" }, "sources": [ { @@ -16,28 +16,6 @@ ] }, "default": { - "backports.zoneinfo": { - 
"hashes": [ - "sha256:17746bd546106fa389c51dbea67c8b7c8f0d14b5526a579ca6ccf5ed72c526cf", - "sha256:1b13e654a55cd45672cb54ed12148cd33628f672548f373963b0bff67b217328", - "sha256:1c5742112073a563c81f786e77514969acb58649bcdf6cdf0b4ed31a348d4546", - "sha256:4a0f800587060bf8880f954dbef70de6c11bbe59c673c3d818921f042f9954a6", - "sha256:5c144945a7752ca544b4b78c8c41544cdfaf9786f25fe5ffb10e838e19a27570", - "sha256:7b0a64cda4145548fed9efc10322770f929b944ce5cee6c0dfe0c87bf4c0c8c9", - "sha256:8439c030a11780786a2002261569bdf362264f605dfa4d65090b64b05c9f79a7", - "sha256:8961c0f32cd0336fb8e8ead11a1f8cd99ec07145ec2931122faaac1c8f7fd987", - "sha256:89a48c0d158a3cc3f654da4c2de1ceba85263fafb861b98b59040a5086259722", - "sha256:a76b38c52400b762e48131494ba26be363491ac4f9a04c1b7e92483d169f6582", - "sha256:da6013fd84a690242c310d77ddb8441a559e9cb3d3d59ebac9aca1a57b2e18bc", - "sha256:e55b384612d93be96506932a786bbcde5a2db7a9e6a4bb4bffe8b733f5b9036b", - "sha256:e81b76cace8eda1fca50e345242ba977f9be6ae3945af8d46326d776b4cf78d1", - "sha256:e8236383a20872c0cdf5a62b554b27538db7fa1bbec52429d8d106effbaeca08", - "sha256:f04e857b59d9d1ccc39ce2da1021d196e47234873820cbeaad210724b1ee28ac", - "sha256:fadbfe37f74051d024037f223b8e001611eac868b5c5b06144ef4d8b799862f2" - ], - "markers": "python_version >= '3.6' and python_version < '3.9'", - "version": "==0.2.1" - }, "beautifulsoup4": { "hashes": [ "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf", @@ -48,19 +26,19 @@ }, "boto3": { "hashes": [ - "sha256:ef210f8e85cdb6d26a38ebad1cfe9cefdef2ab269207e5987653555375a7ef6b", - "sha256:f0af8f4ef5fe6353c794cd3cce627d469a618b58ace7ca75a63cfd719df615ce" + "sha256:35f68b60652bff50e7bc926238443cb578f29f120908bb945e5640e90c6dd53e", + "sha256:7f3f93ee97215862ccd1a216f37deb7d64055c71f826b821805904df7b84ee6a" ], "index": "pypi", - "version": "==1.21.30" + "version": "==1.21.31" }, "botocore": { "hashes": [ - "sha256:af4bdc51eeecbe9fdcdadbed9ad58c5c91380ef30f3560022bbc2ee1d78f0ad6", - 
"sha256:c622751093e3d0bf61343e66d6d06190ef30bf42b1557d5070ca84e9efa06d4b" + "sha256:3bb21e3ee5e4de3ed76bb99b4496a46e9b5c82e7b7fdb62702f11dda1b57b769", + "sha256:424fd94bef86a11f5340dc15eb50602dedec2ecc01c3a25c4fea23a2c8195500" ], "markers": "python_version >= '3.6'", - "version": "==1.24.30" + "version": "==1.24.31" }, "brotli": { "hashes": [ @@ -217,11 +195,11 @@ }, "click": { "hashes": [ - "sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b", - "sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976" + "sha256:24e1a4a9ec5bf6299411369b208c1df2188d9eb8d916302fe6bf03faed227f1e", + "sha256:479707fe14d9ec9a0757618b7a100a0ae4c4e236fac5b7f80ca68028141a1a72" ], "markers": "python_version >= '3.7'", - "version": "==8.1.1" + "version": "==8.1.2" }, "cryptg": { "hashes": [ @@ -324,64 +302,63 @@ }, "greenlet": { "hashes": [ - "sha256:0051c6f1f27cb756ffc0ffbac7d2cd48cb0362ac1736871399a739b2885134d3", - "sha256:00e44c8afdbe5467e4f7b5851be223be68adb4272f44696ee71fe46b7036a711", - "sha256:013d61294b6cd8fe3242932c1c5e36e5d1db2c8afb58606c5a67efce62c1f5fd", - "sha256:049fe7579230e44daef03a259faa24511d10ebfa44f69411d99e6a184fe68073", - "sha256:14d4f3cd4e8b524ae9b8aa567858beed70c392fdec26dbdb0a8a418392e71708", - "sha256:166eac03e48784a6a6e0e5f041cfebb1ab400b394db188c48b3a84737f505b67", - "sha256:17ff94e7a83aa8671a25bf5b59326ec26da379ace2ebc4411d690d80a7fbcf23", - "sha256:1e12bdc622676ce47ae9abbf455c189e442afdde8818d9da983085df6312e7a1", - "sha256:21915eb821a6b3d9d8eefdaf57d6c345b970ad722f856cd71739493ce003ad08", - "sha256:288c6a76705dc54fba69fbcb59904ae4ad768b4c768839b8ca5fdadec6dd8cfd", - "sha256:2bde6792f313f4e918caabc46532aa64aa27a0db05d75b20edfc5c6f46479de2", - "sha256:32ca72bbc673adbcfecb935bb3fb1b74e663d10a4b241aaa2f5a75fe1d1f90aa", - "sha256:356b3576ad078c89a6107caa9c50cc14e98e3a6c4874a37c3e0273e4baf33de8", - "sha256:40b951f601af999a8bf2ce8c71e8aaa4e8c6f78ff8afae7b808aae2dc50d4c40", - 
"sha256:572e1787d1460da79590bf44304abbc0a2da944ea64ec549188fa84d89bba7ab", - "sha256:58df5c2a0e293bf665a51f8a100d3e9956febfbf1d9aaf8c0677cf70218910c6", - "sha256:64e6175c2e53195278d7388c454e0b30997573f3f4bd63697f88d855f7a6a1fc", - "sha256:7227b47e73dedaa513cdebb98469705ef0d66eb5a1250144468e9c3097d6b59b", - "sha256:7418b6bfc7fe3331541b84bb2141c9baf1ec7132a7ecd9f375912eca810e714e", - "sha256:7cbd7574ce8e138bda9df4efc6bf2ab8572c9aff640d8ecfece1b006b68da963", - "sha256:7ff61ff178250f9bb3cd89752df0f1dd0e27316a8bd1465351652b1b4a4cdfd3", - "sha256:833e1551925ed51e6b44c800e71e77dacd7e49181fdc9ac9a0bf3714d515785d", - "sha256:8639cadfda96737427330a094476d4c7a56ac03de7265622fcf4cfe57c8ae18d", - "sha256:8c5d5b35f789a030ebb95bff352f1d27a93d81069f2adb3182d99882e095cefe", - "sha256:8c790abda465726cfb8bb08bd4ca9a5d0a7bd77c7ac1ca1b839ad823b948ea28", - "sha256:8d2f1fb53a421b410751887eb4ff21386d119ef9cde3797bf5e7ed49fb51a3b3", - "sha256:903bbd302a2378f984aef528f76d4c9b1748f318fe1294961c072bdc7f2ffa3e", - "sha256:93f81b134a165cc17123626ab8da2e30c0455441d4ab5576eed73a64c025b25c", - "sha256:95e69877983ea39b7303570fa6760f81a3eec23d0e3ab2021b7144b94d06202d", - "sha256:9633b3034d3d901f0a46b7939f8c4d64427dfba6bbc5a36b1a67364cf148a1b0", - "sha256:97e5306482182170ade15c4b0d8386ded995a07d7cc2ca8f27958d34d6736497", - "sha256:9f3cba480d3deb69f6ee2c1825060177a22c7826431458c697df88e6aeb3caee", - "sha256:aa5b467f15e78b82257319aebc78dd2915e4c1436c3c0d1ad6f53e47ba6e2713", - "sha256:abb7a75ed8b968f3061327c433a0fbd17b729947b400747c334a9c29a9af6c58", - "sha256:aec52725173bd3a7b56fe91bc56eccb26fbdff1386ef123abb63c84c5b43b63a", - "sha256:b11548073a2213d950c3f671aa88e6f83cda6e2fb97a8b6317b1b5b33d850e06", - "sha256:b1692f7d6bc45e3200844be0dba153612103db241691088626a33ff1f24a0d88", - "sha256:b336501a05e13b616ef81ce329c0e09ac5ed8c732d9ba7e3e983fcc1a9e86965", - "sha256:b8c008de9d0daba7b6666aa5bbfdc23dcd78cafc33997c9b7741ff6353bafb7f", - "sha256:b92e29e58bef6d9cfd340c72b04d74c4b4e9f70c9fa7c78b674d1fec18896dc4", 
- "sha256:be5f425ff1f5f4b3c1e33ad64ab994eed12fc284a6ea71c5243fd564502ecbe5", - "sha256:dd0b1e9e891f69e7675ba5c92e28b90eaa045f6ab134ffe70b52e948aa175b3c", - "sha256:e30f5ea4ae2346e62cedde8794a56858a67b878dd79f7df76a0767e356b1744a", - "sha256:e6a36bb9474218c7a5b27ae476035497a6990e21d04c279884eb10d9b290f1b1", - "sha256:e859fcb4cbe93504ea18008d1df98dee4f7766db66c435e4882ab35cf70cac43", - "sha256:eb6ea6da4c787111adf40f697b4e58732ee0942b5d3bd8f435277643329ba627", - "sha256:ec8c433b3ab0419100bd45b47c9c8551248a5aee30ca5e9d399a0b57ac04651b", - "sha256:eff9d20417ff9dcb0d25e2defc2574d10b491bf2e693b4e491914738b7908168", - "sha256:f0214eb2a23b85528310dad848ad2ac58e735612929c8072f6093f3585fd342d", - "sha256:f276df9830dba7a333544bd41070e8175762a7ac20350786b322b714b0e654f5", - "sha256:f3acda1924472472ddd60c29e5b9db0cec629fbe3c5c5accb74d6d6d14773478", - "sha256:f70a9e237bb792c7cc7e44c531fd48f5897961701cdaa06cf22fc14965c496cf", - "sha256:f9d29ca8a77117315101425ec7ec2a47a22ccf59f5593378fc4077ac5b754fce", - "sha256:fa877ca7f6b48054f847b61d6fa7bed5cebb663ebc55e018fda12db09dcc664c", - "sha256:fdcec0b8399108577ec290f55551d926d9a1fa6cad45882093a7a07ac5ec147b" + "sha256:004aed447382d80a56ecc354a6d807f305e6c808714ce6ccbca4839c94fae81d", + "sha256:068d68fad6bd623e29a2d36e74538c9b9d6dc6464931cd27d93da6cfc6a7f242", + "sha256:06fd4075754009c9817c6b4e1dc0af4616de52757b6ca973a81c3c1aadc28257", + "sha256:1004cb542451814b12a4f38e835a47734e2b2c683acbf463d5ae76282a3974cf", + "sha256:10c358633a8b27bfc32d27114ef2ca2ddc9f1f89f1643d1157b85e1fdd695315", + "sha256:115bc25fefbdc692c4483e9ddb9011ccd0251590ed59dbfff0f4eb7050bf99c4", + "sha256:1d987a2579336792f73ae6b106c2f087e32afc8573fbf9566f123ac6d8cfb72f", + "sha256:2128d727fd1e8afba8e68feb2cdcf88c90163b69ddc9707722a3e491c5280720", + "sha256:230132c241fe284f93f2e7b3969e9b22bbd76ef98cf93e382c945d378907f5a4", + "sha256:23558f7bd08a663386c032ab8d302d613d2d02ae0c9758ad410bab6035b58d3d", + 
"sha256:255d520d3e4a5f16883b182e1a94219fe455ab4f50aaaf534bfd6d64ee728397", + "sha256:2a6bc19a728f6f643cfc89b876159a1a25a8f7d8700c013d48a73691f80b4550", + "sha256:379bed346ef8ba0a0e698b3c5975a44d15dd4a5bbff40bbd7fd548b445d5550b", + "sha256:3b12d0866759db93b0a893b4e50a7d7d1681519d2346c26695bb8bb2c652230e", + "sha256:40d491944f69e350e1e8b25f6ca49459824ede1678ec0cd4b5541f41edc06614", + "sha256:471484c7b9d7b7867263051aa81cdeed6e06b455e629a7f05eb91a6cb8bd0836", + "sha256:488c557080557bc01aabb3e1bda7225c68455b853733a8652857ac0d810dad1b", + "sha256:49c2e76e7aa81ba889b3c183e2341af3cc6161ee38852085110ae49d5b5d9a40", + "sha256:52d13ec90236e5935ed6da044e78faa1371d5116cc43fe6d7ca8994dd619ef96", + "sha256:57898c69a253d81f487787bdd538629fabd671fab8a9e31b041ca30965fd9556", + "sha256:5d577eef5beb5730ef01ab39983eb852a97c359b7a546809adf70c409f4b2ecc", + "sha256:6a41987c1474c9158a0c0c96611530a8f299bc547d35bee8add981b8b2534f74", + "sha256:6ae67b7df8db3626af8e042e9c6949cfa27d1a3bbbfdff29e45b72bb6673a650", + "sha256:6c42c27e9d12e8a481aff469ffe8dd4ce0484c354a418470960f760f6ae41e7c", + "sha256:6c4a90c9f6128b4d0905a89930bd325e0491574e5cb453f606bb7094a3197587", + "sha256:6e64518e5833ac2d9359b6d9bd4df2c0cf441a0f3a4eca9e735fbea99009fa70", + "sha256:6fd3a270c23c5b42d86a9c7c6b0229f23ee4a7a4cabdaaa1693ad7a0982d13cb", + "sha256:70db73351e0fcf11a76288c47a0469d9a330bcb2e7618c5eb57432b8caa82403", + "sha256:771f401692046845626cbdf1dd0f04e999413ede0ee9ad39033fe30b5fa2e845", + "sha256:7935026ec61b967cbc6b746c0ca75c1651ea118d7fee4d259cff9e6866153374", + "sha256:7b76b1cac9baac1980210e29145800954e7b42e91ef69c4d695de1cab87ce41f", + "sha256:7e3f37c11b6699b1a1e0fcc0e88829dba4f2866546381b05ab8b3f4db645a823", + "sha256:8370fa65ad421484894f559055f951843754153b72b9bca2ebdc5288efe2e3f0", + "sha256:8ae9c443d44a4e23252632e4d7775f419f992d0df3eff923e23775f5cc551d39", + "sha256:8b31d85f2781e44f1ffaaf7ea07f484e7d42317c677c355fa77b4a1a4bea7394", + "sha256:8b450336b27f3b375cadc474c6704838eaa8dd3ca312aac3bb69d92264a8e638", 
+ "sha256:9ce84357388a76d886febff4e50e321c212ffd3248b590960b2da6e02404a5c9", + "sha256:a23e986fb0ba8e7407286add41fa0d4207be44e3dce1b04789f4757800eca1cf", + "sha256:a81610ee00d0da9cd2c8679479b7791149365b6dfb3971b01b22ee29b04787ce", + "sha256:b4e40444975e5ab0ed3004369209c39a28e084951daaeee4919f164b6b849b14", + "sha256:b66600de16702b9dfa74bea34524b55183a2183e5fd92f20fe6c2fcae550a64c", + "sha256:ba6ee18694d3673796b7a31b7d21254e87e9e43ca5be56f323fd396111255315", + "sha256:bd03837da28293baa39bdfc3cada69e2f8807f423ae06168aa28d2b32c63a6b6", + "sha256:bd2192070f88c0778ae1d68a0980fdece3473498c1db37f3794e3454f91e3ecf", + "sha256:c1f6f1a3cc013012cd1da913c40b13e6d721046a8c8a0ea0cde94069645a75db", + "sha256:ce10a8e7e067bde3c1fbf494d2b8859db510206030b0b67bc3af90b0eb1887b9", + "sha256:d31386d208303a5a6cf0819ef9f6db6680bab9e4ca8e48adb3d4b26ead89beb7", + "sha256:d83b3af53b201970973c5574b39df226746194063bb248a53fd12b470ac34319", + "sha256:df9657b212c054ac6d803290d7c4bcd7790af0b725984fce1eeb0a1e3f2d9798", + "sha256:e576e5fd3f129e6b3595dc734ac7f2b8c548f19ef07781194bc538dc9c0cdbbc", + "sha256:e7400358558094c1bcedc75f3b3c4f400c53130b44833848890a99968dee6a64", + "sha256:eb6a385f8577d30e4cb43dd555fb134ddaae1edeb84205e09dabec332bf49fd0", + "sha256:f27f0875e0873f6bf5df09a456bfcac0667824cabac4cad30b43f36e0382ffe7", + "sha256:fcd4a6d04995f1d66bc78b503e4e59ae72fd32aaec4f661657fe5ae5c1aa4ce3" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==1.1.2" + "markers": "python_version >= '3' and platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32')))))", + "version": "==2.0.0a2" }, "gspread": { "hashes": [ @@ -416,11 +393,11 @@ }, "loguru": { "hashes": [ - "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c", - 
"sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3" + "sha256:b28e72ac7a98be3d28ad28570299a393dfcd32e5e3f6a353dec94675767b6319", + "sha256:f8087ac396b5ee5f67c963b495d615ebbceac2796379599820e324419d53667c" ], "index": "pypi", - "version": "==0.6.0" + "version": "==0.5.3" }, "lxml": { "hashes": [ @@ -895,9 +872,7 @@ "version": "==2022.3.2" }, "requests": { - "extras": [ - "socks" - ], + "extras": [], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" @@ -918,7 +893,7 @@ "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb" ], - "markers": "python_version >= '3.6'", + "markers": "python_version >= '3.6' and python_version < '4'", "version": "==4.8" }, "s3transfer": { @@ -951,44 +926,45 @@ }, "sqlalchemy": { "hashes": [ - "sha256:04164e0063feb7aedd9d073db0fd496edb244be40d46ea1f0d8990815e4b8c34", - "sha256:159c2f69dd6efd28e894f261ffca1100690f28210f34cfcd70b895e0ea7a64f3", - "sha256:199dc6d0068753b6a8c0bd3aceb86a3e782df118260ebc1fa981ea31ee054674", - "sha256:1bbac3e8293b34c4403d297e21e8f10d2a57756b75cff101dc62186adec725f5", - "sha256:20e9eba7fd86ef52e0df25bea83b8b518dfdf0bce09b336cfe51671f52aaaa3f", - "sha256:290cbdf19129ae520d4bdce392648c6fcdbee763bc8f750b53a5ab51880cb9c9", - "sha256:316270e5867566376e69a0ac738b863d41396e2b63274616817e1d34156dff0e", - "sha256:3f88a4ee192142eeed3fe173f673ea6ab1f5a863810a9d85dbf6c67a9bd08f97", - "sha256:4aa96e957141006181ca58e792e900ee511085b8dae06c2d08c00f108280fb8a", - "sha256:4b2bcab3a914715d332ca783e9bda13bc570d8b9ef087563210ba63082c18c16", - "sha256:576684771456d02e24078047c2567025f2011977aa342063468577d94e194b00", - "sha256:5a2e73508f939175363d8a4be9dcdc84cf16a92578d7fa86e6e4ca0e6b3667b2", - "sha256:5ba59761c19b800bc2e1c9324da04d35ef51e4ee9621ff37534bc2290d258f71", - 
"sha256:5dc9801ae9884e822ba942ca493642fb50f049c06b6dbe3178691fce48ceb089", - "sha256:6fdd2dc5931daab778c2b65b03df6ae68376e028a3098eb624d0909d999885bc", - "sha256:708973b5d9e1e441188124aaf13c121e5b03b6054c2df59b32219175a25aa13e", - "sha256:7ff72b3cc9242d1a1c9b84bd945907bf174d74fc2519efe6184d6390a8df478b", - "sha256:8679f9aba5ac22e7bce54ccd8a77641d3aea3e2d96e73e4356c887ebf8ff1082", - "sha256:8b9a395122770a6f08ebfd0321546d7379f43505882c7419d7886856a07caa13", - "sha256:8e1e5d96b744a4f91163290b01045430f3f32579e46d87282449e5b14d27d4ac", - "sha256:9a0195af6b9050c9322a97cf07514f66fe511968e623ca87b2df5e3cf6349615", - "sha256:9cb5698c896fa72f88e7ef04ef62572faf56809093180771d9be8d9f2e264a13", - "sha256:b3f1d9b3aa09ab9adc7f8c4b40fc3e081eb903054c9a6f9ae1633fe15ae503b4", - "sha256:bb42f9b259c33662c6a9b866012f6908a91731a419e69304e1261ba3ab87b8d1", - "sha256:bca714d831e5b8860c3ab134c93aec63d1a4f493bed20084f54e3ce9f0a3bf99", - "sha256:bedd89c34ab62565d44745212814e4b57ef1c24ad4af9b29c504ce40f0dc6558", - "sha256:bfec934aac7f9fa95fc82147a4ba5db0a8bdc4ebf1e33b585ab8860beb10232f", - "sha256:c7046f7aa2db445daccc8424f50b47a66c4039c9f058246b43796aa818f8b751", - "sha256:d7e483f4791fbda60e23926b098702340504f7684ce7e1fd2c1bf02029288423", - "sha256:dd93162615870c976dba43963a24bb418b28448fef584f30755990c134a06a55", - "sha256:e4607d2d16330757818c9d6fba322c2e80b4b112ff24295d1343a80b876eb0ed", - "sha256:e9a680d9665f88346ed339888781f5236347933906c5a56348abb8261282ec48", - "sha256:edfcf93fd92e2f9eef640b3a7a40db20fe3c1d7c2c74faa41424c63dead61b76", - "sha256:f7e4a3c0c3c596296b37f8427c467c8e4336dc8d50f8ed38042e8ba79507b2c9", - "sha256:fff677fa4522dafb5a5e2c0cf909790d5d367326321aeabc0dffc9047cb235bd" + "sha256:045d6a26c262929af0b9cb25441aae675ac04db4ea8bd2446b355617cd6b6b7d", + "sha256:07f4dab2deb6d34618a2ccfff3971a85923ad7c3a9a45401818870fc51d3f0cc", + "sha256:08aaad905aba8940f27aeb9f1f851bf63f18ef97b0062ca41f64afc4b64e0e8c", + "sha256:27a42894a2751e438eaed12fc0dcfe741ff2f66c14760d081222c5adc5460064", 
+ "sha256:2a3e4dc7c452ba3c0f3175ad5a8e0ba49c2b0570a8d07272cf50844c8d78e74f", + "sha256:345306707bb0e51e7cd6e7573adafbce018894ee5e3b9c31134545f704936db0", + "sha256:36f08d94670315ca04c8139bd80b3e02b9dd9cc66fc11bcb96fd10ad51a051ab", + "sha256:3ebb97ed96f4506e2f212e1fcf0ec07a103bb194938627660a5acb4d9feae49c", + "sha256:40b995d7aeeb6f88a1927ce6692c0f626b59d8effd3e1d597f125e141707b37c", + "sha256:4414ace6e3a5e39523e55a5d9f3b215699b2ead4ff91fca98f1b659b7ab2d92a", + "sha256:50107d8183da3fbe5715957aa3954cd9d82aed555c5b4d3fd37fac861af422fa", + "sha256:50174e173d03209c34e07e7b57cca48d0082ac2390edf927aafc706c111da11e", + "sha256:5e88912bf192e7b5739c446d2276e1cba74cfa6c1c93eea2b2534404f6be1dbd", + "sha256:621d3f6c0ba2407bb97e82b649be5ca7d5b6c201dcfb964ce13f517bf1cb6305", + "sha256:623bac2d6bdca3f3e61cf1e1c466c5fb9f5cf08735736ee1111187b7a4108891", + "sha256:671f61c3db4595b0e86cc4b30f675a7c0206d9ce99f041b4f6761c7ddd1e0074", + "sha256:67c1c27c48875afc950bee5ee24582794f20b545e64e4f9ca94071a9b514d6ed", + "sha256:6a6cfd468f54d65324fd3847cfd0148b0610efa6a43e5f5fcc89f455696ae9e7", + "sha256:70048a83f0a1ece1fcd7189891c888e20af2c57fbd33eb760d8cece9843b896c", + "sha256:7ee14a7f9f76d1ef9d5e5b760c9252617c839b87eee04d1ce8325ac66ae155c4", + "sha256:804cf491437f3e4ce31247ab4b309b181f06ecc97d309b746d10f09439b4eb85", + "sha256:878c7beaafa365602762c19f638282e1885454fed1aed86f8fae038933c7c671", + "sha256:954ea8c527c4322afb6885944904714893af81fe9167e421273770991bf08a4a", + "sha256:a47bf6b7ca6c28e4f4e262fabcf5be6b907af81be36de77839c9eeda2cdf3bb3", + "sha256:a4fb5c6ee84a6bba4ff6f9f5379f0b3a0ffe9de7ba5a0945659b3da8d519709b", + "sha256:b34bbc683789559f1bc9bb685fc162e0956dbbdfbe2fbd6755a9f5982c113610", + "sha256:c025d45318b73c0601cca451532556cbab532b2742839ebb8cb58f9ebf06811e", + "sha256:c3ad7f5b61ba014f5045912aea15b03c473bb02b1c07fd92c9d2c794fa183276", + "sha256:c9218e3519398129e364121e0d89823e6ba2a2b77c28bfc661face0829c41433", + 
"sha256:cd5cffd1dd753828f1069f33062f3896e51c990acd957c264f40e051b3e19887", + "sha256:d8efcaa709ea8e7c08c3d3e7639c39b36083f5a995f397f9e6eedf5f5e4e4946", + "sha256:e297a5cc625e3f1367a82deedf2d48ee4d2b2bd263b8b8d2efbaaf5608b5229e", + "sha256:e67278ceb63270cdac0a7b89fc3c29a56f7dac9616a7ee48e7ad6b52e3b631e5", + "sha256:eb6558ba07409dafa18c793c34292b3265be455904966f0724c10198829477e3", + "sha256:f197c66663ed0f9e1178d51141d864688fb244a83f6b17f667d521e482537b2e", + "sha256:f47996b1810894f766c9ee689607077c6c0e0fd6761e04c12ba13efb56d50c1d" ], "index": "pypi", - "version": "==1.4.32" + "version": "==1.4.34" }, "telethon": { "hashes": [ @@ -1163,11 +1139,11 @@ }, "click": { "hashes": [ - "sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b", - "sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976" + "sha256:24e1a4a9ec5bf6299411369b208c1df2188d9eb8d916302fe6bf03faed227f1e", + "sha256:479707fe14d9ec9a0757618b7a100a0ae4c4e236fac5b7f80ca68028141a1a72" ], "markers": "python_version >= '3.7'", - "version": "==8.1.1" + "version": "==8.1.2" }, "coverage": { "extras": [ @@ -1415,9 +1391,7 @@ "version": "==2022.1" }, "requests": { - "extras": [ - "socks" - ], + "extras": [], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" @@ -1501,7 +1475,7 @@ "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" ], - "markers": "python_full_version < '3.11.0'", + "markers": "python_version >= '3.7'", "version": "==2.0.1" }, "typing-extensions": { diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index fb25b58..ccbfa39 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -235,6 +235,20 @@ class Scraper: return archived_url def archive_files(self, result: ScraperResult) -> ScraperResult: + """Archive files corresponding to 
``archived_urls`` dict keys, if the + files have not previously been archived. + + Parameters + ---------- + result: ScraperResult + Previously scraped ScraperResult run with ``archive_media=False``. + + Returns + ------- + ScraperResult + Same ScraperResult as ``result``, but with all URLs in ``archived_urls`` dict archived. + """ + for url in result.archived_urls: if result.archived_urls[url] is None: media_blob, content_type, key = self.url_to_blob(url) @@ -244,7 +258,6 @@ class Scraper: result.media_archived = True return result - def can_handle(self, channel: Channel) -> bool: """Whether or not the scraper can scrape the specified channel. @@ -365,6 +378,10 @@ class ScraperController: else: since = None + # TODO: when a channel has not yet been added to the database (channel.id is None), the `since` query returns the most recently scraped ScraperResult with channel.id == None, which can be from a different platform and channel. Consider adding a check to the query logic above that channel.id is not null. 
+ if channel.id is None: + since = None + posts = scraper.get_posts(channel, since=since, archive_media=archive_media) for post in posts: diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index 034e3ac..f318b1e 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -43,9 +43,12 @@ class BitchuteScraper(Scraper): archived_urls = {} - if archive_media: - if 'video_url' in post: - url = post['video_url'] + if 'video_url' in post: + url = post['video_url'] + archived_urls[url] = None + + if archive_media: + media_blob, content_type, key = self.url_to_blob(url) archived_url = self.archive_blob(media_blob, content_type, key) archived_urls[url] = archived_url @@ -112,6 +115,7 @@ class BitchuteScraper(Scraper): channel=channel.id, raw_data=json.dumps(profile), date_archived=datetime.now(timezone.utc)) + #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# def strip_tags(html, convert_newlines=True): diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index f66d562..36baf67 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -50,25 +50,24 @@ class GabScraper(Scraper): if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): break - media_urls = [] archived_urls = {} - if archive_media: - - for attachment in post.get('media_attachments'): + for attachment in post.get('media_attachments'): + if attachment.get('type') == 'video': + archived_urls[attachment['source_mp4']] = None + else: + archived_urls[attachment['url']] = None + + if post.get('reblog') is not None: + for attachment in post['reblog'].get('media_attachments'): if attachment.get('type') == 'video': - media_urls.append(attachment['source_mp4']) + archived_urls[attachment['source_mp4']] = None else: - media_urls.append(attachment['url']) - - if post.get('reblog') is not None: - for attachment in 
post['reblog'].get('media_attachments'): - if attachment.get('type') == 'video': - media_urls.append(attachment['source_mp4']) - else: - media_urls.append(attachment['url']) + archived_urls[attachment['url']] = None - for url in media_urls: + for url in archived_urls.keys(): + + if archive_media: media_blob, content_type, key = self.url_to_blob(url) archived_url = self.archive_blob(media_blob, content_type, key) archived_urls[url] = archived_url diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index a5088cd..43bc095 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -30,26 +30,25 @@ class GettrScraper(Scraper): archived_urls = {} - if archive_media: + if 'imgs' in post: + for img in post['imgs']: + url = "https://media.gettr.com/" + img + archived_urls[url] = None - if 'imgs' in post: - for img in post['imgs']: - url = "https://media.gettr.com/" + img - media_blob, content_type, key = self.url_to_blob(url) - archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[img] = archived_url + if 'main' in post: + url = "https://media.gettr.com/" + post['main'] + archived_urls[url] = None - if 'main' in post: - url = "https://media.gettr.com/" + post['main'] + if 'ovid' in post: + url = "https://media.gettr.com/" + post['ovid'] + archived_urls[url] = None + + for url in archived_urls.keys(): + + if archive_media: media_blob, content_type, key = self.url_to_blob(url) archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[post['main']] = archived_url - - if 'vid' in post: - url = "https://media.gettr.com/" + post['vid'] - media_blob, content_type, key = self.m3u8_url_to_blob(url) - archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[post['vid']] = archived_url + archived_urls[url] = archived_url yield ScraperResult( scraper=self.__version__, @@ -72,7 +71,7 @@ class GettrScraper(Scraper): return key def get_profile(self, channel: Channel) -> RawChannelInfo: 
- client = client = PublicClient() + client = PublicClient() username = self.get_username_from_url(channel.url) profile = client.user_info(username) diff --git a/cisticola/scraper/instagram.py b/cisticola/scraper/instagram.py index dfe0304..435d69d 100644 --- a/cisticola/scraper/instagram.py +++ b/cisticola/scraper/instagram.py @@ -1,4 +1,4 @@ -from typing import Generator +from typing import Generator, List from datetime import datetime, timezone import os import json @@ -49,28 +49,14 @@ class InstagramScraper(Scraper): post_url = f'{BASE_URL}p/{post.shortcode}/' - archived_urls = {} + archived_urls = get_archived_urls_from_post(post = post) - if archive_media: + for url in archived_urls.keys(): - with tempfile.TemporaryDirectory() as temp_dir: - - loader.download_post(post = post, target = Path(temp_dir)) - - files = os.listdir(temp_dir) - files = [f for f in files if not f.endswith('.txt')] - - for file in files: - ext = file.split('.')[-1] - content_type = CONTENT_TYPES[ext] - filename = Path(temp_dir, file) - key = f'{post.shortcode}__{file}' - - with open(filename, 'rb') as f: - blob = f.read() - - archived_url = self.archive_blob(blob = blob, content_type = content_type, key = key) - archived_urls[post_url] = archived_url + if archive_media: + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_blob(media_blob, content_type, key) + archived_urls[url] = archived_url yield ScraperResult( scraper=self.__version__, @@ -98,7 +84,7 @@ class InstagramScraper(Scraper): date_archived=datetime.now(timezone.utc), raw_posts=json.dumps(comment_dict, default=str), archived_urls={}, - media_archived=archive_media) + media_archived=True) def can_handle(self, channel): if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None: @@ -126,7 +112,20 @@ class InstagramScraper(Scraper): profile['followees'] = user_profile.followees return RawChannelInfo(scraper=self.__version__, - platform=channel.platform, - 
channel=channel.id, - raw_data=json.dumps(profile), - date_archived=datetime.now(timezone.utc)) + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(profile), + date_archived=datetime.now(timezone.utc)) + +def get_archived_urls_from_post(post: instaloader.Post) -> List[str]: + typename = post._node['__typename'] + if typename == 'GraphImage': + urls = [post._node['display_url']] + elif typename == 'GraphVideo': + urls = [post._node['video_url']] + elif typename == 'GraphSidecar': + urls = [edge['node']['display_url'] for edge in post._node['edge_sidecar_to_children']['edges']] + else: + raise NotImplementedError(f'post of type {typename} is currently not supported.') + + return {url : None for url in urls} \ No newline at end of file diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index 0f5db65..020a2ba 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -36,10 +36,11 @@ class OdyseeScraper(Scraper): if since is not None and datetime.fromtimestamp(video.info['created']) <= since.date: break - archived_urls = {} + url = video.info['streaming_url'] + + archived_urls = {url: None} if archive_media: - url = video.info['streaming_url'] # Check if file is a video file or an m3u8 file r = requests.head(url) @@ -77,6 +78,21 @@ class OdyseeScraper(Scraper): archived_urls={}, media_archived=True) + def archive_files(self, result: ScraperResult) -> ScraperResult: + for url in result.archived_urls: + if result.archived_urls[url] is None: + r = requests.head(url) + if r.headers['Content-Type'] == 'text/html; charset=utf-8': + media_blob, content_type, key = self.m3u8_url_to_blob(url) + else: + media_blob, content_type, key = self.url_to_blob(url) + + archived_url = self.archive_blob(media_blob, content_type, key) + result.archived_urls[url] = archived_url + + result.media_archived = True + return result + def can_handle(self, channel): if channel.platform == "Odysee" and self.get_username_from_url(channel.url) 
is not None: return True @@ -94,7 +110,7 @@ class OdyseeScraper(Scraper): profile = odysee_channel.info return RawChannelInfo(scraper=self.__version__, - platform=channel.platform, - channel=channel.id, - raw_data=json.dumps(profile), - date_archived=datetime.now(timezone.utc)) \ No newline at end of file + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(profile), + date_archived=datetime.now(timezone.utc)) \ No newline at end of file diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index cb24c57..2a4d968 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -19,18 +19,18 @@ class RumbleScraper(Scraper): scraper = get_channel_videos(channel.url) for post in scraper: - if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc): + if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): break - archived_urls = {} + url = post['media_url'] + + archived_urls = {url: None} if archive_media: - url = post['media_url'] - media_blob, content_type, key = self.ytdlp_url_to_blob(url) archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[post['media_url']] = archived_url + archived_urls[url] = archived_url yield ScraperResult( scraper=self.__version__, @@ -48,6 +48,16 @@ class RumbleScraper(Scraper): key = urlparse(url).path.split('/')[-2] + ext return key + def archive_files(self, result: ScraperResult) -> ScraperResult: + for url in result.archived_urls: + if result.archived_urls[url] is None: + media_blob, content_type, key = self.ytdlp_url_to_blob(url) + archived_url = self.archive_blob(media_blob, content_type, key) + result.archived_urls[url] = archived_url + + result.media_archived = True + return result + def can_handle(self, channel): if channel.platform == "Rumble" and channel.url is not None: return True @@ -57,10 +67,10 @@ class RumbleScraper(Scraper): 
profile = get_channel_profile(url = channel.url) return RawChannelInfo(scraper=self.__version__, - platform=channel.platform, - channel=channel.id, - raw_data=json.dumps(profile), - date_archived=datetime.now(timezone.utc)) + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(profile), + date_archived=datetime.now(timezone.utc)) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index e683296..e272db5 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -33,8 +33,8 @@ class TelegramSnscrapeScraper(Scraper): for image_url in post.images: archived_urls[image_url] = None - if post.video: - archived_urls[post.video] = None + for video_url in post.videos: + archived_urls[video_url] = None if archive_media: for url in archived_urls: diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index 6ed37db..1d00d53 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -14,7 +14,7 @@ class TwitterScraper(Scraper): def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: if channel.platform_id: - identifier = channel.platform_id + identifier = int(channel.platform_id) else: identifier = channel.screenname @@ -23,7 +23,7 @@ class TwitterScraper(Scraper): first = True for tweet in scraper.get_items(): - if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc): + if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): # with TwitterProfileScraper, the first tweet could be an old pinned tweet if first: first = False @@ -105,7 +105,7 @@ class TwitterScraper(Scraper): raise ChannelDoesNotExistError(channel.url) else: return RawChannelInfo(scraper=self.__version__, - 
platform=channel.platform, - channel=channel.id, - raw_data=json.dumps(entity.__dict__, default=str), - date_archived=datetime.now(timezone.utc)) + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(entity.__dict__, default=str), + date_archived=datetime.now(timezone.utc)) diff --git a/cisticola/scraper/vkontakte.py b/cisticola/scraper/vkontakte.py index 3f23bca..5e3d5d3 100644 --- a/cisticola/scraper/vkontakte.py +++ b/cisticola/scraper/vkontakte.py @@ -1,8 +1,12 @@ from datetime import datetime, timezone from typing import Generator from urllib.parse import urlparse +import json +import re + from snscrape.modules.vkontakte import VKontakteUserScraper from loguru import logger +from yt_dlp.extractor.vk import VKIE from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.scraper.base import Scraper @@ -24,7 +28,7 @@ class VkontakteScraper(Scraper): first = True for post in scraper.get_items(): - if since is not None and datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc): + if since is not None and datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): # with VKontakteUserScraper, the first tweet could be an old pinned tweet if first: first = False @@ -34,23 +38,26 @@ class VkontakteScraper(Scraper): archived_urls = {} - if archive_media: + if post.photos: - if post.photos: + for photo in post.photos: + variant = max( + [v for v in photo.variants], key=lambda v: v.width * v.height) + url = variant.url + if url is not None: + archived_urls[url] = None - for photo in post.photos: - variant = max( - [v for v in photo.variants], key=lambda v: v.width * v.height) - url = variant.url - - if url is not None: - media_blob, content_type, key = self.url_to_blob(url) - archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[url] = archived_url + if post.video: + 
archived_urls[post.video.url] = None - if post.video: - url = post.video.url - media_blob, content_type, key = self.ytdlp_url_to_blob(url) + for url in archived_urls.keys(): + + if archive_media: + if re.match(VKIE._VALID_URL, url): + # Uses regex from yt_dlp to verify VK video URL + media_blob, content_type, key = self.ytdlp_url_to_blob(url) + else: + media_blob, content_type, key = self.url_to_blob(url) archived_url = self.archive_blob(media_blob, content_type, key) archived_urls[url] = archived_url @@ -65,6 +72,21 @@ class VkontakteScraper(Scraper): archived_urls=archived_urls, media_archived=archive_media) + def archive_files(self, result: ScraperResult) -> ScraperResult: + for url in result.archived_urls: + if result.archived_urls[url] is None: + if re.match(VKIE._VALID_URL, url): + # Uses regex from yt_dlp to verify VK video URL + media_blob, content_type, key = self.ytdlp_url_to_blob(url) + else: + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_blob(media_blob, content_type, key) + result.archived_urls[url] = archived_url + + result.media_archived = True + return result + + def can_handle(self, channel): if channel.platform == "Vkontakte" and channel.platform_id: return True @@ -87,7 +109,7 @@ class VkontakteScraper(Scraper): profile = scraper._get_entity().__dict__ return RawChannelInfo(scraper=self.__version__, - platform=channel.platform, - channel=channel.id, - raw_data=json.dumps(profile), - date_archived=datetime.now(timezone.utc)) + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(profile), + date_archived=datetime.now(timezone.utc)) diff --git a/cisticola/scraper/youtube.py b/cisticola/scraper/youtube.py index 0d4c8e3..f937d24 100644 --- a/cisticola/scraper/youtube.py +++ b/cisticola/scraper/youtube.py @@ -2,7 +2,11 @@ from datetime import datetime, timezone import json from typing import Generator import tempfile +from pathlib import Path +import os + import yt_dlp +from loguru import logger 
from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.scraper import Scraper @@ -46,7 +50,10 @@ class YoutubeScraper(Scraper): for video in valid_videos: - archived_urls = {} + url = video['webpage_url'] + + archived_urls = {url: None} + video_id = video["id"] video_ext = video["ext"] @@ -54,11 +61,8 @@ class YoutubeScraper(Scraper): key = f"{video_id}.{video_ext}" - with open(f"{temp_dir}/{key}", "rb") as f: + with open(Path(temp_dir)/key, "rb") as f: media_blob = f.read() - archived_url = self.archive_blob(media_blob, content_type, key) - - url = video['webpage_url'] archived_url = self.archive_blob(media_blob, content_type, key) archived_urls[url] = archived_url @@ -78,6 +82,41 @@ class YoutubeScraper(Scraper): if channel.platform == "Youtube" and channel.url: return True + def archive_files(self, result: ScraperResult) -> ScraperResult: + for url in result.archived_urls: + if result.archived_urls[url] is None: + + media_blob = None + + with tempfile.TemporaryDirectory() as temp_dir: + + ydl_opts = { + "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", + "merge_output_format": "mp4", + "outtmpl": f"{temp_dir}/%(id)s.%(ext)s"} + + ydl = yt_dlp.YoutubeDL(ydl_opts) + + try: + ydl.download(url) + except yt_dlp.utils.DownloadError as e: + raise e + + files = os.listdir(temp_dir) + if len(files) != 1: + logger.warning(f'{len(files)} files downloaded for video: {url}') + key = files[0] + with open(Path(temp_dir, key), 'rb') as f: + media_blob = f.read() + + if media_blob is not None: + content_type = 'video/mp4' + archived_url = self.archive_blob(media_blob, content_type, key) + result.archived_urls[url] = archived_url + + result.media_archived = True + return result + def get_profile(self, channel: Channel) -> RawChannelInfo: ydl_opts = {} ydl = yt_dlp.YoutubeDL(ydl_opts) @@ -87,12 +126,13 @@ class YoutubeScraper(Scraper): meta = ydl.extract_info( channel.url, process=False) + meta.pop('entries') return 
RawChannelInfo(scraper=self.__version__, - platform=channel.platform, - channel=channel.id, - raw_data=json.dumps(meta), - date_archived=datetime.now(timezone.utc)) + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(meta), + date_archived=datetime.now(timezone.utc)) except yt_dlp.utils.DownloadError as e: raise e diff --git a/pytest.ini b/pytest.ini index 744f87d..ae2a8b6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -12,10 +12,9 @@ addopts = --html='reports/tests.html' --self-contained-html markers = - profile: marks tests for only extracting channel metadata (deselect with '-m - "not profile"') - media: marks tests for archiving all media attachments (deselect with '-m - "not media"') + profile: marks tests for only extracting channel metadata (deselect with '-m "not profile"') + media: marks tests for archiving all media attachments (deselect with '-m "not media"') + unarchived: marks tests for archiving all unarchived media attachments (deselect with '-m "not unarchived"') filterwarnings = ignore:the imp module is deprecated:DeprecationWarning ignore:The localize method is no longer necessary, as this time zone supports the fold attribute diff --git a/tests/conftest.py b/tests/conftest.py index 3bccf81..684c15d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -99,12 +99,12 @@ RUMBLE_CHANNEL_KWARGS = { 'notes': ''} TELEGRAM_CHANNEL_KWARGS = { - 'name': 'USA Freedom Convoy (test)', - 'platform_id': -1001799578085, + 'name': 'South West Ohio Proud Boys (test)', + 'platform_id': -1001276612436, 'category': 'test', 'platform': 'Telegram', - 'url': 'https://t.me/usafreedomconvoy2022', - 'screenname': 'usafreedomconvoy2022', + 'url': 'https://t.me/SouthwestOhioPB', + 'screenname': 'SouthwestOhioPB', 'country': 'US', 'influencer': None, 'public': True, diff --git a/tests/scraper/bitchute.py b/tests/scraper/bitchute.py index 94707ec..62b3ffe 100644 --- a/tests/scraper/bitchute.py +++ b/tests/scraper/bitchute.py @@ -3,12 +3,19 @@ import 
pytest from cisticola.base import Channel from cisticola.scraper import BitchuteScraper +@pytest.mark.unarchived def test_scrape_bitchute_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['bitchute'])] controller.register_scraper(scraper = BitchuteScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_bitchute_channel_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_bitchute_channel(controller, channel_kwargs): diff --git a/tests/scraper/gab.py b/tests/scraper/gab.py index d600429..79ba8d7 100644 --- a/tests/scraper/gab.py +++ b/tests/scraper/gab.py @@ -3,12 +3,19 @@ import pytest from cisticola.base import Channel from cisticola.scraper import GabScraper +@pytest.mark.unarchived def test_scrape_gab_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['gab'])] controller.register_scraper(scraper = GabScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_gab_channel_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_gab_channel(controller, channel_kwargs): diff --git a/tests/scraper/gettr.py b/tests/scraper/gettr.py index 81a8bb8..352e839 100644 --- a/tests/scraper/gettr.py +++ b/tests/scraper/gettr.py @@ -3,12 +3,19 @@ import pytest from cisticola.base import Channel from cisticola.scraper import GettrScraper +@pytest.mark.unarchived def test_scrape_gettr_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['gettr'])] controller.register_scraper(scraper = GettrScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_gettr_channel_unarchived_media(controller): + + 
controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_gettr_channel(controller, channel_kwargs): diff --git a/tests/scraper/instagram.py b/tests/scraper/instagram.py index 98a0684..099ab40 100644 --- a/tests/scraper/instagram.py +++ b/tests/scraper/instagram.py @@ -3,12 +3,19 @@ import pytest from cisticola.base import Channel from cisticola.scraper import InstagramScraper +@pytest.mark.unarchived def test_scrape_instagram_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['instagram'])] controller.register_scraper(scraper = InstagramScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_instagram_channel_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_instagram_channel(controller, channel_kwargs): diff --git a/tests/scraper/odysee.py b/tests/scraper/odysee.py index 84a45f8..9883bdb 100644 --- a/tests/scraper/odysee.py +++ b/tests/scraper/odysee.py @@ -3,12 +3,19 @@ import pytest from cisticola.base import Channel from cisticola.scraper import OdyseeScraper +@pytest.mark.unarchived def test_scrape_odysee_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['odysee'])] controller.register_scraper(scraper = OdyseeScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_odysee_channel_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_odysee_channel(controller, channel_kwargs): diff --git a/tests/scraper/rumble.py b/tests/scraper/rumble.py index 18c8749..5b01f9c 100644 --- a/tests/scraper/rumble.py +++ b/tests/scraper/rumble.py @@ -3,12 +3,19 @@ import pytest from cisticola.base import Channel from cisticola.scraper import RumbleScraper +@pytest.mark.unarchived def 
test_scrape_rumble_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['rumble'])] controller.register_scraper(scraper = RumbleScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_rumble_channel_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_rumble_channel(controller, channel_kwargs): diff --git a/tests/scraper/telegram_snscrape.py b/tests/scraper/telegram_snscrape.py index dbaed43..5dbe151 100644 --- a/tests/scraper/telegram_snscrape.py +++ b/tests/scraper/telegram_snscrape.py @@ -3,12 +3,19 @@ import pytest from cisticola.base import Channel from cisticola.scraper import TelegramSnscrapeScraper +@pytest.mark.unarchived def test_scrape_telegram_snscrape_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['telegram'])] controller.register_scraper(scraper = TelegramSnscrapeScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_telegram_snscrape_channel_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_telegram_snscrape_channel(controller, channel_kwargs): diff --git a/tests/scraper/telegram_telethon.py b/tests/scraper/telegram_telethon.py index ee994eb..8dbe9ff 100644 --- a/tests/scraper/telegram_telethon.py +++ b/tests/scraper/telegram_telethon.py @@ -3,6 +3,7 @@ import pytest from cisticola.base import Channel from cisticola.scraper import TelegramTelethonScraper +@pytest.mark.unarchived def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs): controller.remove_all_scrapers() @@ -10,6 +11,12 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = TelegramTelethonScraper()) controller.scrape_channels(channels = channels, 
archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_telegram_telethon_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_telegram_telethon_channel(controller, channel_kwargs): diff --git a/tests/scraper/twitter.py b/tests/scraper/twitter.py index 97765aa..0a4ad86 100644 --- a/tests/scraper/twitter.py +++ b/tests/scraper/twitter.py @@ -3,12 +3,19 @@ import pytest from cisticola.base import Channel from cisticola.scraper import TwitterScraper +@pytest.mark.unarchived def test_scrape_twitter_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['twitter'])] controller.register_scraper(scraper = TwitterScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_twitter_channel_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_twitter_channel(controller, channel_kwargs): diff --git a/tests/scraper/vkontakte.py b/tests/scraper/vkontakte.py index 4209c30..12ff12c 100644 --- a/tests/scraper/vkontakte.py +++ b/tests/scraper/vkontakte.py @@ -3,12 +3,19 @@ import pytest from cisticola.base import Channel from cisticola.scraper import VkontakteScraper +@pytest.mark.unarchived def test_scrape_vkontakte_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['vkontakte'])] controller.register_scraper(scraper = VkontakteScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_vkontakte_channel_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_vkontakte_channel(controller, channel_kwargs): diff --git a/tests/scraper/youtube.py b/tests/scraper/youtube.py index 1750b08..79ba7c7 100644 --- a/tests/scraper/youtube.py +++ b/tests/scraper/youtube.py @@ -3,12 
+3,19 @@ import pytest from cisticola.base import Channel from cisticola.scraper import YoutubeScraper +@pytest.mark.unarchived def test_scrape_youtube_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['youtube'])] controller.register_scraper(scraper = YoutubeScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_youtube_channel_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_youtube_channel(controller, channel_kwargs):