mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-11 12:58:33 +03:00
Implemented deferred media archiving for all scrapers and added tests for it. Refactored the archiving methods of the Instagram and Gettr scrapers so that they can use the default archiving method.
This commit is contained in:
2
Pipfile
2
Pipfile
@@ -36,7 +36,7 @@ sphinx = "*"
|
||||
sphinx_rtd_theme = "*"
|
||||
|
||||
[requires]
|
||||
python_version = "3.8"
|
||||
python_version = "3.9"
|
||||
|
||||
[pipenv]
|
||||
allow_prereleases = true
|
||||
|
||||
254
Pipfile.lock
generated
254
Pipfile.lock
generated
@@ -1,11 +1,11 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "e57f79178ac0e05f9753a29f97e08d2ae96b7775044bb4c6ba616baae1d21183"
|
||||
"sha256": "b9fc02f3ecaa2199480c4fcba30f02780860dfbc2e10c026889c78f639709fb4"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
"python_version": "3.8"
|
||||
"python_version": "3.9"
|
||||
},
|
||||
"sources": [
|
||||
{
|
||||
@@ -16,28 +16,6 @@
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"backports.zoneinfo": {
|
||||
"hashes": [
|
||||
"sha256:17746bd546106fa389c51dbea67c8b7c8f0d14b5526a579ca6ccf5ed72c526cf",
|
||||
"sha256:1b13e654a55cd45672cb54ed12148cd33628f672548f373963b0bff67b217328",
|
||||
"sha256:1c5742112073a563c81f786e77514969acb58649bcdf6cdf0b4ed31a348d4546",
|
||||
"sha256:4a0f800587060bf8880f954dbef70de6c11bbe59c673c3d818921f042f9954a6",
|
||||
"sha256:5c144945a7752ca544b4b78c8c41544cdfaf9786f25fe5ffb10e838e19a27570",
|
||||
"sha256:7b0a64cda4145548fed9efc10322770f929b944ce5cee6c0dfe0c87bf4c0c8c9",
|
||||
"sha256:8439c030a11780786a2002261569bdf362264f605dfa4d65090b64b05c9f79a7",
|
||||
"sha256:8961c0f32cd0336fb8e8ead11a1f8cd99ec07145ec2931122faaac1c8f7fd987",
|
||||
"sha256:89a48c0d158a3cc3f654da4c2de1ceba85263fafb861b98b59040a5086259722",
|
||||
"sha256:a76b38c52400b762e48131494ba26be363491ac4f9a04c1b7e92483d169f6582",
|
||||
"sha256:da6013fd84a690242c310d77ddb8441a559e9cb3d3d59ebac9aca1a57b2e18bc",
|
||||
"sha256:e55b384612d93be96506932a786bbcde5a2db7a9e6a4bb4bffe8b733f5b9036b",
|
||||
"sha256:e81b76cace8eda1fca50e345242ba977f9be6ae3945af8d46326d776b4cf78d1",
|
||||
"sha256:e8236383a20872c0cdf5a62b554b27538db7fa1bbec52429d8d106effbaeca08",
|
||||
"sha256:f04e857b59d9d1ccc39ce2da1021d196e47234873820cbeaad210724b1ee28ac",
|
||||
"sha256:fadbfe37f74051d024037f223b8e001611eac868b5c5b06144ef4d8b799862f2"
|
||||
],
|
||||
"markers": "python_version >= '3.6' and python_version < '3.9'",
|
||||
"version": "==0.2.1"
|
||||
},
|
||||
"beautifulsoup4": {
|
||||
"hashes": [
|
||||
"sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf",
|
||||
@@ -48,19 +26,19 @@
|
||||
},
|
||||
"boto3": {
|
||||
"hashes": [
|
||||
"sha256:ef210f8e85cdb6d26a38ebad1cfe9cefdef2ab269207e5987653555375a7ef6b",
|
||||
"sha256:f0af8f4ef5fe6353c794cd3cce627d469a618b58ace7ca75a63cfd719df615ce"
|
||||
"sha256:35f68b60652bff50e7bc926238443cb578f29f120908bb945e5640e90c6dd53e",
|
||||
"sha256:7f3f93ee97215862ccd1a216f37deb7d64055c71f826b821805904df7b84ee6a"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.21.30"
|
||||
"version": "==1.21.31"
|
||||
},
|
||||
"botocore": {
|
||||
"hashes": [
|
||||
"sha256:af4bdc51eeecbe9fdcdadbed9ad58c5c91380ef30f3560022bbc2ee1d78f0ad6",
|
||||
"sha256:c622751093e3d0bf61343e66d6d06190ef30bf42b1557d5070ca84e9efa06d4b"
|
||||
"sha256:3bb21e3ee5e4de3ed76bb99b4496a46e9b5c82e7b7fdb62702f11dda1b57b769",
|
||||
"sha256:424fd94bef86a11f5340dc15eb50602dedec2ecc01c3a25c4fea23a2c8195500"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.24.30"
|
||||
"version": "==1.24.31"
|
||||
},
|
||||
"brotli": {
|
||||
"hashes": [
|
||||
@@ -217,11 +195,11 @@
|
||||
},
|
||||
"click": {
|
||||
"hashes": [
|
||||
"sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b",
|
||||
"sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976"
|
||||
"sha256:24e1a4a9ec5bf6299411369b208c1df2188d9eb8d916302fe6bf03faed227f1e",
|
||||
"sha256:479707fe14d9ec9a0757618b7a100a0ae4c4e236fac5b7f80ca68028141a1a72"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==8.1.1"
|
||||
"version": "==8.1.2"
|
||||
},
|
||||
"cryptg": {
|
||||
"hashes": [
|
||||
@@ -324,64 +302,63 @@
|
||||
},
|
||||
"greenlet": {
|
||||
"hashes": [
|
||||
"sha256:0051c6f1f27cb756ffc0ffbac7d2cd48cb0362ac1736871399a739b2885134d3",
|
||||
"sha256:00e44c8afdbe5467e4f7b5851be223be68adb4272f44696ee71fe46b7036a711",
|
||||
"sha256:013d61294b6cd8fe3242932c1c5e36e5d1db2c8afb58606c5a67efce62c1f5fd",
|
||||
"sha256:049fe7579230e44daef03a259faa24511d10ebfa44f69411d99e6a184fe68073",
|
||||
"sha256:14d4f3cd4e8b524ae9b8aa567858beed70c392fdec26dbdb0a8a418392e71708",
|
||||
"sha256:166eac03e48784a6a6e0e5f041cfebb1ab400b394db188c48b3a84737f505b67",
|
||||
"sha256:17ff94e7a83aa8671a25bf5b59326ec26da379ace2ebc4411d690d80a7fbcf23",
|
||||
"sha256:1e12bdc622676ce47ae9abbf455c189e442afdde8818d9da983085df6312e7a1",
|
||||
"sha256:21915eb821a6b3d9d8eefdaf57d6c345b970ad722f856cd71739493ce003ad08",
|
||||
"sha256:288c6a76705dc54fba69fbcb59904ae4ad768b4c768839b8ca5fdadec6dd8cfd",
|
||||
"sha256:2bde6792f313f4e918caabc46532aa64aa27a0db05d75b20edfc5c6f46479de2",
|
||||
"sha256:32ca72bbc673adbcfecb935bb3fb1b74e663d10a4b241aaa2f5a75fe1d1f90aa",
|
||||
"sha256:356b3576ad078c89a6107caa9c50cc14e98e3a6c4874a37c3e0273e4baf33de8",
|
||||
"sha256:40b951f601af999a8bf2ce8c71e8aaa4e8c6f78ff8afae7b808aae2dc50d4c40",
|
||||
"sha256:572e1787d1460da79590bf44304abbc0a2da944ea64ec549188fa84d89bba7ab",
|
||||
"sha256:58df5c2a0e293bf665a51f8a100d3e9956febfbf1d9aaf8c0677cf70218910c6",
|
||||
"sha256:64e6175c2e53195278d7388c454e0b30997573f3f4bd63697f88d855f7a6a1fc",
|
||||
"sha256:7227b47e73dedaa513cdebb98469705ef0d66eb5a1250144468e9c3097d6b59b",
|
||||
"sha256:7418b6bfc7fe3331541b84bb2141c9baf1ec7132a7ecd9f375912eca810e714e",
|
||||
"sha256:7cbd7574ce8e138bda9df4efc6bf2ab8572c9aff640d8ecfece1b006b68da963",
|
||||
"sha256:7ff61ff178250f9bb3cd89752df0f1dd0e27316a8bd1465351652b1b4a4cdfd3",
|
||||
"sha256:833e1551925ed51e6b44c800e71e77dacd7e49181fdc9ac9a0bf3714d515785d",
|
||||
"sha256:8639cadfda96737427330a094476d4c7a56ac03de7265622fcf4cfe57c8ae18d",
|
||||
"sha256:8c5d5b35f789a030ebb95bff352f1d27a93d81069f2adb3182d99882e095cefe",
|
||||
"sha256:8c790abda465726cfb8bb08bd4ca9a5d0a7bd77c7ac1ca1b839ad823b948ea28",
|
||||
"sha256:8d2f1fb53a421b410751887eb4ff21386d119ef9cde3797bf5e7ed49fb51a3b3",
|
||||
"sha256:903bbd302a2378f984aef528f76d4c9b1748f318fe1294961c072bdc7f2ffa3e",
|
||||
"sha256:93f81b134a165cc17123626ab8da2e30c0455441d4ab5576eed73a64c025b25c",
|
||||
"sha256:95e69877983ea39b7303570fa6760f81a3eec23d0e3ab2021b7144b94d06202d",
|
||||
"sha256:9633b3034d3d901f0a46b7939f8c4d64427dfba6bbc5a36b1a67364cf148a1b0",
|
||||
"sha256:97e5306482182170ade15c4b0d8386ded995a07d7cc2ca8f27958d34d6736497",
|
||||
"sha256:9f3cba480d3deb69f6ee2c1825060177a22c7826431458c697df88e6aeb3caee",
|
||||
"sha256:aa5b467f15e78b82257319aebc78dd2915e4c1436c3c0d1ad6f53e47ba6e2713",
|
||||
"sha256:abb7a75ed8b968f3061327c433a0fbd17b729947b400747c334a9c29a9af6c58",
|
||||
"sha256:aec52725173bd3a7b56fe91bc56eccb26fbdff1386ef123abb63c84c5b43b63a",
|
||||
"sha256:b11548073a2213d950c3f671aa88e6f83cda6e2fb97a8b6317b1b5b33d850e06",
|
||||
"sha256:b1692f7d6bc45e3200844be0dba153612103db241691088626a33ff1f24a0d88",
|
||||
"sha256:b336501a05e13b616ef81ce329c0e09ac5ed8c732d9ba7e3e983fcc1a9e86965",
|
||||
"sha256:b8c008de9d0daba7b6666aa5bbfdc23dcd78cafc33997c9b7741ff6353bafb7f",
|
||||
"sha256:b92e29e58bef6d9cfd340c72b04d74c4b4e9f70c9fa7c78b674d1fec18896dc4",
|
||||
"sha256:be5f425ff1f5f4b3c1e33ad64ab994eed12fc284a6ea71c5243fd564502ecbe5",
|
||||
"sha256:dd0b1e9e891f69e7675ba5c92e28b90eaa045f6ab134ffe70b52e948aa175b3c",
|
||||
"sha256:e30f5ea4ae2346e62cedde8794a56858a67b878dd79f7df76a0767e356b1744a",
|
||||
"sha256:e6a36bb9474218c7a5b27ae476035497a6990e21d04c279884eb10d9b290f1b1",
|
||||
"sha256:e859fcb4cbe93504ea18008d1df98dee4f7766db66c435e4882ab35cf70cac43",
|
||||
"sha256:eb6ea6da4c787111adf40f697b4e58732ee0942b5d3bd8f435277643329ba627",
|
||||
"sha256:ec8c433b3ab0419100bd45b47c9c8551248a5aee30ca5e9d399a0b57ac04651b",
|
||||
"sha256:eff9d20417ff9dcb0d25e2defc2574d10b491bf2e693b4e491914738b7908168",
|
||||
"sha256:f0214eb2a23b85528310dad848ad2ac58e735612929c8072f6093f3585fd342d",
|
||||
"sha256:f276df9830dba7a333544bd41070e8175762a7ac20350786b322b714b0e654f5",
|
||||
"sha256:f3acda1924472472ddd60c29e5b9db0cec629fbe3c5c5accb74d6d6d14773478",
|
||||
"sha256:f70a9e237bb792c7cc7e44c531fd48f5897961701cdaa06cf22fc14965c496cf",
|
||||
"sha256:f9d29ca8a77117315101425ec7ec2a47a22ccf59f5593378fc4077ac5b754fce",
|
||||
"sha256:fa877ca7f6b48054f847b61d6fa7bed5cebb663ebc55e018fda12db09dcc664c",
|
||||
"sha256:fdcec0b8399108577ec290f55551d926d9a1fa6cad45882093a7a07ac5ec147b"
|
||||
"sha256:004aed447382d80a56ecc354a6d807f305e6c808714ce6ccbca4839c94fae81d",
|
||||
"sha256:068d68fad6bd623e29a2d36e74538c9b9d6dc6464931cd27d93da6cfc6a7f242",
|
||||
"sha256:06fd4075754009c9817c6b4e1dc0af4616de52757b6ca973a81c3c1aadc28257",
|
||||
"sha256:1004cb542451814b12a4f38e835a47734e2b2c683acbf463d5ae76282a3974cf",
|
||||
"sha256:10c358633a8b27bfc32d27114ef2ca2ddc9f1f89f1643d1157b85e1fdd695315",
|
||||
"sha256:115bc25fefbdc692c4483e9ddb9011ccd0251590ed59dbfff0f4eb7050bf99c4",
|
||||
"sha256:1d987a2579336792f73ae6b106c2f087e32afc8573fbf9566f123ac6d8cfb72f",
|
||||
"sha256:2128d727fd1e8afba8e68feb2cdcf88c90163b69ddc9707722a3e491c5280720",
|
||||
"sha256:230132c241fe284f93f2e7b3969e9b22bbd76ef98cf93e382c945d378907f5a4",
|
||||
"sha256:23558f7bd08a663386c032ab8d302d613d2d02ae0c9758ad410bab6035b58d3d",
|
||||
"sha256:255d520d3e4a5f16883b182e1a94219fe455ab4f50aaaf534bfd6d64ee728397",
|
||||
"sha256:2a6bc19a728f6f643cfc89b876159a1a25a8f7d8700c013d48a73691f80b4550",
|
||||
"sha256:379bed346ef8ba0a0e698b3c5975a44d15dd4a5bbff40bbd7fd548b445d5550b",
|
||||
"sha256:3b12d0866759db93b0a893b4e50a7d7d1681519d2346c26695bb8bb2c652230e",
|
||||
"sha256:40d491944f69e350e1e8b25f6ca49459824ede1678ec0cd4b5541f41edc06614",
|
||||
"sha256:471484c7b9d7b7867263051aa81cdeed6e06b455e629a7f05eb91a6cb8bd0836",
|
||||
"sha256:488c557080557bc01aabb3e1bda7225c68455b853733a8652857ac0d810dad1b",
|
||||
"sha256:49c2e76e7aa81ba889b3c183e2341af3cc6161ee38852085110ae49d5b5d9a40",
|
||||
"sha256:52d13ec90236e5935ed6da044e78faa1371d5116cc43fe6d7ca8994dd619ef96",
|
||||
"sha256:57898c69a253d81f487787bdd538629fabd671fab8a9e31b041ca30965fd9556",
|
||||
"sha256:5d577eef5beb5730ef01ab39983eb852a97c359b7a546809adf70c409f4b2ecc",
|
||||
"sha256:6a41987c1474c9158a0c0c96611530a8f299bc547d35bee8add981b8b2534f74",
|
||||
"sha256:6ae67b7df8db3626af8e042e9c6949cfa27d1a3bbbfdff29e45b72bb6673a650",
|
||||
"sha256:6c42c27e9d12e8a481aff469ffe8dd4ce0484c354a418470960f760f6ae41e7c",
|
||||
"sha256:6c4a90c9f6128b4d0905a89930bd325e0491574e5cb453f606bb7094a3197587",
|
||||
"sha256:6e64518e5833ac2d9359b6d9bd4df2c0cf441a0f3a4eca9e735fbea99009fa70",
|
||||
"sha256:6fd3a270c23c5b42d86a9c7c6b0229f23ee4a7a4cabdaaa1693ad7a0982d13cb",
|
||||
"sha256:70db73351e0fcf11a76288c47a0469d9a330bcb2e7618c5eb57432b8caa82403",
|
||||
"sha256:771f401692046845626cbdf1dd0f04e999413ede0ee9ad39033fe30b5fa2e845",
|
||||
"sha256:7935026ec61b967cbc6b746c0ca75c1651ea118d7fee4d259cff9e6866153374",
|
||||
"sha256:7b76b1cac9baac1980210e29145800954e7b42e91ef69c4d695de1cab87ce41f",
|
||||
"sha256:7e3f37c11b6699b1a1e0fcc0e88829dba4f2866546381b05ab8b3f4db645a823",
|
||||
"sha256:8370fa65ad421484894f559055f951843754153b72b9bca2ebdc5288efe2e3f0",
|
||||
"sha256:8ae9c443d44a4e23252632e4d7775f419f992d0df3eff923e23775f5cc551d39",
|
||||
"sha256:8b31d85f2781e44f1ffaaf7ea07f484e7d42317c677c355fa77b4a1a4bea7394",
|
||||
"sha256:8b450336b27f3b375cadc474c6704838eaa8dd3ca312aac3bb69d92264a8e638",
|
||||
"sha256:9ce84357388a76d886febff4e50e321c212ffd3248b590960b2da6e02404a5c9",
|
||||
"sha256:a23e986fb0ba8e7407286add41fa0d4207be44e3dce1b04789f4757800eca1cf",
|
||||
"sha256:a81610ee00d0da9cd2c8679479b7791149365b6dfb3971b01b22ee29b04787ce",
|
||||
"sha256:b4e40444975e5ab0ed3004369209c39a28e084951daaeee4919f164b6b849b14",
|
||||
"sha256:b66600de16702b9dfa74bea34524b55183a2183e5fd92f20fe6c2fcae550a64c",
|
||||
"sha256:ba6ee18694d3673796b7a31b7d21254e87e9e43ca5be56f323fd396111255315",
|
||||
"sha256:bd03837da28293baa39bdfc3cada69e2f8807f423ae06168aa28d2b32c63a6b6",
|
||||
"sha256:bd2192070f88c0778ae1d68a0980fdece3473498c1db37f3794e3454f91e3ecf",
|
||||
"sha256:c1f6f1a3cc013012cd1da913c40b13e6d721046a8c8a0ea0cde94069645a75db",
|
||||
"sha256:ce10a8e7e067bde3c1fbf494d2b8859db510206030b0b67bc3af90b0eb1887b9",
|
||||
"sha256:d31386d208303a5a6cf0819ef9f6db6680bab9e4ca8e48adb3d4b26ead89beb7",
|
||||
"sha256:d83b3af53b201970973c5574b39df226746194063bb248a53fd12b470ac34319",
|
||||
"sha256:df9657b212c054ac6d803290d7c4bcd7790af0b725984fce1eeb0a1e3f2d9798",
|
||||
"sha256:e576e5fd3f129e6b3595dc734ac7f2b8c548f19ef07781194bc538dc9c0cdbbc",
|
||||
"sha256:e7400358558094c1bcedc75f3b3c4f400c53130b44833848890a99968dee6a64",
|
||||
"sha256:eb6a385f8577d30e4cb43dd555fb134ddaae1edeb84205e09dabec332bf49fd0",
|
||||
"sha256:f27f0875e0873f6bf5df09a456bfcac0667824cabac4cad30b43f36e0382ffe7",
|
||||
"sha256:fcd4a6d04995f1d66bc78b503e4e59ae72fd32aaec4f661657fe5ae5c1aa4ce3"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==1.1.2"
|
||||
"markers": "python_version >= '3' and platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32')))))",
|
||||
"version": "==2.0.0a2"
|
||||
},
|
||||
"gspread": {
|
||||
"hashes": [
|
||||
@@ -416,11 +393,11 @@
|
||||
},
|
||||
"loguru": {
|
||||
"hashes": [
|
||||
"sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c",
|
||||
"sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3"
|
||||
"sha256:b28e72ac7a98be3d28ad28570299a393dfcd32e5e3f6a353dec94675767b6319",
|
||||
"sha256:f8087ac396b5ee5f67c963b495d615ebbceac2796379599820e324419d53667c"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.6.0"
|
||||
"version": "==0.5.3"
|
||||
},
|
||||
"lxml": {
|
||||
"hashes": [
|
||||
@@ -895,9 +872,7 @@
|
||||
"version": "==2022.3.2"
|
||||
},
|
||||
"requests": {
|
||||
"extras": [
|
||||
"socks"
|
||||
],
|
||||
"extras": [],
|
||||
"hashes": [
|
||||
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
|
||||
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
|
||||
@@ -918,7 +893,7 @@
|
||||
"sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
|
||||
"sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"markers": "python_version >= '3.6' and python_version < '4'",
|
||||
"version": "==4.8"
|
||||
},
|
||||
"s3transfer": {
|
||||
@@ -951,44 +926,45 @@
|
||||
},
|
||||
"sqlalchemy": {
|
||||
"hashes": [
|
||||
"sha256:04164e0063feb7aedd9d073db0fd496edb244be40d46ea1f0d8990815e4b8c34",
|
||||
"sha256:159c2f69dd6efd28e894f261ffca1100690f28210f34cfcd70b895e0ea7a64f3",
|
||||
"sha256:199dc6d0068753b6a8c0bd3aceb86a3e782df118260ebc1fa981ea31ee054674",
|
||||
"sha256:1bbac3e8293b34c4403d297e21e8f10d2a57756b75cff101dc62186adec725f5",
|
||||
"sha256:20e9eba7fd86ef52e0df25bea83b8b518dfdf0bce09b336cfe51671f52aaaa3f",
|
||||
"sha256:290cbdf19129ae520d4bdce392648c6fcdbee763bc8f750b53a5ab51880cb9c9",
|
||||
"sha256:316270e5867566376e69a0ac738b863d41396e2b63274616817e1d34156dff0e",
|
||||
"sha256:3f88a4ee192142eeed3fe173f673ea6ab1f5a863810a9d85dbf6c67a9bd08f97",
|
||||
"sha256:4aa96e957141006181ca58e792e900ee511085b8dae06c2d08c00f108280fb8a",
|
||||
"sha256:4b2bcab3a914715d332ca783e9bda13bc570d8b9ef087563210ba63082c18c16",
|
||||
"sha256:576684771456d02e24078047c2567025f2011977aa342063468577d94e194b00",
|
||||
"sha256:5a2e73508f939175363d8a4be9dcdc84cf16a92578d7fa86e6e4ca0e6b3667b2",
|
||||
"sha256:5ba59761c19b800bc2e1c9324da04d35ef51e4ee9621ff37534bc2290d258f71",
|
||||
"sha256:5dc9801ae9884e822ba942ca493642fb50f049c06b6dbe3178691fce48ceb089",
|
||||
"sha256:6fdd2dc5931daab778c2b65b03df6ae68376e028a3098eb624d0909d999885bc",
|
||||
"sha256:708973b5d9e1e441188124aaf13c121e5b03b6054c2df59b32219175a25aa13e",
|
||||
"sha256:7ff72b3cc9242d1a1c9b84bd945907bf174d74fc2519efe6184d6390a8df478b",
|
||||
"sha256:8679f9aba5ac22e7bce54ccd8a77641d3aea3e2d96e73e4356c887ebf8ff1082",
|
||||
"sha256:8b9a395122770a6f08ebfd0321546d7379f43505882c7419d7886856a07caa13",
|
||||
"sha256:8e1e5d96b744a4f91163290b01045430f3f32579e46d87282449e5b14d27d4ac",
|
||||
"sha256:9a0195af6b9050c9322a97cf07514f66fe511968e623ca87b2df5e3cf6349615",
|
||||
"sha256:9cb5698c896fa72f88e7ef04ef62572faf56809093180771d9be8d9f2e264a13",
|
||||
"sha256:b3f1d9b3aa09ab9adc7f8c4b40fc3e081eb903054c9a6f9ae1633fe15ae503b4",
|
||||
"sha256:bb42f9b259c33662c6a9b866012f6908a91731a419e69304e1261ba3ab87b8d1",
|
||||
"sha256:bca714d831e5b8860c3ab134c93aec63d1a4f493bed20084f54e3ce9f0a3bf99",
|
||||
"sha256:bedd89c34ab62565d44745212814e4b57ef1c24ad4af9b29c504ce40f0dc6558",
|
||||
"sha256:bfec934aac7f9fa95fc82147a4ba5db0a8bdc4ebf1e33b585ab8860beb10232f",
|
||||
"sha256:c7046f7aa2db445daccc8424f50b47a66c4039c9f058246b43796aa818f8b751",
|
||||
"sha256:d7e483f4791fbda60e23926b098702340504f7684ce7e1fd2c1bf02029288423",
|
||||
"sha256:dd93162615870c976dba43963a24bb418b28448fef584f30755990c134a06a55",
|
||||
"sha256:e4607d2d16330757818c9d6fba322c2e80b4b112ff24295d1343a80b876eb0ed",
|
||||
"sha256:e9a680d9665f88346ed339888781f5236347933906c5a56348abb8261282ec48",
|
||||
"sha256:edfcf93fd92e2f9eef640b3a7a40db20fe3c1d7c2c74faa41424c63dead61b76",
|
||||
"sha256:f7e4a3c0c3c596296b37f8427c467c8e4336dc8d50f8ed38042e8ba79507b2c9",
|
||||
"sha256:fff677fa4522dafb5a5e2c0cf909790d5d367326321aeabc0dffc9047cb235bd"
|
||||
"sha256:045d6a26c262929af0b9cb25441aae675ac04db4ea8bd2446b355617cd6b6b7d",
|
||||
"sha256:07f4dab2deb6d34618a2ccfff3971a85923ad7c3a9a45401818870fc51d3f0cc",
|
||||
"sha256:08aaad905aba8940f27aeb9f1f851bf63f18ef97b0062ca41f64afc4b64e0e8c",
|
||||
"sha256:27a42894a2751e438eaed12fc0dcfe741ff2f66c14760d081222c5adc5460064",
|
||||
"sha256:2a3e4dc7c452ba3c0f3175ad5a8e0ba49c2b0570a8d07272cf50844c8d78e74f",
|
||||
"sha256:345306707bb0e51e7cd6e7573adafbce018894ee5e3b9c31134545f704936db0",
|
||||
"sha256:36f08d94670315ca04c8139bd80b3e02b9dd9cc66fc11bcb96fd10ad51a051ab",
|
||||
"sha256:3ebb97ed96f4506e2f212e1fcf0ec07a103bb194938627660a5acb4d9feae49c",
|
||||
"sha256:40b995d7aeeb6f88a1927ce6692c0f626b59d8effd3e1d597f125e141707b37c",
|
||||
"sha256:4414ace6e3a5e39523e55a5d9f3b215699b2ead4ff91fca98f1b659b7ab2d92a",
|
||||
"sha256:50107d8183da3fbe5715957aa3954cd9d82aed555c5b4d3fd37fac861af422fa",
|
||||
"sha256:50174e173d03209c34e07e7b57cca48d0082ac2390edf927aafc706c111da11e",
|
||||
"sha256:5e88912bf192e7b5739c446d2276e1cba74cfa6c1c93eea2b2534404f6be1dbd",
|
||||
"sha256:621d3f6c0ba2407bb97e82b649be5ca7d5b6c201dcfb964ce13f517bf1cb6305",
|
||||
"sha256:623bac2d6bdca3f3e61cf1e1c466c5fb9f5cf08735736ee1111187b7a4108891",
|
||||
"sha256:671f61c3db4595b0e86cc4b30f675a7c0206d9ce99f041b4f6761c7ddd1e0074",
|
||||
"sha256:67c1c27c48875afc950bee5ee24582794f20b545e64e4f9ca94071a9b514d6ed",
|
||||
"sha256:6a6cfd468f54d65324fd3847cfd0148b0610efa6a43e5f5fcc89f455696ae9e7",
|
||||
"sha256:70048a83f0a1ece1fcd7189891c888e20af2c57fbd33eb760d8cece9843b896c",
|
||||
"sha256:7ee14a7f9f76d1ef9d5e5b760c9252617c839b87eee04d1ce8325ac66ae155c4",
|
||||
"sha256:804cf491437f3e4ce31247ab4b309b181f06ecc97d309b746d10f09439b4eb85",
|
||||
"sha256:878c7beaafa365602762c19f638282e1885454fed1aed86f8fae038933c7c671",
|
||||
"sha256:954ea8c527c4322afb6885944904714893af81fe9167e421273770991bf08a4a",
|
||||
"sha256:a47bf6b7ca6c28e4f4e262fabcf5be6b907af81be36de77839c9eeda2cdf3bb3",
|
||||
"sha256:a4fb5c6ee84a6bba4ff6f9f5379f0b3a0ffe9de7ba5a0945659b3da8d519709b",
|
||||
"sha256:b34bbc683789559f1bc9bb685fc162e0956dbbdfbe2fbd6755a9f5982c113610",
|
||||
"sha256:c025d45318b73c0601cca451532556cbab532b2742839ebb8cb58f9ebf06811e",
|
||||
"sha256:c3ad7f5b61ba014f5045912aea15b03c473bb02b1c07fd92c9d2c794fa183276",
|
||||
"sha256:c9218e3519398129e364121e0d89823e6ba2a2b77c28bfc661face0829c41433",
|
||||
"sha256:cd5cffd1dd753828f1069f33062f3896e51c990acd957c264f40e051b3e19887",
|
||||
"sha256:d8efcaa709ea8e7c08c3d3e7639c39b36083f5a995f397f9e6eedf5f5e4e4946",
|
||||
"sha256:e297a5cc625e3f1367a82deedf2d48ee4d2b2bd263b8b8d2efbaaf5608b5229e",
|
||||
"sha256:e67278ceb63270cdac0a7b89fc3c29a56f7dac9616a7ee48e7ad6b52e3b631e5",
|
||||
"sha256:eb6558ba07409dafa18c793c34292b3265be455904966f0724c10198829477e3",
|
||||
"sha256:f197c66663ed0f9e1178d51141d864688fb244a83f6b17f667d521e482537b2e",
|
||||
"sha256:f47996b1810894f766c9ee689607077c6c0e0fd6761e04c12ba13efb56d50c1d"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.4.32"
|
||||
"version": "==1.4.34"
|
||||
},
|
||||
"telethon": {
|
||||
"hashes": [
|
||||
@@ -1163,11 +1139,11 @@
|
||||
},
|
||||
"click": {
|
||||
"hashes": [
|
||||
"sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b",
|
||||
"sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976"
|
||||
"sha256:24e1a4a9ec5bf6299411369b208c1df2188d9eb8d916302fe6bf03faed227f1e",
|
||||
"sha256:479707fe14d9ec9a0757618b7a100a0ae4c4e236fac5b7f80ca68028141a1a72"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==8.1.1"
|
||||
"version": "==8.1.2"
|
||||
},
|
||||
"coverage": {
|
||||
"extras": [
|
||||
@@ -1415,9 +1391,7 @@
|
||||
"version": "==2022.1"
|
||||
},
|
||||
"requests": {
|
||||
"extras": [
|
||||
"socks"
|
||||
],
|
||||
"extras": [],
|
||||
"hashes": [
|
||||
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
|
||||
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
|
||||
@@ -1501,7 +1475,7 @@
|
||||
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
|
||||
"sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
|
||||
],
|
||||
"markers": "python_full_version < '3.11.0'",
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.0.1"
|
||||
},
|
||||
"typing-extensions": {
|
||||
|
||||
@@ -235,6 +235,20 @@ class Scraper:
|
||||
return archived_url
|
||||
|
||||
def archive_files(self, result: ScraperResult) -> ScraperResult:
|
||||
"""Archive files corresponding to ``archived_url`` dict keys, if the
|
||||
files have not previously been archived.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
result: ScraperResult
|
||||
Previously scraped ScraperResult run with ``archive_media=False``.
|
||||
|
||||
Returns
|
||||
-------
|
||||
ScraperResult
|
||||
Same ScraperResult as ``result``, but with all URLs in ``archived_url`` dict archived.
|
||||
"""
|
||||
|
||||
for url in result.archived_urls:
|
||||
if result.archived_urls[url] is None:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
@@ -244,7 +258,6 @@ class Scraper:
|
||||
result.media_archived = True
|
||||
return result
|
||||
|
||||
|
||||
def can_handle(self, channel: Channel) -> bool:
|
||||
"""Whether or not the scraper can scrape the specified channel.
|
||||
|
||||
@@ -365,6 +378,10 @@ class ScraperController:
|
||||
else:
|
||||
since = None
|
||||
|
||||
# TODO: Currently, if a channel has not been added to the database (i.e. channel.id is None), the `since` query returns the most recently scraped ScraperResult with channel.id == None, which may belong to a different platform and channel. Consider adding a check to the query logic above that channel.id is not null.
|
||||
if channel.id is None:
|
||||
since = None
|
||||
|
||||
posts = scraper.get_posts(channel, since=since, archive_media=archive_media)
|
||||
|
||||
for post in posts:
|
||||
|
||||
@@ -43,9 +43,12 @@ class BitchuteScraper(Scraper):
|
||||
|
||||
archived_urls = {}
|
||||
|
||||
if archive_media:
|
||||
if 'video_url' in post:
|
||||
url = post['video_url']
|
||||
if 'video_url' in post:
|
||||
url = post['video_url']
|
||||
archived_urls[url] = None
|
||||
|
||||
if archive_media:
|
||||
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
@@ -112,6 +115,7 @@ class BitchuteScraper(Scraper):
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
def strip_tags(html, convert_newlines=True):
|
||||
|
||||
@@ -50,25 +50,24 @@ class GabScraper(Scraper):
|
||||
if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
|
||||
break
|
||||
|
||||
media_urls = []
|
||||
archived_urls = {}
|
||||
|
||||
if archive_media:
|
||||
|
||||
for attachment in post.get('media_attachments'):
|
||||
for attachment in post.get('media_attachments'):
|
||||
if attachment.get('type') == 'video':
|
||||
archived_urls[attachment['source_mp4']] = None
|
||||
else:
|
||||
archived_urls[attachment['url']] = None
|
||||
|
||||
if post.get('reblog') is not None:
|
||||
for attachment in post['reblog'].get('media_attachments'):
|
||||
if attachment.get('type') == 'video':
|
||||
media_urls.append(attachment['source_mp4'])
|
||||
archived_urls[attachment['source_mp4']] = None
|
||||
else:
|
||||
media_urls.append(attachment['url'])
|
||||
|
||||
if post.get('reblog') is not None:
|
||||
for attachment in post['reblog'].get('media_attachments'):
|
||||
if attachment.get('type') == 'video':
|
||||
media_urls.append(attachment['source_mp4'])
|
||||
else:
|
||||
media_urls.append(attachment['url'])
|
||||
archived_urls[attachment['url']] = None
|
||||
|
||||
for url in media_urls:
|
||||
for url in archived_urls.keys():
|
||||
|
||||
if archive_media:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
@@ -30,26 +30,25 @@ class GettrScraper(Scraper):
|
||||
|
||||
archived_urls = {}
|
||||
|
||||
if archive_media:
|
||||
if 'imgs' in post:
|
||||
for img in post['imgs']:
|
||||
url = "https://media.gettr.com/" + img
|
||||
archived_urls[url] = None
|
||||
|
||||
if 'imgs' in post:
|
||||
for img in post['imgs']:
|
||||
url = "https://media.gettr.com/" + img
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[img] = archived_url
|
||||
if 'main' in post:
|
||||
url = "https://media.gettr.com/" + post['main']
|
||||
archived_urls[url] = None
|
||||
|
||||
if 'main' in post:
|
||||
url = "https://media.gettr.com/" + post['main']
|
||||
if 'ovid' in post:
|
||||
url = "https://media.gettr.com/" + post['ovid']
|
||||
archived_urls[url] = None
|
||||
|
||||
for url in archived_urls.keys():
|
||||
|
||||
if archive_media:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[post['main']] = archived_url
|
||||
|
||||
if 'vid' in post:
|
||||
url = "https://media.gettr.com/" + post['vid']
|
||||
media_blob, content_type, key = self.m3u8_url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[post['vid']] = archived_url
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
@@ -72,7 +71,7 @@ class GettrScraper(Scraper):
|
||||
return key
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
client = client = PublicClient()
|
||||
client = PublicClient()
|
||||
username = self.get_username_from_url(channel.url)
|
||||
profile = client.user_info(username)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from typing import Generator
|
||||
from typing import Generator, List
|
||||
from datetime import datetime, timezone
|
||||
import os
|
||||
import json
|
||||
@@ -49,28 +49,14 @@ class InstagramScraper(Scraper):
|
||||
|
||||
post_url = f'{BASE_URL}p/{post.shortcode}/'
|
||||
|
||||
archived_urls = {}
|
||||
archived_urls = get_archived_urls_from_post(post = post)
|
||||
|
||||
if archive_media:
|
||||
for url in archived_urls.keys():
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
|
||||
loader.download_post(post = post, target = Path(temp_dir))
|
||||
|
||||
files = os.listdir(temp_dir)
|
||||
files = [f for f in files if not f.endswith('.txt')]
|
||||
|
||||
for file in files:
|
||||
ext = file.split('.')[-1]
|
||||
content_type = CONTENT_TYPES[ext]
|
||||
filename = Path(temp_dir, file)
|
||||
key = f'{post.shortcode}__{file}'
|
||||
|
||||
with open(filename, 'rb') as f:
|
||||
blob = f.read()
|
||||
|
||||
archived_url = self.archive_blob(blob = blob, content_type = content_type, key = key)
|
||||
archived_urls[post_url] = archived_url
|
||||
if archive_media:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
@@ -98,7 +84,7 @@ class InstagramScraper(Scraper):
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_posts=json.dumps(comment_dict, default=str),
|
||||
archived_urls={},
|
||||
media_archived=archive_media)
|
||||
media_archived=True)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None:
|
||||
@@ -126,7 +112,20 @@ class InstagramScraper(Scraper):
|
||||
profile['followees'] = user_profile.followees
|
||||
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
def get_archived_urls_from_post(post: "instaloader.Post") -> dict:
    """Collect the media URLs of an Instagram post as an ``archived_urls`` dict.

    Parameters
    ----------
    post: instaloader.Post
        Post whose raw ``_node`` metadata is inspected.

    Returns
    -------
    dict
        Mapping of every media URL found in the post to ``None``, i.e. the
        URL is known but has not been archived yet.

    Raises
    ------
    NotImplementedError
        If the post's ``__typename`` is not ``GraphImage``, ``GraphVideo``
        or ``GraphSidecar``.
    """
    node = post._node
    typename = node['__typename']

    if typename == 'GraphImage':
        urls = [node['display_url']]
    elif typename == 'GraphVideo':
        urls = [node['video_url']]
    elif typename == 'GraphSidecar':
        children = (edge['node'] for edge in node['edge_sidecar_to_children']['edges'])
        # A carousel child may itself be a video; prefer its ``video_url`` when
        # present so the video, not just its thumbnail, is archived.
        urls = [child.get('video_url') or child['display_url'] for child in children]
    else:
        raise NotImplementedError(f'post of type {typename} is currently not supported.')

    return {url: None for url in urls}
|
||||
@@ -36,10 +36,11 @@ class OdyseeScraper(Scraper):
|
||||
if since is not None and datetime.fromtimestamp(video.info['created']) <= since.date:
|
||||
break
|
||||
|
||||
archived_urls = {}
|
||||
url = video.info['streaming_url']
|
||||
|
||||
archived_urls = {url: None}
|
||||
|
||||
if archive_media:
|
||||
url = video.info['streaming_url']
|
||||
|
||||
# Check if file is a video file or an m3u8 file
|
||||
r = requests.head(url)
|
||||
@@ -77,6 +78,21 @@ class OdyseeScraper(Scraper):
|
||||
archived_urls={},
|
||||
media_archived=True)
|
||||
|
||||
def archive_files(self, result: ScraperResult) -> ScraperResult:
    """Archive every URL in ``result.archived_urls`` that has no archived copy yet.

    Parameters
    ----------
    result: ScraperResult
        Result whose ``archived_urls`` dict maps media URLs to their archived
        URL, or to ``None`` when the media has not been archived.

    Returns
    -------
    ScraperResult
        The same ``result`` object, with the ``None`` entries of
        ``archived_urls`` filled in and ``media_archived`` set to ``True``.
    """
    for media_url in result.archived_urls:
        if result.archived_urls[media_url] is not None:
            # Already archived; nothing to do for this URL.
            continue

        # A HEAD request reveals the content type without downloading the
        # media. An HTML content type is handled as an m3u8 stream; anything
        # else is downloaded directly.
        head_response = requests.head(media_url)
        is_m3u8 = head_response.headers['Content-Type'] == 'text/html; charset=utf-8'

        if is_m3u8:
            blob_info = self.m3u8_url_to_blob(media_url)
        else:
            blob_info = self.url_to_blob(media_url)
        media_blob, content_type, key = blob_info

        result.archived_urls[media_url] = self.archive_blob(media_blob, content_type, key)

    result.media_archived = True
    return result
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
@@ -94,7 +110,7 @@ class OdyseeScraper(Scraper):
|
||||
profile = odysee_channel.info
|
||||
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
@@ -19,18 +19,18 @@ class RumbleScraper(Scraper):
|
||||
scraper = get_channel_videos(channel.url)
|
||||
|
||||
for post in scraper:
|
||||
if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
|
||||
if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
|
||||
break
|
||||
|
||||
archived_urls = {}
|
||||
url = post['media_url']
|
||||
|
||||
archived_urls = {url: None}
|
||||
|
||||
if archive_media:
|
||||
|
||||
url = post['media_url']
|
||||
|
||||
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[post['media_url']] = archived_url
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
@@ -48,6 +48,16 @@ class RumbleScraper(Scraper):
|
||||
key = urlparse(url).path.split('/')[-2] + ext
|
||||
return key
|
||||
|
||||
def archive_files(self, result: ScraperResult) -> ScraperResult:
    """Download and archive the media of ``result`` that is still missing.

    Every URL in ``result.archived_urls`` whose value is ``None`` is fetched
    with ``ytdlp_url_to_blob`` and stored with ``archive_blob``; the returned
    archive URL replaces the ``None``. ``result.media_archived`` is then set
    to ``True`` and the same ``result`` object is returned.
    """
    pending_urls = [media_url
                    for media_url, archived in result.archived_urls.items()
                    if archived is None]

    for media_url in pending_urls:
        blob, mime_type, blob_key = self.ytdlp_url_to_blob(media_url)
        result.archived_urls[media_url] = self.archive_blob(blob, mime_type, blob_key)

    result.media_archived = True
    return result
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Rumble" and channel.url is not None:
|
||||
return True
|
||||
@@ -57,10 +67,10 @@ class RumbleScraper(Scraper):
|
||||
profile = get_channel_profile(url = channel.url)
|
||||
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
|
||||
@@ -33,8 +33,8 @@ class TelegramSnscrapeScraper(Scraper):
|
||||
for image_url in post.images:
|
||||
archived_urls[image_url] = None
|
||||
|
||||
if post.video:
|
||||
archived_urls[post.video] = None
|
||||
for video_url in post.videos:
|
||||
archived_urls[video_url] = None
|
||||
|
||||
if archive_media:
|
||||
for url in archived_urls:
|
||||
|
||||
@@ -14,7 +14,7 @@ class TwitterScraper(Scraper):
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
if channel.platform_id:
|
||||
identifier = channel.platform_id
|
||||
identifier = int(channel.platform_id)
|
||||
else:
|
||||
identifier = channel.screenname
|
||||
|
||||
@@ -23,7 +23,7 @@ class TwitterScraper(Scraper):
|
||||
first = True
|
||||
|
||||
for tweet in scraper.get_items():
|
||||
if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
|
||||
if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
|
||||
# with TwitterProfileScraper, the first tweet could be an old pinned tweet
|
||||
if first:
|
||||
first = False
|
||||
@@ -105,7 +105,7 @@ class TwitterScraper(Scraper):
|
||||
raise ChannelDoesNotExistError(channel.url)
|
||||
else:
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(entity.__dict__, default=str),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(entity.__dict__, default=str),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
from datetime import datetime, timezone
|
||||
from typing import Generator
|
||||
from urllib.parse import urlparse
|
||||
import json
|
||||
import re
|
||||
|
||||
from snscrape.modules.vkontakte import VKontakteUserScraper
|
||||
from loguru import logger
|
||||
from yt_dlp.extractor.vk import VKIE
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
@@ -24,7 +28,7 @@ class VkontakteScraper(Scraper):
|
||||
first = True
|
||||
|
||||
for post in scraper.get_items():
|
||||
if since is not None and datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
|
||||
if since is not None and datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
|
||||
# with VKontakteUserScraper, the first post could be an old pinned post
|
||||
if first:
|
||||
first = False
|
||||
@@ -34,23 +38,26 @@ class VkontakteScraper(Scraper):
|
||||
|
||||
archived_urls = {}
|
||||
|
||||
if archive_media:
|
||||
if post.photos:
|
||||
|
||||
if post.photos:
|
||||
for photo in post.photos:
|
||||
variant = max(
|
||||
[v for v in photo.variants], key=lambda v: v.width * v.height)
|
||||
url = variant.url
|
||||
if url is not None:
|
||||
archived_urls[url] = None
|
||||
|
||||
for photo in post.photos:
|
||||
variant = max(
|
||||
[v for v in photo.variants], key=lambda v: v.width * v.height)
|
||||
url = variant.url
|
||||
|
||||
if url is not None:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
if post.video:
|
||||
archived_urls[post.video.url] = None
|
||||
|
||||
if post.video:
|
||||
url = post.video.url
|
||||
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
|
||||
for url in archived_urls.keys():
|
||||
|
||||
if archive_media:
|
||||
if re.match(VKIE._VALID_URL, url):
|
||||
# Uses regex from yt_dlp to verify VK video URL
|
||||
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
|
||||
else:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
@@ -65,6 +72,21 @@ class VkontakteScraper(Scraper):
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
def archive_files(self, result: ScraperResult) -> ScraperResult:
    """Download and archive the media of ``result`` that is still missing.

    Every URL in ``result.archived_urls`` whose value is ``None`` is
    downloaded, passed to ``archive_blob``, and its entry replaced by the
    returned archive URL. ``result.media_archived`` is then set to ``True``
    and the same ``result`` object is returned.
    """
    pending_urls = [media_url
                    for media_url, archived in result.archived_urls.items()
                    if archived is None]

    for media_url in pending_urls:
        # Uses regex from yt_dlp to verify VK video URL
        is_vk_video = re.match(VKIE._VALID_URL, media_url)
        download = self.ytdlp_url_to_blob if is_vk_video else self.url_to_blob

        blob, mime_type, blob_key = download(media_url)
        result.archived_urls[media_url] = self.archive_blob(blob, mime_type, blob_key)

    result.media_archived = True
    return result
|
||||
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Vkontakte" and channel.platform_id:
|
||||
return True
|
||||
@@ -87,7 +109,7 @@ class VkontakteScraper(Scraper):
|
||||
profile = scraper._get_entity().__dict__
|
||||
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -2,7 +2,11 @@ from datetime import datetime, timezone
|
||||
import json
|
||||
from typing import Generator
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
import yt_dlp
|
||||
from loguru import logger
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper import Scraper
|
||||
@@ -46,7 +50,10 @@ class YoutubeScraper(Scraper):
|
||||
|
||||
for video in valid_videos:
|
||||
|
||||
archived_urls = {}
|
||||
url = video['webpage_url']
|
||||
|
||||
archived_urls = {url: None}
|
||||
|
||||
video_id = video["id"]
|
||||
video_ext = video["ext"]
|
||||
|
||||
@@ -54,11 +61,8 @@ class YoutubeScraper(Scraper):
|
||||
|
||||
key = f"{video_id}.{video_ext}"
|
||||
|
||||
with open(f"{temp_dir}/{key}", "rb") as f:
|
||||
with open(Path(temp_dir)/key, "rb") as f:
|
||||
media_blob = f.read()
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
|
||||
url = video['webpage_url']
|
||||
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
@@ -78,6 +82,41 @@ class YoutubeScraper(Scraper):
|
||||
if channel.platform == "Youtube" and channel.url:
|
||||
return True
|
||||
|
||||
def archive_files(self, result: ScraperResult) -> ScraperResult:
    """Download and archive the videos of ``result`` that are still missing.

    Every URL in ``result.archived_urls`` whose value is ``None`` is
    downloaded with yt-dlp into a temporary directory, stored with
    ``archive_blob`` as ``video/mp4``, and its entry replaced by the returned
    archive URL.

    Parameters
    ----------
    result: ScraperResult
        Result whose ``archived_urls`` maps video URLs to their archived URL,
        or to ``None`` when not archived yet.

    Returns
    -------
    ScraperResult
        The same ``result`` object with ``media_archived`` set to ``True``.

    Raises
    ------
    yt_dlp.utils.DownloadError
        Propagated unchanged when yt-dlp fails to download a video.
    """
    for url in result.archived_urls:
        if result.archived_urls[url] is not None:
            continue

        with tempfile.TemporaryDirectory() as temp_dir:
            ydl_opts = {
                "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
                "merge_output_format": "mp4",
                "outtmpl": f"{temp_dir}/%(id)s.%(ext)s"}

            ydl = yt_dlp.YoutubeDL(ydl_opts)
            # ``download`` is documented as taking a list of URLs.
            ydl.download([url])

            files = os.listdir(temp_dir)
            if len(files) != 1:
                logger.warning(f'{len(files)} files downloaded for video: {url}')
            if not files:
                # Nothing to read: leave this URL unarchived instead of
                # failing with an IndexError on ``files[0]``.
                continue

            key = files[0]
            # Read the file before the temporary directory is removed.
            with open(Path(temp_dir, key), 'rb') as f:
                media_blob = f.read()

        content_type = 'video/mp4'
        archived_url = self.archive_blob(media_blob, content_type, key)
        result.archived_urls[url] = archived_url

    # NOTE(review): the flag is set even if a URL above was skipped because
    # no file was downloaded -- confirm whether callers expect a retry.
    result.media_archived = True
    return result
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
ydl_opts = {}
|
||||
ydl = yt_dlp.YoutubeDL(ydl_opts)
|
||||
@@ -87,12 +126,13 @@ class YoutubeScraper(Scraper):
|
||||
meta = ydl.extract_info(
|
||||
channel.url,
|
||||
process=False)
|
||||
meta.pop('entries')
|
||||
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(meta),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(meta),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
except yt_dlp.utils.DownloadError as e:
|
||||
raise e
|
||||
|
||||
@@ -12,10 +12,9 @@ addopts =
|
||||
--html='reports/tests.html'
|
||||
--self-contained-html
|
||||
markers =
|
||||
profile: marks tests for only extracting channel metadata (deselect with '-m
|
||||
"not profile"')
|
||||
media: marks tests for archiving all media attachments (deselect with '-m
|
||||
"not media"')
|
||||
profile: marks tests for only extracting channel metadata (deselect with '-m "not profile"')
|
||||
media: marks tests for archiving all media attachments (deselect with '-m "not media"')
|
||||
unarchived: marks tests for archiving all unarchived media attachments (deselect with '-m "not unarchived"')
|
||||
filterwarnings =
|
||||
ignore:the imp module is deprecated:DeprecationWarning
|
||||
ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
|
||||
|
||||
@@ -99,12 +99,12 @@ RUMBLE_CHANNEL_KWARGS = {
|
||||
'notes': ''}
|
||||
|
||||
TELEGRAM_CHANNEL_KWARGS = {
|
||||
'name': 'USA Freedom Convoy (test)',
|
||||
'platform_id': -1001799578085,
|
||||
'name': 'South West Ohio Proud Boys (test)',
|
||||
'platform_id': -1001276612436,
|
||||
'category': 'test',
|
||||
'platform': 'Telegram',
|
||||
'url': 'https://t.me/usafreedomconvoy2022',
|
||||
'screenname': 'usafreedomconvoy2022',
|
||||
'url': 'https://t.me/SouthwestOhioPB',
|
||||
'screenname': 'SouthwestOhioPB',
|
||||
'country': 'US',
|
||||
'influencer': None,
|
||||
'public': True,
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import BitchuteScraper
|
||||
|
||||
@pytest.mark.unarchived
def test_scrape_bitchute_channel_no_media(controller, channel_kwargs):
    """Scrape the Bitchute test channel with ``archive_media=False``."""
    bitchute_channel = Channel(**channel_kwargs['bitchute'])
    controller.register_scraper(scraper=BitchuteScraper())
    controller.scrape_channels(channels=[bitchute_channel], archive_media=False)
|
||||
|
||||
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_bitchute_channel_unarchived_media(controller):
    """Run the controller's ``archive_unarchived_media`` pass."""
    controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_bitchute_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import GabScraper
|
||||
|
||||
@pytest.mark.unarchived
def test_scrape_gab_channel_no_media(controller, channel_kwargs):
    """Scrape the Gab test channel with ``archive_media=False``."""
    gab_channel = Channel(**channel_kwargs['gab'])
    controller.register_scraper(scraper=GabScraper())
    controller.scrape_channels(channels=[gab_channel], archive_media=False)
|
||||
|
||||
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_gab_channel_unarchived_media(controller):
    """Run the controller's ``archive_unarchived_media`` pass."""
    controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_gab_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import GettrScraper
|
||||
|
||||
@pytest.mark.unarchived
def test_scrape_gettr_channel_no_media(controller, channel_kwargs):
    """Scrape the Gettr test channel with ``archive_media=False``."""
    gettr_channel = Channel(**channel_kwargs['gettr'])
    controller.register_scraper(scraper=GettrScraper())
    controller.scrape_channels(channels=[gettr_channel], archive_media=False)
|
||||
|
||||
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_gettr_channel_unarchived_media(controller):
    """Run the controller's ``archive_unarchived_media`` pass."""
    controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_gettr_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import InstagramScraper
|
||||
|
||||
@pytest.mark.unarchived
def test_scrape_instagram_channel_no_media(controller, channel_kwargs):
    """Scrape the Instagram test channel with ``archive_media=False``."""
    instagram_channel = Channel(**channel_kwargs['instagram'])
    controller.register_scraper(scraper=InstagramScraper())
    controller.scrape_channels(channels=[instagram_channel], archive_media=False)
|
||||
|
||||
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_instagram_channel_unarchived_media(controller):
    """Run the controller's ``archive_unarchived_media`` pass."""
    controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_instagram_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import OdyseeScraper
|
||||
|
||||
@pytest.mark.unarchived
def test_scrape_odysee_channel_no_media(controller, channel_kwargs):
    """Scrape the Odysee test channel with ``archive_media=False``."""
    odysee_channel = Channel(**channel_kwargs['odysee'])
    controller.register_scraper(scraper=OdyseeScraper())
    controller.scrape_channels(channels=[odysee_channel], archive_media=False)
|
||||
|
||||
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_odysee_channel_unarchived_media(controller):
    """Run the controller's ``archive_unarchived_media`` pass."""
    controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_odysee_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import RumbleScraper
|
||||
|
||||
@pytest.mark.unarchived
def test_scrape_rumble_channel_no_media(controller, channel_kwargs):
    """Scrape the Rumble test channel with ``archive_media=False``."""
    rumble_channel = Channel(**channel_kwargs['rumble'])
    controller.register_scraper(scraper=RumbleScraper())
    controller.scrape_channels(channels=[rumble_channel], archive_media=False)
|
||||
|
||||
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_rumble_channel_unarchived_media(controller):
    """Run the controller's ``archive_unarchived_media`` pass."""
    controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_rumble_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import TelegramSnscrapeScraper
|
||||
|
||||
@pytest.mark.unarchived
def test_scrape_telegram_snscrape_channel_no_media(controller, channel_kwargs):
    """Scrape the Telegram test channel (snscrape) with ``archive_media=False``."""
    telegram_channel = Channel(**channel_kwargs['telegram'])
    controller.register_scraper(scraper=TelegramSnscrapeScraper())
    controller.scrape_channels(channels=[telegram_channel], archive_media=False)
|
||||
|
||||
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_telegram_snscrape_channel_unarchived_media(controller):
    """Run the controller's ``archive_unarchived_media`` pass."""
    controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_telegram_snscrape_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import TelegramTelethonScraper
|
||||
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
|
||||
controller.remove_all_scrapers()
|
||||
|
||||
@@ -10,6 +11,12 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
|
||||
controller.register_scraper(scraper = TelegramTelethonScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = False)
|
||||
|
||||
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_telegram_telethon_unarchived_media(controller):
    """Run the controller's ``archive_unarchived_media`` pass."""
    controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import TwitterScraper
|
||||
|
||||
@pytest.mark.unarchived
def test_scrape_twitter_channel_no_media(controller, channel_kwargs):
    """Scrape the Twitter test channel with ``archive_media=False``."""
    twitter_channel = Channel(**channel_kwargs['twitter'])
    controller.register_scraper(scraper=TwitterScraper())
    controller.scrape_channels(channels=[twitter_channel], archive_media=False)
|
||||
|
||||
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_twitter_channel_unarchived_media(controller):
    """Run the controller's ``archive_unarchived_media`` pass."""
    controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_twitter_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import VkontakteScraper
|
||||
|
||||
@pytest.mark.unarchived
def test_scrape_vkontakte_channel_no_media(controller, channel_kwargs):
    """Scrape the Vkontakte test channel with ``archive_media=False``."""
    vkontakte_channel = Channel(**channel_kwargs['vkontakte'])
    controller.register_scraper(scraper=VkontakteScraper())
    controller.scrape_channels(channels=[vkontakte_channel], archive_media=False)
|
||||
|
||||
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_vkontakte_channel_unarchived_media(controller):
    """Run the controller's ``archive_unarchived_media`` pass."""
    controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_vkontakte_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import YoutubeScraper
|
||||
|
||||
@pytest.mark.unarchived
def test_scrape_youtube_channel_no_media(controller, channel_kwargs):
    """Scrape the Youtube test channel with ``archive_media=False``."""
    youtube_channel = Channel(**channel_kwargs['youtube'])
    controller.register_scraper(scraper=YoutubeScraper())
    controller.scrape_channels(channels=[youtube_channel], archive_media=False)
|
||||
|
||||
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_youtube_channel_unarchived_media(controller):
    """Run the controller's ``archive_unarchived_media`` pass."""
    controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_youtube_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
Reference in New Issue
Block a user