Implemented deferred media archiving for all scrapers and added tests for it. Refactored the archiving methods of the Instagram and Gettr scrapers so that they can use the default archiving method.

This commit is contained in:
Tristan Lee
2022-04-01 01:30:49 -05:00
parent 16aad4ef2c
commit 282f33eff3
26 changed files with 417 additions and 261 deletions

View File

@@ -36,7 +36,7 @@ sphinx = "*"
sphinx_rtd_theme = "*"
[requires]
python_version = "3.8"
python_version = "3.9"
[pipenv]
allow_prereleases = true

254
Pipfile.lock generated
View File

@@ -1,11 +1,11 @@
{
"_meta": {
"hash": {
"sha256": "e57f79178ac0e05f9753a29f97e08d2ae96b7775044bb4c6ba616baae1d21183"
"sha256": "b9fc02f3ecaa2199480c4fcba30f02780860dfbc2e10c026889c78f639709fb4"
},
"pipfile-spec": 6,
"requires": {
"python_version": "3.8"
"python_version": "3.9"
},
"sources": [
{
@@ -16,28 +16,6 @@
]
},
"default": {
"backports.zoneinfo": {
"hashes": [
"sha256:17746bd546106fa389c51dbea67c8b7c8f0d14b5526a579ca6ccf5ed72c526cf",
"sha256:1b13e654a55cd45672cb54ed12148cd33628f672548f373963b0bff67b217328",
"sha256:1c5742112073a563c81f786e77514969acb58649bcdf6cdf0b4ed31a348d4546",
"sha256:4a0f800587060bf8880f954dbef70de6c11bbe59c673c3d818921f042f9954a6",
"sha256:5c144945a7752ca544b4b78c8c41544cdfaf9786f25fe5ffb10e838e19a27570",
"sha256:7b0a64cda4145548fed9efc10322770f929b944ce5cee6c0dfe0c87bf4c0c8c9",
"sha256:8439c030a11780786a2002261569bdf362264f605dfa4d65090b64b05c9f79a7",
"sha256:8961c0f32cd0336fb8e8ead11a1f8cd99ec07145ec2931122faaac1c8f7fd987",
"sha256:89a48c0d158a3cc3f654da4c2de1ceba85263fafb861b98b59040a5086259722",
"sha256:a76b38c52400b762e48131494ba26be363491ac4f9a04c1b7e92483d169f6582",
"sha256:da6013fd84a690242c310d77ddb8441a559e9cb3d3d59ebac9aca1a57b2e18bc",
"sha256:e55b384612d93be96506932a786bbcde5a2db7a9e6a4bb4bffe8b733f5b9036b",
"sha256:e81b76cace8eda1fca50e345242ba977f9be6ae3945af8d46326d776b4cf78d1",
"sha256:e8236383a20872c0cdf5a62b554b27538db7fa1bbec52429d8d106effbaeca08",
"sha256:f04e857b59d9d1ccc39ce2da1021d196e47234873820cbeaad210724b1ee28ac",
"sha256:fadbfe37f74051d024037f223b8e001611eac868b5c5b06144ef4d8b799862f2"
],
"markers": "python_version >= '3.6' and python_version < '3.9'",
"version": "==0.2.1"
},
"beautifulsoup4": {
"hashes": [
"sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf",
@@ -48,19 +26,19 @@
},
"boto3": {
"hashes": [
"sha256:ef210f8e85cdb6d26a38ebad1cfe9cefdef2ab269207e5987653555375a7ef6b",
"sha256:f0af8f4ef5fe6353c794cd3cce627d469a618b58ace7ca75a63cfd719df615ce"
"sha256:35f68b60652bff50e7bc926238443cb578f29f120908bb945e5640e90c6dd53e",
"sha256:7f3f93ee97215862ccd1a216f37deb7d64055c71f826b821805904df7b84ee6a"
],
"index": "pypi",
"version": "==1.21.30"
"version": "==1.21.31"
},
"botocore": {
"hashes": [
"sha256:af4bdc51eeecbe9fdcdadbed9ad58c5c91380ef30f3560022bbc2ee1d78f0ad6",
"sha256:c622751093e3d0bf61343e66d6d06190ef30bf42b1557d5070ca84e9efa06d4b"
"sha256:3bb21e3ee5e4de3ed76bb99b4496a46e9b5c82e7b7fdb62702f11dda1b57b769",
"sha256:424fd94bef86a11f5340dc15eb50602dedec2ecc01c3a25c4fea23a2c8195500"
],
"markers": "python_version >= '3.6'",
"version": "==1.24.30"
"version": "==1.24.31"
},
"brotli": {
"hashes": [
@@ -217,11 +195,11 @@
},
"click": {
"hashes": [
"sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b",
"sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976"
"sha256:24e1a4a9ec5bf6299411369b208c1df2188d9eb8d916302fe6bf03faed227f1e",
"sha256:479707fe14d9ec9a0757618b7a100a0ae4c4e236fac5b7f80ca68028141a1a72"
],
"markers": "python_version >= '3.7'",
"version": "==8.1.1"
"version": "==8.1.2"
},
"cryptg": {
"hashes": [
@@ -324,64 +302,63 @@
},
"greenlet": {
"hashes": [
"sha256:0051c6f1f27cb756ffc0ffbac7d2cd48cb0362ac1736871399a739b2885134d3",
"sha256:00e44c8afdbe5467e4f7b5851be223be68adb4272f44696ee71fe46b7036a711",
"sha256:013d61294b6cd8fe3242932c1c5e36e5d1db2c8afb58606c5a67efce62c1f5fd",
"sha256:049fe7579230e44daef03a259faa24511d10ebfa44f69411d99e6a184fe68073",
"sha256:14d4f3cd4e8b524ae9b8aa567858beed70c392fdec26dbdb0a8a418392e71708",
"sha256:166eac03e48784a6a6e0e5f041cfebb1ab400b394db188c48b3a84737f505b67",
"sha256:17ff94e7a83aa8671a25bf5b59326ec26da379ace2ebc4411d690d80a7fbcf23",
"sha256:1e12bdc622676ce47ae9abbf455c189e442afdde8818d9da983085df6312e7a1",
"sha256:21915eb821a6b3d9d8eefdaf57d6c345b970ad722f856cd71739493ce003ad08",
"sha256:288c6a76705dc54fba69fbcb59904ae4ad768b4c768839b8ca5fdadec6dd8cfd",
"sha256:2bde6792f313f4e918caabc46532aa64aa27a0db05d75b20edfc5c6f46479de2",
"sha256:32ca72bbc673adbcfecb935bb3fb1b74e663d10a4b241aaa2f5a75fe1d1f90aa",
"sha256:356b3576ad078c89a6107caa9c50cc14e98e3a6c4874a37c3e0273e4baf33de8",
"sha256:40b951f601af999a8bf2ce8c71e8aaa4e8c6f78ff8afae7b808aae2dc50d4c40",
"sha256:572e1787d1460da79590bf44304abbc0a2da944ea64ec549188fa84d89bba7ab",
"sha256:58df5c2a0e293bf665a51f8a100d3e9956febfbf1d9aaf8c0677cf70218910c6",
"sha256:64e6175c2e53195278d7388c454e0b30997573f3f4bd63697f88d855f7a6a1fc",
"sha256:7227b47e73dedaa513cdebb98469705ef0d66eb5a1250144468e9c3097d6b59b",
"sha256:7418b6bfc7fe3331541b84bb2141c9baf1ec7132a7ecd9f375912eca810e714e",
"sha256:7cbd7574ce8e138bda9df4efc6bf2ab8572c9aff640d8ecfece1b006b68da963",
"sha256:7ff61ff178250f9bb3cd89752df0f1dd0e27316a8bd1465351652b1b4a4cdfd3",
"sha256:833e1551925ed51e6b44c800e71e77dacd7e49181fdc9ac9a0bf3714d515785d",
"sha256:8639cadfda96737427330a094476d4c7a56ac03de7265622fcf4cfe57c8ae18d",
"sha256:8c5d5b35f789a030ebb95bff352f1d27a93d81069f2adb3182d99882e095cefe",
"sha256:8c790abda465726cfb8bb08bd4ca9a5d0a7bd77c7ac1ca1b839ad823b948ea28",
"sha256:8d2f1fb53a421b410751887eb4ff21386d119ef9cde3797bf5e7ed49fb51a3b3",
"sha256:903bbd302a2378f984aef528f76d4c9b1748f318fe1294961c072bdc7f2ffa3e",
"sha256:93f81b134a165cc17123626ab8da2e30c0455441d4ab5576eed73a64c025b25c",
"sha256:95e69877983ea39b7303570fa6760f81a3eec23d0e3ab2021b7144b94d06202d",
"sha256:9633b3034d3d901f0a46b7939f8c4d64427dfba6bbc5a36b1a67364cf148a1b0",
"sha256:97e5306482182170ade15c4b0d8386ded995a07d7cc2ca8f27958d34d6736497",
"sha256:9f3cba480d3deb69f6ee2c1825060177a22c7826431458c697df88e6aeb3caee",
"sha256:aa5b467f15e78b82257319aebc78dd2915e4c1436c3c0d1ad6f53e47ba6e2713",
"sha256:abb7a75ed8b968f3061327c433a0fbd17b729947b400747c334a9c29a9af6c58",
"sha256:aec52725173bd3a7b56fe91bc56eccb26fbdff1386ef123abb63c84c5b43b63a",
"sha256:b11548073a2213d950c3f671aa88e6f83cda6e2fb97a8b6317b1b5b33d850e06",
"sha256:b1692f7d6bc45e3200844be0dba153612103db241691088626a33ff1f24a0d88",
"sha256:b336501a05e13b616ef81ce329c0e09ac5ed8c732d9ba7e3e983fcc1a9e86965",
"sha256:b8c008de9d0daba7b6666aa5bbfdc23dcd78cafc33997c9b7741ff6353bafb7f",
"sha256:b92e29e58bef6d9cfd340c72b04d74c4b4e9f70c9fa7c78b674d1fec18896dc4",
"sha256:be5f425ff1f5f4b3c1e33ad64ab994eed12fc284a6ea71c5243fd564502ecbe5",
"sha256:dd0b1e9e891f69e7675ba5c92e28b90eaa045f6ab134ffe70b52e948aa175b3c",
"sha256:e30f5ea4ae2346e62cedde8794a56858a67b878dd79f7df76a0767e356b1744a",
"sha256:e6a36bb9474218c7a5b27ae476035497a6990e21d04c279884eb10d9b290f1b1",
"sha256:e859fcb4cbe93504ea18008d1df98dee4f7766db66c435e4882ab35cf70cac43",
"sha256:eb6ea6da4c787111adf40f697b4e58732ee0942b5d3bd8f435277643329ba627",
"sha256:ec8c433b3ab0419100bd45b47c9c8551248a5aee30ca5e9d399a0b57ac04651b",
"sha256:eff9d20417ff9dcb0d25e2defc2574d10b491bf2e693b4e491914738b7908168",
"sha256:f0214eb2a23b85528310dad848ad2ac58e735612929c8072f6093f3585fd342d",
"sha256:f276df9830dba7a333544bd41070e8175762a7ac20350786b322b714b0e654f5",
"sha256:f3acda1924472472ddd60c29e5b9db0cec629fbe3c5c5accb74d6d6d14773478",
"sha256:f70a9e237bb792c7cc7e44c531fd48f5897961701cdaa06cf22fc14965c496cf",
"sha256:f9d29ca8a77117315101425ec7ec2a47a22ccf59f5593378fc4077ac5b754fce",
"sha256:fa877ca7f6b48054f847b61d6fa7bed5cebb663ebc55e018fda12db09dcc664c",
"sha256:fdcec0b8399108577ec290f55551d926d9a1fa6cad45882093a7a07ac5ec147b"
"sha256:004aed447382d80a56ecc354a6d807f305e6c808714ce6ccbca4839c94fae81d",
"sha256:068d68fad6bd623e29a2d36e74538c9b9d6dc6464931cd27d93da6cfc6a7f242",
"sha256:06fd4075754009c9817c6b4e1dc0af4616de52757b6ca973a81c3c1aadc28257",
"sha256:1004cb542451814b12a4f38e835a47734e2b2c683acbf463d5ae76282a3974cf",
"sha256:10c358633a8b27bfc32d27114ef2ca2ddc9f1f89f1643d1157b85e1fdd695315",
"sha256:115bc25fefbdc692c4483e9ddb9011ccd0251590ed59dbfff0f4eb7050bf99c4",
"sha256:1d987a2579336792f73ae6b106c2f087e32afc8573fbf9566f123ac6d8cfb72f",
"sha256:2128d727fd1e8afba8e68feb2cdcf88c90163b69ddc9707722a3e491c5280720",
"sha256:230132c241fe284f93f2e7b3969e9b22bbd76ef98cf93e382c945d378907f5a4",
"sha256:23558f7bd08a663386c032ab8d302d613d2d02ae0c9758ad410bab6035b58d3d",
"sha256:255d520d3e4a5f16883b182e1a94219fe455ab4f50aaaf534bfd6d64ee728397",
"sha256:2a6bc19a728f6f643cfc89b876159a1a25a8f7d8700c013d48a73691f80b4550",
"sha256:379bed346ef8ba0a0e698b3c5975a44d15dd4a5bbff40bbd7fd548b445d5550b",
"sha256:3b12d0866759db93b0a893b4e50a7d7d1681519d2346c26695bb8bb2c652230e",
"sha256:40d491944f69e350e1e8b25f6ca49459824ede1678ec0cd4b5541f41edc06614",
"sha256:471484c7b9d7b7867263051aa81cdeed6e06b455e629a7f05eb91a6cb8bd0836",
"sha256:488c557080557bc01aabb3e1bda7225c68455b853733a8652857ac0d810dad1b",
"sha256:49c2e76e7aa81ba889b3c183e2341af3cc6161ee38852085110ae49d5b5d9a40",
"sha256:52d13ec90236e5935ed6da044e78faa1371d5116cc43fe6d7ca8994dd619ef96",
"sha256:57898c69a253d81f487787bdd538629fabd671fab8a9e31b041ca30965fd9556",
"sha256:5d577eef5beb5730ef01ab39983eb852a97c359b7a546809adf70c409f4b2ecc",
"sha256:6a41987c1474c9158a0c0c96611530a8f299bc547d35bee8add981b8b2534f74",
"sha256:6ae67b7df8db3626af8e042e9c6949cfa27d1a3bbbfdff29e45b72bb6673a650",
"sha256:6c42c27e9d12e8a481aff469ffe8dd4ce0484c354a418470960f760f6ae41e7c",
"sha256:6c4a90c9f6128b4d0905a89930bd325e0491574e5cb453f606bb7094a3197587",
"sha256:6e64518e5833ac2d9359b6d9bd4df2c0cf441a0f3a4eca9e735fbea99009fa70",
"sha256:6fd3a270c23c5b42d86a9c7c6b0229f23ee4a7a4cabdaaa1693ad7a0982d13cb",
"sha256:70db73351e0fcf11a76288c47a0469d9a330bcb2e7618c5eb57432b8caa82403",
"sha256:771f401692046845626cbdf1dd0f04e999413ede0ee9ad39033fe30b5fa2e845",
"sha256:7935026ec61b967cbc6b746c0ca75c1651ea118d7fee4d259cff9e6866153374",
"sha256:7b76b1cac9baac1980210e29145800954e7b42e91ef69c4d695de1cab87ce41f",
"sha256:7e3f37c11b6699b1a1e0fcc0e88829dba4f2866546381b05ab8b3f4db645a823",
"sha256:8370fa65ad421484894f559055f951843754153b72b9bca2ebdc5288efe2e3f0",
"sha256:8ae9c443d44a4e23252632e4d7775f419f992d0df3eff923e23775f5cc551d39",
"sha256:8b31d85f2781e44f1ffaaf7ea07f484e7d42317c677c355fa77b4a1a4bea7394",
"sha256:8b450336b27f3b375cadc474c6704838eaa8dd3ca312aac3bb69d92264a8e638",
"sha256:9ce84357388a76d886febff4e50e321c212ffd3248b590960b2da6e02404a5c9",
"sha256:a23e986fb0ba8e7407286add41fa0d4207be44e3dce1b04789f4757800eca1cf",
"sha256:a81610ee00d0da9cd2c8679479b7791149365b6dfb3971b01b22ee29b04787ce",
"sha256:b4e40444975e5ab0ed3004369209c39a28e084951daaeee4919f164b6b849b14",
"sha256:b66600de16702b9dfa74bea34524b55183a2183e5fd92f20fe6c2fcae550a64c",
"sha256:ba6ee18694d3673796b7a31b7d21254e87e9e43ca5be56f323fd396111255315",
"sha256:bd03837da28293baa39bdfc3cada69e2f8807f423ae06168aa28d2b32c63a6b6",
"sha256:bd2192070f88c0778ae1d68a0980fdece3473498c1db37f3794e3454f91e3ecf",
"sha256:c1f6f1a3cc013012cd1da913c40b13e6d721046a8c8a0ea0cde94069645a75db",
"sha256:ce10a8e7e067bde3c1fbf494d2b8859db510206030b0b67bc3af90b0eb1887b9",
"sha256:d31386d208303a5a6cf0819ef9f6db6680bab9e4ca8e48adb3d4b26ead89beb7",
"sha256:d83b3af53b201970973c5574b39df226746194063bb248a53fd12b470ac34319",
"sha256:df9657b212c054ac6d803290d7c4bcd7790af0b725984fce1eeb0a1e3f2d9798",
"sha256:e576e5fd3f129e6b3595dc734ac7f2b8c548f19ef07781194bc538dc9c0cdbbc",
"sha256:e7400358558094c1bcedc75f3b3c4f400c53130b44833848890a99968dee6a64",
"sha256:eb6a385f8577d30e4cb43dd555fb134ddaae1edeb84205e09dabec332bf49fd0",
"sha256:f27f0875e0873f6bf5df09a456bfcac0667824cabac4cad30b43f36e0382ffe7",
"sha256:fcd4a6d04995f1d66bc78b503e4e59ae72fd32aaec4f661657fe5ae5c1aa4ce3"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==1.1.2"
"markers": "python_version >= '3' and platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32')))))",
"version": "==2.0.0a2"
},
"gspread": {
"hashes": [
@@ -416,11 +393,11 @@
},
"loguru": {
"hashes": [
"sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c",
"sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3"
"sha256:b28e72ac7a98be3d28ad28570299a393dfcd32e5e3f6a353dec94675767b6319",
"sha256:f8087ac396b5ee5f67c963b495d615ebbceac2796379599820e324419d53667c"
],
"index": "pypi",
"version": "==0.6.0"
"version": "==0.5.3"
},
"lxml": {
"hashes": [
@@ -895,9 +872,7 @@
"version": "==2022.3.2"
},
"requests": {
"extras": [
"socks"
],
"extras": [],
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
@@ -918,7 +893,7 @@
"sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
"sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
],
"markers": "python_version >= '3.6'",
"markers": "python_version >= '3.6' and python_version < '4'",
"version": "==4.8"
},
"s3transfer": {
@@ -951,44 +926,45 @@
},
"sqlalchemy": {
"hashes": [
"sha256:04164e0063feb7aedd9d073db0fd496edb244be40d46ea1f0d8990815e4b8c34",
"sha256:159c2f69dd6efd28e894f261ffca1100690f28210f34cfcd70b895e0ea7a64f3",
"sha256:199dc6d0068753b6a8c0bd3aceb86a3e782df118260ebc1fa981ea31ee054674",
"sha256:1bbac3e8293b34c4403d297e21e8f10d2a57756b75cff101dc62186adec725f5",
"sha256:20e9eba7fd86ef52e0df25bea83b8b518dfdf0bce09b336cfe51671f52aaaa3f",
"sha256:290cbdf19129ae520d4bdce392648c6fcdbee763bc8f750b53a5ab51880cb9c9",
"sha256:316270e5867566376e69a0ac738b863d41396e2b63274616817e1d34156dff0e",
"sha256:3f88a4ee192142eeed3fe173f673ea6ab1f5a863810a9d85dbf6c67a9bd08f97",
"sha256:4aa96e957141006181ca58e792e900ee511085b8dae06c2d08c00f108280fb8a",
"sha256:4b2bcab3a914715d332ca783e9bda13bc570d8b9ef087563210ba63082c18c16",
"sha256:576684771456d02e24078047c2567025f2011977aa342063468577d94e194b00",
"sha256:5a2e73508f939175363d8a4be9dcdc84cf16a92578d7fa86e6e4ca0e6b3667b2",
"sha256:5ba59761c19b800bc2e1c9324da04d35ef51e4ee9621ff37534bc2290d258f71",
"sha256:5dc9801ae9884e822ba942ca493642fb50f049c06b6dbe3178691fce48ceb089",
"sha256:6fdd2dc5931daab778c2b65b03df6ae68376e028a3098eb624d0909d999885bc",
"sha256:708973b5d9e1e441188124aaf13c121e5b03b6054c2df59b32219175a25aa13e",
"sha256:7ff72b3cc9242d1a1c9b84bd945907bf174d74fc2519efe6184d6390a8df478b",
"sha256:8679f9aba5ac22e7bce54ccd8a77641d3aea3e2d96e73e4356c887ebf8ff1082",
"sha256:8b9a395122770a6f08ebfd0321546d7379f43505882c7419d7886856a07caa13",
"sha256:8e1e5d96b744a4f91163290b01045430f3f32579e46d87282449e5b14d27d4ac",
"sha256:9a0195af6b9050c9322a97cf07514f66fe511968e623ca87b2df5e3cf6349615",
"sha256:9cb5698c896fa72f88e7ef04ef62572faf56809093180771d9be8d9f2e264a13",
"sha256:b3f1d9b3aa09ab9adc7f8c4b40fc3e081eb903054c9a6f9ae1633fe15ae503b4",
"sha256:bb42f9b259c33662c6a9b866012f6908a91731a419e69304e1261ba3ab87b8d1",
"sha256:bca714d831e5b8860c3ab134c93aec63d1a4f493bed20084f54e3ce9f0a3bf99",
"sha256:bedd89c34ab62565d44745212814e4b57ef1c24ad4af9b29c504ce40f0dc6558",
"sha256:bfec934aac7f9fa95fc82147a4ba5db0a8bdc4ebf1e33b585ab8860beb10232f",
"sha256:c7046f7aa2db445daccc8424f50b47a66c4039c9f058246b43796aa818f8b751",
"sha256:d7e483f4791fbda60e23926b098702340504f7684ce7e1fd2c1bf02029288423",
"sha256:dd93162615870c976dba43963a24bb418b28448fef584f30755990c134a06a55",
"sha256:e4607d2d16330757818c9d6fba322c2e80b4b112ff24295d1343a80b876eb0ed",
"sha256:e9a680d9665f88346ed339888781f5236347933906c5a56348abb8261282ec48",
"sha256:edfcf93fd92e2f9eef640b3a7a40db20fe3c1d7c2c74faa41424c63dead61b76",
"sha256:f7e4a3c0c3c596296b37f8427c467c8e4336dc8d50f8ed38042e8ba79507b2c9",
"sha256:fff677fa4522dafb5a5e2c0cf909790d5d367326321aeabc0dffc9047cb235bd"
"sha256:045d6a26c262929af0b9cb25441aae675ac04db4ea8bd2446b355617cd6b6b7d",
"sha256:07f4dab2deb6d34618a2ccfff3971a85923ad7c3a9a45401818870fc51d3f0cc",
"sha256:08aaad905aba8940f27aeb9f1f851bf63f18ef97b0062ca41f64afc4b64e0e8c",
"sha256:27a42894a2751e438eaed12fc0dcfe741ff2f66c14760d081222c5adc5460064",
"sha256:2a3e4dc7c452ba3c0f3175ad5a8e0ba49c2b0570a8d07272cf50844c8d78e74f",
"sha256:345306707bb0e51e7cd6e7573adafbce018894ee5e3b9c31134545f704936db0",
"sha256:36f08d94670315ca04c8139bd80b3e02b9dd9cc66fc11bcb96fd10ad51a051ab",
"sha256:3ebb97ed96f4506e2f212e1fcf0ec07a103bb194938627660a5acb4d9feae49c",
"sha256:40b995d7aeeb6f88a1927ce6692c0f626b59d8effd3e1d597f125e141707b37c",
"sha256:4414ace6e3a5e39523e55a5d9f3b215699b2ead4ff91fca98f1b659b7ab2d92a",
"sha256:50107d8183da3fbe5715957aa3954cd9d82aed555c5b4d3fd37fac861af422fa",
"sha256:50174e173d03209c34e07e7b57cca48d0082ac2390edf927aafc706c111da11e",
"sha256:5e88912bf192e7b5739c446d2276e1cba74cfa6c1c93eea2b2534404f6be1dbd",
"sha256:621d3f6c0ba2407bb97e82b649be5ca7d5b6c201dcfb964ce13f517bf1cb6305",
"sha256:623bac2d6bdca3f3e61cf1e1c466c5fb9f5cf08735736ee1111187b7a4108891",
"sha256:671f61c3db4595b0e86cc4b30f675a7c0206d9ce99f041b4f6761c7ddd1e0074",
"sha256:67c1c27c48875afc950bee5ee24582794f20b545e64e4f9ca94071a9b514d6ed",
"sha256:6a6cfd468f54d65324fd3847cfd0148b0610efa6a43e5f5fcc89f455696ae9e7",
"sha256:70048a83f0a1ece1fcd7189891c888e20af2c57fbd33eb760d8cece9843b896c",
"sha256:7ee14a7f9f76d1ef9d5e5b760c9252617c839b87eee04d1ce8325ac66ae155c4",
"sha256:804cf491437f3e4ce31247ab4b309b181f06ecc97d309b746d10f09439b4eb85",
"sha256:878c7beaafa365602762c19f638282e1885454fed1aed86f8fae038933c7c671",
"sha256:954ea8c527c4322afb6885944904714893af81fe9167e421273770991bf08a4a",
"sha256:a47bf6b7ca6c28e4f4e262fabcf5be6b907af81be36de77839c9eeda2cdf3bb3",
"sha256:a4fb5c6ee84a6bba4ff6f9f5379f0b3a0ffe9de7ba5a0945659b3da8d519709b",
"sha256:b34bbc683789559f1bc9bb685fc162e0956dbbdfbe2fbd6755a9f5982c113610",
"sha256:c025d45318b73c0601cca451532556cbab532b2742839ebb8cb58f9ebf06811e",
"sha256:c3ad7f5b61ba014f5045912aea15b03c473bb02b1c07fd92c9d2c794fa183276",
"sha256:c9218e3519398129e364121e0d89823e6ba2a2b77c28bfc661face0829c41433",
"sha256:cd5cffd1dd753828f1069f33062f3896e51c990acd957c264f40e051b3e19887",
"sha256:d8efcaa709ea8e7c08c3d3e7639c39b36083f5a995f397f9e6eedf5f5e4e4946",
"sha256:e297a5cc625e3f1367a82deedf2d48ee4d2b2bd263b8b8d2efbaaf5608b5229e",
"sha256:e67278ceb63270cdac0a7b89fc3c29a56f7dac9616a7ee48e7ad6b52e3b631e5",
"sha256:eb6558ba07409dafa18c793c34292b3265be455904966f0724c10198829477e3",
"sha256:f197c66663ed0f9e1178d51141d864688fb244a83f6b17f667d521e482537b2e",
"sha256:f47996b1810894f766c9ee689607077c6c0e0fd6761e04c12ba13efb56d50c1d"
],
"index": "pypi",
"version": "==1.4.32"
"version": "==1.4.34"
},
"telethon": {
"hashes": [
@@ -1163,11 +1139,11 @@
},
"click": {
"hashes": [
"sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b",
"sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976"
"sha256:24e1a4a9ec5bf6299411369b208c1df2188d9eb8d916302fe6bf03faed227f1e",
"sha256:479707fe14d9ec9a0757618b7a100a0ae4c4e236fac5b7f80ca68028141a1a72"
],
"markers": "python_version >= '3.7'",
"version": "==8.1.1"
"version": "==8.1.2"
},
"coverage": {
"extras": [
@@ -1415,9 +1391,7 @@
"version": "==2022.1"
},
"requests": {
"extras": [
"socks"
],
"extras": [],
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
@@ -1501,7 +1475,7 @@
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
"sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
],
"markers": "python_full_version < '3.11.0'",
"markers": "python_version >= '3.7'",
"version": "==2.0.1"
},
"typing-extensions": {

View File

@@ -235,6 +235,20 @@ class Scraper:
return archived_url
def archive_files(self, result: ScraperResult) -> ScraperResult:
"""Archive files corresponding to ``archived_url`` dict keys, if the
files have not previously been archived.
Parameters
----------
result: ScraperResult
Previously scraped ScraperResult run with ``archive_media=False``.
Returns
-------
ScraperResult
Same ScraperResult as ``result``, but with all URLs in the ``archived_urls`` dict archived.
"""
for url in result.archived_urls:
if result.archived_urls[url] is None:
media_blob, content_type, key = self.url_to_blob(url)
@@ -244,7 +258,6 @@ class Scraper:
result.media_archived = True
return result
def can_handle(self, channel: Channel) -> bool:
"""Whether or not the scraper can scrape the specified channel.
@@ -365,6 +378,10 @@ class ScraperController:
else:
since = None
# TODO: if the channel has not been added to the database yet (channel.id is None), the query above returns the most recently scraped ScraperResult whose channel is None, which may come from a different platform and channel. Consider adding a check to the query logic above that channel.id is not null.
if channel.id is None:
since = None
posts = scraper.get_posts(channel, since=since, archive_media=archive_media)
for post in posts:

View File

@@ -43,9 +43,12 @@ class BitchuteScraper(Scraper):
archived_urls = {}
if archive_media:
if 'video_url' in post:
url = post['video_url']
if 'video_url' in post:
url = post['video_url']
archived_urls[url] = None
if archive_media:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
@@ -112,6 +115,7 @@ class BitchuteScraper(Scraper):
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def strip_tags(html, convert_newlines=True):

View File

@@ -50,25 +50,24 @@ class GabScraper(Scraper):
if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
break
media_urls = []
archived_urls = {}
if archive_media:
for attachment in post.get('media_attachments'):
for attachment in post.get('media_attachments'):
if attachment.get('type') == 'video':
archived_urls[attachment['source_mp4']] = None
else:
archived_urls[attachment['url']] = None
if post.get('reblog') is not None:
for attachment in post['reblog'].get('media_attachments'):
if attachment.get('type') == 'video':
media_urls.append(attachment['source_mp4'])
archived_urls[attachment['source_mp4']] = None
else:
media_urls.append(attachment['url'])
if post.get('reblog') is not None:
for attachment in post['reblog'].get('media_attachments'):
if attachment.get('type') == 'video':
media_urls.append(attachment['source_mp4'])
else:
media_urls.append(attachment['url'])
archived_urls[attachment['url']] = None
for url in media_urls:
for url in archived_urls.keys():
if archive_media:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url

View File

@@ -30,26 +30,25 @@ class GettrScraper(Scraper):
archived_urls = {}
if archive_media:
if 'imgs' in post:
for img in post['imgs']:
url = "https://media.gettr.com/" + img
archived_urls[url] = None
if 'imgs' in post:
for img in post['imgs']:
url = "https://media.gettr.com/" + img
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[img] = archived_url
if 'main' in post:
url = "https://media.gettr.com/" + post['main']
archived_urls[url] = None
if 'main' in post:
url = "https://media.gettr.com/" + post['main']
if 'ovid' in post:
url = "https://media.gettr.com/" + post['ovid']
archived_urls[url] = None
for url in archived_urls.keys():
if archive_media:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[post['main']] = archived_url
if 'vid' in post:
url = "https://media.gettr.com/" + post['vid']
media_blob, content_type, key = self.m3u8_url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[post['vid']] = archived_url
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
@@ -72,7 +71,7 @@ class GettrScraper(Scraper):
return key
def get_profile(self, channel: Channel) -> RawChannelInfo:
client = client = PublicClient()
client = PublicClient()
username = self.get_username_from_url(channel.url)
profile = client.user_info(username)

View File

@@ -1,4 +1,4 @@
from typing import Generator
from typing import Generator, List
from datetime import datetime, timezone
import os
import json
@@ -49,28 +49,14 @@ class InstagramScraper(Scraper):
post_url = f'{BASE_URL}p/{post.shortcode}/'
archived_urls = {}
archived_urls = get_archived_urls_from_post(post = post)
if archive_media:
for url in archived_urls.keys():
with tempfile.TemporaryDirectory() as temp_dir:
loader.download_post(post = post, target = Path(temp_dir))
files = os.listdir(temp_dir)
files = [f for f in files if not f.endswith('.txt')]
for file in files:
ext = file.split('.')[-1]
content_type = CONTENT_TYPES[ext]
filename = Path(temp_dir, file)
key = f'{post.shortcode}__{file}'
with open(filename, 'rb') as f:
blob = f.read()
archived_url = self.archive_blob(blob = blob, content_type = content_type, key = key)
archived_urls[post_url] = archived_url
if archive_media:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
@@ -98,7 +84,7 @@ class InstagramScraper(Scraper):
date_archived=datetime.now(timezone.utc),
raw_posts=json.dumps(comment_dict, default=str),
archived_urls={},
media_archived=archive_media)
media_archived=True)
def can_handle(self, channel):
if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None:
@@ -126,7 +112,20 @@ class InstagramScraper(Scraper):
profile['followees'] = user_profile.followees
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
def get_archived_urls_from_post(post: "instaloader.Post") -> dict:
    """Build the initial ``archived_urls`` mapping for an Instagram post.

    Parameters
    ----------
    post: instaloader.Post
        Post whose raw ``_node`` metadata is inspected for media URLs.

    Returns
    -------
    dict
        Maps each media URL of the post to ``None``, i.e. "not archived yet".

    Raises
    ------
    NotImplementedError
        If the post's ``__typename`` is not ``GraphImage``, ``GraphVideo`` or
        ``GraphSidecar``.
    """
    node = post._node
    typename = node['__typename']

    if typename == 'GraphImage':
        urls = [node['display_url']]
    elif typename == 'GraphVideo':
        urls = [node['video_url']]
    elif typename == 'GraphSidecar':
        urls = []
        for edge in node['edge_sidecar_to_children']['edges']:
            child = edge['node']
            # For a video child ``display_url`` is only the thumbnail, so
            # prefer ``video_url`` when the child is flagged as a video and
            # fall back to ``display_url`` otherwise.
            video_url = child.get('video_url') if child.get('is_video') else None
            urls.append(video_url or child['display_url'])
    else:
        raise NotImplementedError(f'post of type {typename} is currently not supported.')

    return {url: None for url in urls}

View File

@@ -36,10 +36,11 @@ class OdyseeScraper(Scraper):
if since is not None and datetime.fromtimestamp(video.info['created']) <= since.date:
break
archived_urls = {}
url = video.info['streaming_url']
archived_urls = {url: None}
if archive_media:
url = video.info['streaming_url']
# Check if file is a video file or an m3u8 file
r = requests.head(url)
@@ -77,6 +78,21 @@ class OdyseeScraper(Scraper):
archived_urls={},
media_archived=True)
def archive_files(self, result: ScraperResult) -> ScraperResult:
    """Archive every URL in ``result.archived_urls`` that has no archived copy yet.

    Parameters
    ----------
    result: ScraperResult
        Result whose ``archived_urls`` dict maps media URLs to their archived
        URL, or to ``None`` when the media has not been archived.

    Returns
    -------
    ScraperResult
        The same object, with each pending URL archived and
        ``media_archived`` set to ``True``.
    """
    for url in result.archived_urls:
        if result.archived_urls[url] is None:
            # Check if the file is a video file or an m3u8 file
            r = requests.head(url)

            # ``.get`` rather than indexing: a response without a
            # Content-Type header is treated as a direct download instead of
            # raising KeyError.
            if r.headers.get('Content-Type') == 'text/html; charset=utf-8':
                media_blob, content_type, key = self.m3u8_url_to_blob(url)
            else:
                media_blob, content_type, key = self.url_to_blob(url)

            archived_url = self.archive_blob(media_blob, content_type, key)
            # Only the value of an existing key is replaced, so the dict can
            # be updated while it is being iterated.
            result.archived_urls[url] = archived_url

    result.media_archived = True
    return result
def can_handle(self, channel):
if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None:
return True
@@ -94,7 +110,7 @@ class OdyseeScraper(Scraper):
profile = odysee_channel.info
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -19,18 +19,18 @@ class RumbleScraper(Scraper):
scraper = get_channel_videos(channel.url)
for post in scraper:
if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
break
archived_urls = {}
url = post['media_url']
archived_urls = {url: None}
if archive_media:
url = post['media_url']
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[post['media_url']] = archived_url
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
@@ -48,6 +48,16 @@ class RumbleScraper(Scraper):
key = urlparse(url).path.split('/')[-2] + ext
return key
def archive_files(self, result: ScraperResult) -> ScraperResult:
    """Archive, via yt-dlp, every URL in ``result.archived_urls`` still mapped to ``None``.

    Parameters
    ----------
    result: ScraperResult
        Result whose ``archived_urls`` dict maps media URLs to their archived
        URL, or to ``None`` when the media has not been archived.

    Returns
    -------
    ScraperResult
        The same object, with each pending URL archived and
        ``media_archived`` set to ``True``.
    """
    for media_url, existing in result.archived_urls.items():
        # Already archived: leave the stored URL untouched.
        if existing is not None:
            continue

        blob, mime_type, storage_key = self.ytdlp_url_to_blob(media_url)
        # Only the value of an existing key changes, so updating the dict
        # during iteration is safe.
        result.archived_urls[media_url] = self.archive_blob(blob, mime_type, storage_key)

    result.media_archived = True
    return result
def can_handle(self, channel):
if channel.platform == "Rumble" and channel.url is not None:
return True
@@ -57,10 +67,10 @@ class RumbleScraper(Scraper):
profile = get_channel_profile(url = channel.url)
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -33,8 +33,8 @@ class TelegramSnscrapeScraper(Scraper):
for image_url in post.images:
archived_urls[image_url] = None
if post.video:
archived_urls[post.video] = None
for video_url in post.videos:
archived_urls[video_url] = None
if archive_media:
for url in archived_urls:

View File

@@ -14,7 +14,7 @@ class TwitterScraper(Scraper):
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
if channel.platform_id:
identifier = channel.platform_id
identifier = int(channel.platform_id)
else:
identifier = channel.screenname
@@ -23,7 +23,7 @@ class TwitterScraper(Scraper):
first = True
for tweet in scraper.get_items():
if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
# with TwitterProfileScraper, the first tweet could be an old pinned tweet
if first:
first = False
@@ -105,7 +105,7 @@ class TwitterScraper(Scraper):
raise ChannelDoesNotExistError(channel.url)
else:
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(entity.__dict__, default=str),
date_archived=datetime.now(timezone.utc))
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(entity.__dict__, default=str),
date_archived=datetime.now(timezone.utc))

View File

@@ -1,8 +1,12 @@
from datetime import datetime, timezone
from typing import Generator
from urllib.parse import urlparse
import json
import re
from snscrape.modules.vkontakte import VKontakteUserScraper
from loguru import logger
from yt_dlp.extractor.vk import VKIE
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
@@ -24,7 +28,7 @@ class VkontakteScraper(Scraper):
first = True
for post in scraper.get_items():
if since is not None and datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
if since is not None and datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
# with VKontakteUserScraper, the first post could be an old pinned post
if first:
first = False
@@ -34,23 +38,26 @@ class VkontakteScraper(Scraper):
archived_urls = {}
if archive_media:
if post.photos:
if post.photos:
for photo in post.photos:
variant = max(
[v for v in photo.variants], key=lambda v: v.width * v.height)
url = variant.url
if url is not None:
archived_urls[url] = None
for photo in post.photos:
variant = max(
[v for v in photo.variants], key=lambda v: v.width * v.height)
url = variant.url
if url is not None:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
if post.video:
archived_urls[post.video.url] = None
if post.video:
url = post.video.url
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
for url in archived_urls.keys():
if archive_media:
if re.match(VKIE._VALID_URL, url):
# Uses regex from yt_dlp to verify VK video URL
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
else:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
@@ -65,6 +72,21 @@ class VkontakteScraper(Scraper):
archived_urls=archived_urls,
media_archived=archive_media)
def archive_files(self, result: ScraperResult) -> ScraperResult:
    """Archive every media URL of `result` that has no archived copy yet.

    Entries of `result.archived_urls` whose value is ``None`` are fetched,
    passed to `self.archive_blob`, and replaced by the value it returns.
    URLs matching yt_dlp's VK video pattern are fetched with
    `self.ytdlp_url_to_blob`; everything else with `self.url_to_blob`.

    Returns the same `result` object with `media_archived` set to ``True``.
    """
    for media_url in result.archived_urls:
        # Skip anything that already has an archived copy.
        if result.archived_urls[media_url] is not None:
            continue

        # Uses regex from yt_dlp to verify VK video URL
        is_vk_video = re.match(VKIE._VALID_URL, media_url)
        fetch_blob = self.ytdlp_url_to_blob if is_vk_video else self.url_to_blob

        blob, mime_type, blob_key = fetch_blob(media_url)
        result.archived_urls[media_url] = self.archive_blob(blob, mime_type, blob_key)

    result.media_archived = True
    return result
def can_handle(self, channel):
if channel.platform == "Vkontakte" and channel.platform_id:
return True
@@ -87,7 +109,7 @@ class VkontakteScraper(Scraper):
profile = scraper._get_entity().__dict__
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -2,7 +2,11 @@ from datetime import datetime, timezone
import json
from typing import Generator
import tempfile
from pathlib import Path
import os
import yt_dlp
from loguru import logger
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper import Scraper
@@ -46,7 +50,10 @@ class YoutubeScraper(Scraper):
for video in valid_videos:
archived_urls = {}
url = video['webpage_url']
archived_urls = {url: None}
video_id = video["id"]
video_ext = video["ext"]
@@ -54,11 +61,8 @@ class YoutubeScraper(Scraper):
key = f"{video_id}.{video_ext}"
with open(f"{temp_dir}/{key}", "rb") as f:
with open(Path(temp_dir)/key, "rb") as f:
media_blob = f.read()
archived_url = self.archive_blob(media_blob, content_type, key)
url = video['webpage_url']
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
@@ -78,6 +82,41 @@ class YoutubeScraper(Scraper):
if channel.platform == "Youtube" and channel.url:
return True
def archive_files(self, result: ScraperResult) -> ScraperResult:
    """Download and archive every not-yet-archived video URL of `result`.

    Each key of `result.archived_urls` whose value is ``None`` is downloaded
    with yt-dlp into a temporary directory, handed to `self.archive_blob`
    as an mp4 blob, and the value it returns is written back into
    `result.archived_urls`.

    Args:
        result: Scraper result whose `archived_urls` maps a source URL to
            its archived URL (``None`` while still unarchived).

    Returns:
        The same `result` object, mutated in place. `media_archived` is set
        to ``True`` unless a download finished without producing a file.

    Raises:
        yt_dlp.utils.DownloadError: propagated unchanged from yt-dlp.
    """
    all_archived = True

    for url in result.archived_urls:
        if result.archived_urls[url] is not None:
            continue

        with tempfile.TemporaryDirectory() as temp_dir:
            ydl_opts = {
                "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
                "merge_output_format": "mp4",
                "outtmpl": f"{temp_dir}/%(id)s.%(ext)s"}

            # download() takes a list of URLs; a bare string only works on
            # yt-dlp versions that special-case that mistake.
            yt_dlp.YoutubeDL(ydl_opts).download([url])

            # Sorted so the chosen file is deterministic if several exist.
            files = sorted(os.listdir(temp_dir))
            if len(files) != 1:
                logger.warning(f'{len(files)} files downloaded for video: {url}')
            if not files:
                # Nothing to read: leave this URL unarchived instead of
                # failing with IndexError on files[0].
                # NOTE(review): the result then stays flagged as not fully
                # archived; confirm this is the desired retry behaviour.
                all_archived = False
                continue

            key = files[0]
            # Read the blob before the temporary directory is removed.
            media_blob = Path(temp_dir, key).read_bytes()

        result.archived_urls[url] = self.archive_blob(media_blob, 'video/mp4', key)

    result.media_archived = all_archived
    return result
def get_profile(self, channel: Channel) -> RawChannelInfo:
ydl_opts = {}
ydl = yt_dlp.YoutubeDL(ydl_opts)
@@ -87,12 +126,13 @@ class YoutubeScraper(Scraper):
meta = ydl.extract_info(
channel.url,
process=False)
meta.pop('entries')
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(meta),
date_archived=datetime.now(timezone.utc))
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(meta),
date_archived=datetime.now(timezone.utc))
except yt_dlp.utils.DownloadError as e:
raise e

View File

@@ -12,10 +12,9 @@ addopts =
--html='reports/tests.html'
--self-contained-html
markers =
profile: marks tests for only extracting channel metadata (deselect with '-m
"not profile"')
media: marks tests for archiving all media attachments (deselect with '-m
"not media"')
profile: marks tests for only extracting channel metadata (deselect with '-m "not profile"')
media: marks tests for archiving all media attachments (deselect with '-m "not media"')
unarchived: marks tests for archiving all unarchived media attachments (deselect with '-m "not unarchived"')
filterwarnings =
ignore:the imp module is deprecated:DeprecationWarning
ignore:The localize method is no longer necessary, as this time zone supports the fold attribute

View File

@@ -99,12 +99,12 @@ RUMBLE_CHANNEL_KWARGS = {
'notes': ''}
TELEGRAM_CHANNEL_KWARGS = {
'name': 'USA Freedom Convoy (test)',
'platform_id': -1001799578085,
'name': 'South West Ohio Proud Boys (test)',
'platform_id': -1001276612436,
'category': 'test',
'platform': 'Telegram',
'url': 'https://t.me/usafreedomconvoy2022',
'screenname': 'usafreedomconvoy2022',
'url': 'https://t.me/SouthwestOhioPB',
'screenname': 'SouthwestOhioPB',
'country': 'US',
'influencer': None,
'public': True,

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import BitchuteScraper
@pytest.mark.unarchived
def test_scrape_bitchute_channel_no_media(controller, channel_kwargs):
    """Scrape the Bitchute test channel with media archiving turned off."""
    bitchute_channel = Channel(**channel_kwargs['bitchute'])
    controller.register_scraper(scraper=BitchuteScraper())
    controller.scrape_channels(channels=[bitchute_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_bitchute_channel_unarchived_media(controller):
    """Run the controller's deferred media-archiving pass.

    NOTE(review): registers no scraper and scrapes no channel itself, so it
    presumably relies on state left on the shared `controller` fixture by an
    earlier no-media scrape test; confirm fixture scope and test ordering.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_bitchute_channel(controller, channel_kwargs):

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import GabScraper
@pytest.mark.unarchived
def test_scrape_gab_channel_no_media(controller, channel_kwargs):
    """Scrape the Gab test channel with media archiving turned off."""
    gab_channel = Channel(**channel_kwargs['gab'])
    controller.register_scraper(scraper=GabScraper())
    controller.scrape_channels(channels=[gab_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_gab_channel_unarchived_media(controller):
    """Run the controller's deferred media-archiving pass.

    NOTE(review): registers no scraper and scrapes no channel itself, so it
    presumably relies on state left on the shared `controller` fixture by an
    earlier no-media scrape test; confirm fixture scope and test ordering.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_gab_channel(controller, channel_kwargs):

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import GettrScraper
@pytest.mark.unarchived
def test_scrape_gettr_channel_no_media(controller, channel_kwargs):
    """Scrape the Gettr test channel with media archiving turned off."""
    gettr_channel = Channel(**channel_kwargs['gettr'])
    controller.register_scraper(scraper=GettrScraper())
    controller.scrape_channels(channels=[gettr_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_gettr_channel_unarchived_media(controller):
    """Run the controller's deferred media-archiving pass.

    NOTE(review): registers no scraper and scrapes no channel itself, so it
    presumably relies on state left on the shared `controller` fixture by an
    earlier no-media scrape test; confirm fixture scope and test ordering.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_gettr_channel(controller, channel_kwargs):

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import InstagramScraper
@pytest.mark.unarchived
def test_scrape_instagram_channel_no_media(controller, channel_kwargs):
    """Scrape the Instagram test channel with media archiving turned off."""
    instagram_channel = Channel(**channel_kwargs['instagram'])
    controller.register_scraper(scraper=InstagramScraper())
    controller.scrape_channels(channels=[instagram_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_instagram_channel_unarchived_media(controller):
    """Run the controller's deferred media-archiving pass.

    NOTE(review): registers no scraper and scrapes no channel itself, so it
    presumably relies on state left on the shared `controller` fixture by an
    earlier no-media scrape test; confirm fixture scope and test ordering.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_instagram_channel(controller, channel_kwargs):

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import OdyseeScraper
@pytest.mark.unarchived
def test_scrape_odysee_channel_no_media(controller, channel_kwargs):
    """Scrape the Odysee test channel with media archiving turned off."""
    odysee_channel = Channel(**channel_kwargs['odysee'])
    controller.register_scraper(scraper=OdyseeScraper())
    controller.scrape_channels(channels=[odysee_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_odysee_channel_unarchived_media(controller):
    """Run the controller's deferred media-archiving pass.

    NOTE(review): registers no scraper and scrapes no channel itself, so it
    presumably relies on state left on the shared `controller` fixture by an
    earlier no-media scrape test; confirm fixture scope and test ordering.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_odysee_channel(controller, channel_kwargs):

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import RumbleScraper
@pytest.mark.unarchived
def test_scrape_rumble_channel_no_media(controller, channel_kwargs):
    """Scrape the Rumble test channel with media archiving turned off."""
    rumble_channel = Channel(**channel_kwargs['rumble'])
    controller.register_scraper(scraper=RumbleScraper())
    controller.scrape_channels(channels=[rumble_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_rumble_channel_unarchived_media(controller):
    """Run the controller's deferred media-archiving pass.

    NOTE(review): registers no scraper and scrapes no channel itself, so it
    presumably relies on state left on the shared `controller` fixture by an
    earlier no-media scrape test; confirm fixture scope and test ordering.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_rumble_channel(controller, channel_kwargs):

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import TelegramSnscrapeScraper
@pytest.mark.unarchived
def test_scrape_telegram_snscrape_channel_no_media(controller, channel_kwargs):
    """Scrape the Telegram test channel via snscrape with media archiving off."""
    telegram_channel = Channel(**channel_kwargs['telegram'])
    controller.register_scraper(scraper=TelegramSnscrapeScraper())
    controller.scrape_channels(channels=[telegram_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_telegram_snscrape_channel_unarchived_media(controller):
    """Run the controller's deferred media-archiving pass.

    NOTE(review): registers no scraper and scrapes no channel itself, so it
    presumably relies on state left on the shared `controller` fixture by an
    earlier no-media scrape test; confirm fixture scope and test ordering.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_telegram_snscrape_channel(controller, channel_kwargs):

View File

@@ -3,6 +3,7 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import TelegramTelethonScraper
@pytest.mark.unarchived
def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
controller.remove_all_scrapers()
@@ -10,6 +11,12 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
controller.register_scraper(scraper = TelegramTelethonScraper())
controller.scrape_channels(channels = channels, archive_media = False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_telegram_telethon_unarchived_media(controller):
    """Run the controller's deferred media-archiving pass.

    NOTE(review): registers no scraper and scrapes no channel itself, so it
    presumably relies on state left on the shared `controller` fixture by an
    earlier no-media scrape test; confirm fixture scope and test ordering.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_telegram_telethon_channel(controller, channel_kwargs):

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import TwitterScraper
@pytest.mark.unarchived
def test_scrape_twitter_channel_no_media(controller, channel_kwargs):
    """Scrape the Twitter test channel with media archiving turned off."""
    twitter_channel = Channel(**channel_kwargs['twitter'])
    controller.register_scraper(scraper=TwitterScraper())
    controller.scrape_channels(channels=[twitter_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_twitter_channel_unarchived_media(controller):
    """Run the controller's deferred media-archiving pass.

    NOTE(review): registers no scraper and scrapes no channel itself, so it
    presumably relies on state left on the shared `controller` fixture by an
    earlier no-media scrape test; confirm fixture scope and test ordering.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_twitter_channel(controller, channel_kwargs):

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import VkontakteScraper
@pytest.mark.unarchived
def test_scrape_vkontakte_channel_no_media(controller, channel_kwargs):
    """Scrape the Vkontakte test channel with media archiving turned off."""
    vkontakte_channel = Channel(**channel_kwargs['vkontakte'])
    controller.register_scraper(scraper=VkontakteScraper())
    controller.scrape_channels(channels=[vkontakte_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_vkontakte_channel_unarchived_media(controller):
    """Run the controller's deferred media-archiving pass.

    NOTE(review): registers no scraper and scrapes no channel itself, so it
    presumably relies on state left on the shared `controller` fixture by an
    earlier no-media scrape test; confirm fixture scope and test ordering.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_vkontakte_channel(controller, channel_kwargs):

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import YoutubeScraper
@pytest.mark.unarchived
def test_scrape_youtube_channel_no_media(controller, channel_kwargs):
    """Scrape the Youtube test channel with media archiving turned off."""
    youtube_channel = Channel(**channel_kwargs['youtube'])
    controller.register_scraper(scraper=YoutubeScraper())
    controller.scrape_channels(channels=[youtube_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_youtube_channel_unarchived_media(controller):
    """Run the controller's deferred media-archiving pass.

    NOTE(review): registers no scraper and scrapes no channel itself, so it
    presumably relies on state left on the shared `controller` fixture by an
    earlier no-media scrape test; confirm fixture scope and test ordering.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_youtube_channel(controller, channel_kwargs):