diff --git a/.gitignore b/.gitignore index 632ac68..65dd3d7 100644 --- a/.gitignore +++ b/.gitignore @@ -11,10 +11,11 @@ docs/source/_* *.session service_account.json .vscode/ +*.log # Unit test / coverage reports reports -.coverage +.coverage* .cache .pytest_cache/ cover/ diff --git a/Pipfile b/Pipfile index d57b5a9..df21bef 100644 --- a/Pipfile +++ b/Pipfile @@ -22,6 +22,9 @@ instaloader = "*" gspread = "*" cryptg = "*" gabber = {git = "https://github.com/stanfordio/gabber.git"} +psycopg2-binary = "*" +tqdm = "*" +ratelimit = "*" [dev-packages] pytest = "*" @@ -33,7 +36,7 @@ sphinx = "*" sphinx_rtd_theme = "*" [requires] -python_version = "3.9" +python_version = "3.8" [pipenv] allow_prereleases = true diff --git a/Pipfile.lock b/Pipfile.lock index bb0e2a2..ea45b5e 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,11 +1,11 @@ { "_meta": { "hash": { - "sha256": "b712e767d64e54e83e8c2d8a27a68203583ed7ad31d4ea3b4b6076a72a2150fd" + "sha256": "e57f79178ac0e05f9753a29f97e08d2ae96b7775044bb4c6ba616baae1d21183" }, "pipfile-spec": 6, "requires": { - "python_version": "3.9" + "python_version": "3.8" }, "sources": [ { @@ -16,6 +16,28 @@ ] }, "default": { + "backports.zoneinfo": { + "hashes": [ + "sha256:17746bd546106fa389c51dbea67c8b7c8f0d14b5526a579ca6ccf5ed72c526cf", + "sha256:1b13e654a55cd45672cb54ed12148cd33628f672548f373963b0bff67b217328", + "sha256:1c5742112073a563c81f786e77514969acb58649bcdf6cdf0b4ed31a348d4546", + "sha256:4a0f800587060bf8880f954dbef70de6c11bbe59c673c3d818921f042f9954a6", + "sha256:5c144945a7752ca544b4b78c8c41544cdfaf9786f25fe5ffb10e838e19a27570", + "sha256:7b0a64cda4145548fed9efc10322770f929b944ce5cee6c0dfe0c87bf4c0c8c9", + "sha256:8439c030a11780786a2002261569bdf362264f605dfa4d65090b64b05c9f79a7", + "sha256:8961c0f32cd0336fb8e8ead11a1f8cd99ec07145ec2931122faaac1c8f7fd987", + "sha256:89a48c0d158a3cc3f654da4c2de1ceba85263fafb861b98b59040a5086259722", + "sha256:a76b38c52400b762e48131494ba26be363491ac4f9a04c1b7e92483d169f6582", + "sha256:da6013fd84a690242c310d77ddb8441a559e9cb3d3d59ebac9aca1a57b2e18bc", + "sha256:e55b384612d93be96506932a786bbcde5a2db7a9e6a4bb4bffe8b733f5b9036b", + "sha256:e81b76cace8eda1fca50e345242ba977f9be6ae3945af8d46326d776b4cf78d1", + "sha256:e8236383a20872c0cdf5a62b554b27538db7fa1bbec52429d8d106effbaeca08", + "sha256:f04e857b59d9d1ccc39ce2da1021d196e47234873820cbeaad210724b1ee28ac", + "sha256:fadbfe37f74051d024037f223b8e001611eac868b5c5b06144ef4d8b799862f2" + ], + "markers": "python_version >= '3.6' and python_version < '3.9'", + "version": "==0.2.1" + }, "beautifulsoup4": { "hashes": [ "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf", @@ -26,19 +48,19 @@ }, "boto3": { "hashes": [ - "sha256:127ebdf58c8825b53f1eff111e08c49ffffeb1f6d7a5665c9907ce8128fe14b1", - "sha256:b7ce3bf013f0f60e40c2676d5a7b620ed927cfad0aa348a606b10e9a0387f249" + "sha256:ef210f8e85cdb6d26a38ebad1cfe9cefdef2ab269207e5987653555375a7ef6b", + "sha256:f0af8f4ef5fe6353c794cd3cce627d469a618b58ace7ca75a63cfd719df615ce" ], "index": "pypi", - "version": "==1.21.29" + "version": "==1.21.30" }, "botocore": { "hashes": [ - "sha256:b467d64cd773dc4d49ef31b18a8dded554f284f799720bd12e989fe2138fd5b8", - "sha256:de87907d42682179946ddfa113b9334e3c4258404aef19edd8c92381ff54775c" + "sha256:af4bdc51eeecbe9fdcdadbed9ad58c5c91380ef30f3560022bbc2ee1d78f0ad6", + "sha256:c622751093e3d0bf61343e66d6d06190ef30bf42b1557d5070ca84e9efa06d4b" ], "markers": "python_version >= '3.6'", - "version": "==1.24.29" + "version": "==1.24.30" }, "brotli": { "hashes": [ @@ -195,11 +217,11 @@ }, "click": { "hashes": [ - "sha256:19a4baa64da924c5e0cd889aba8e947f280309f1a2ce0947a3e3a7bcb7cc72d6", - "sha256:977c213473c7665d3aa092b41ff12063227751c41d7b17165013e10069cc5cd2" + "sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b", + "sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976" ], "markers": "python_version >= '3.7'", - "version": "==8.1.0" + "version": "==8.1.1" }, "cryptg": { "hashes": [ @@ -302,63 +324,64 @@ }, "greenlet": { "hashes": [ - "sha256:004aed447382d80a56ecc354a6d807f305e6c808714ce6ccbca4839c94fae81d", - "sha256:068d68fad6bd623e29a2d36e74538c9b9d6dc6464931cd27d93da6cfc6a7f242", - "sha256:06fd4075754009c9817c6b4e1dc0af4616de52757b6ca973a81c3c1aadc28257", - "sha256:1004cb542451814b12a4f38e835a47734e2b2c683acbf463d5ae76282a3974cf", - "sha256:10c358633a8b27bfc32d27114ef2ca2ddc9f1f89f1643d1157b85e1fdd695315", - "sha256:115bc25fefbdc692c4483e9ddb9011ccd0251590ed59dbfff0f4eb7050bf99c4", - "sha256:1d987a2579336792f73ae6b106c2f087e32afc8573fbf9566f123ac6d8cfb72f", - "sha256:2128d727fd1e8afba8e68feb2cdcf88c90163b69ddc9707722a3e491c5280720", - "sha256:230132c241fe284f93f2e7b3969e9b22bbd76ef98cf93e382c945d378907f5a4", - "sha256:23558f7bd08a663386c032ab8d302d613d2d02ae0c9758ad410bab6035b58d3d", - "sha256:255d520d3e4a5f16883b182e1a94219fe455ab4f50aaaf534bfd6d64ee728397", - "sha256:2a6bc19a728f6f643cfc89b876159a1a25a8f7d8700c013d48a73691f80b4550", - "sha256:379bed346ef8ba0a0e698b3c5975a44d15dd4a5bbff40bbd7fd548b445d5550b", - "sha256:3b12d0866759db93b0a893b4e50a7d7d1681519d2346c26695bb8bb2c652230e", - "sha256:40d491944f69e350e1e8b25f6ca49459824ede1678ec0cd4b5541f41edc06614", - "sha256:471484c7b9d7b7867263051aa81cdeed6e06b455e629a7f05eb91a6cb8bd0836", - "sha256:488c557080557bc01aabb3e1bda7225c68455b853733a8652857ac0d810dad1b", - "sha256:49c2e76e7aa81ba889b3c183e2341af3cc6161ee38852085110ae49d5b5d9a40", - "sha256:52d13ec90236e5935ed6da044e78faa1371d5116cc43fe6d7ca8994dd619ef96", - "sha256:57898c69a253d81f487787bdd538629fabd671fab8a9e31b041ca30965fd9556", - "sha256:5d577eef5beb5730ef01ab39983eb852a97c359b7a546809adf70c409f4b2ecc", - "sha256:6a41987c1474c9158a0c0c96611530a8f299bc547d35bee8add981b8b2534f74", - "sha256:6ae67b7df8db3626af8e042e9c6949cfa27d1a3bbbfdff29e45b72bb6673a650", - "sha256:6c42c27e9d12e8a481aff469ffe8dd4ce0484c354a418470960f760f6ae41e7c", - "sha256:6c4a90c9f6128b4d0905a89930bd325e0491574e5cb453f606bb7094a3197587", - "sha256:6e64518e5833ac2d9359b6d9bd4df2c0cf441a0f3a4eca9e735fbea99009fa70", - "sha256:6fd3a270c23c5b42d86a9c7c6b0229f23ee4a7a4cabdaaa1693ad7a0982d13cb", - "sha256:70db73351e0fcf11a76288c47a0469d9a330bcb2e7618c5eb57432b8caa82403", - "sha256:771f401692046845626cbdf1dd0f04e999413ede0ee9ad39033fe30b5fa2e845", - "sha256:7935026ec61b967cbc6b746c0ca75c1651ea118d7fee4d259cff9e6866153374", - "sha256:7b76b1cac9baac1980210e29145800954e7b42e91ef69c4d695de1cab87ce41f", - "sha256:7e3f37c11b6699b1a1e0fcc0e88829dba4f2866546381b05ab8b3f4db645a823", - "sha256:8370fa65ad421484894f559055f951843754153b72b9bca2ebdc5288efe2e3f0", - "sha256:8ae9c443d44a4e23252632e4d7775f419f992d0df3eff923e23775f5cc551d39", - "sha256:8b31d85f2781e44f1ffaaf7ea07f484e7d42317c677c355fa77b4a1a4bea7394", - "sha256:8b450336b27f3b375cadc474c6704838eaa8dd3ca312aac3bb69d92264a8e638", - "sha256:9ce84357388a76d886febff4e50e321c212ffd3248b590960b2da6e02404a5c9", - "sha256:a23e986fb0ba8e7407286add41fa0d4207be44e3dce1b04789f4757800eca1cf", - "sha256:a81610ee00d0da9cd2c8679479b7791149365b6dfb3971b01b22ee29b04787ce", - "sha256:b4e40444975e5ab0ed3004369209c39a28e084951daaeee4919f164b6b849b14", - "sha256:b66600de16702b9dfa74bea34524b55183a2183e5fd92f20fe6c2fcae550a64c", - "sha256:ba6ee18694d3673796b7a31b7d21254e87e9e43ca5be56f323fd396111255315", - "sha256:bd03837da28293baa39bdfc3cada69e2f8807f423ae06168aa28d2b32c63a6b6", - "sha256:bd2192070f88c0778ae1d68a0980fdece3473498c1db37f3794e3454f91e3ecf", - "sha256:c1f6f1a3cc013012cd1da913c40b13e6d721046a8c8a0ea0cde94069645a75db", - "sha256:ce10a8e7e067bde3c1fbf494d2b8859db510206030b0b67bc3af90b0eb1887b9", - "sha256:d31386d208303a5a6cf0819ef9f6db6680bab9e4ca8e48adb3d4b26ead89beb7", - "sha256:d83b3af53b201970973c5574b39df226746194063bb248a53fd12b470ac34319", - "sha256:df9657b212c054ac6d803290d7c4bcd7790af0b725984fce1eeb0a1e3f2d9798", - "sha256:e576e5fd3f129e6b3595dc734ac7f2b8c548f19ef07781194bc538dc9c0cdbbc", - "sha256:e7400358558094c1bcedc75f3b3c4f400c53130b44833848890a99968dee6a64", - "sha256:eb6a385f8577d30e4cb43dd555fb134ddaae1edeb84205e09dabec332bf49fd0", - "sha256:f27f0875e0873f6bf5df09a456bfcac0667824cabac4cad30b43f36e0382ffe7", - "sha256:fcd4a6d04995f1d66bc78b503e4e59ae72fd32aaec4f661657fe5ae5c1aa4ce3" + "sha256:0051c6f1f27cb756ffc0ffbac7d2cd48cb0362ac1736871399a739b2885134d3", + "sha256:00e44c8afdbe5467e4f7b5851be223be68adb4272f44696ee71fe46b7036a711", + "sha256:013d61294b6cd8fe3242932c1c5e36e5d1db2c8afb58606c5a67efce62c1f5fd", + "sha256:049fe7579230e44daef03a259faa24511d10ebfa44f69411d99e6a184fe68073", + "sha256:14d4f3cd4e8b524ae9b8aa567858beed70c392fdec26dbdb0a8a418392e71708", + "sha256:166eac03e48784a6a6e0e5f041cfebb1ab400b394db188c48b3a84737f505b67", + "sha256:17ff94e7a83aa8671a25bf5b59326ec26da379ace2ebc4411d690d80a7fbcf23", + "sha256:1e12bdc622676ce47ae9abbf455c189e442afdde8818d9da983085df6312e7a1", + "sha256:21915eb821a6b3d9d8eefdaf57d6c345b970ad722f856cd71739493ce003ad08", + "sha256:288c6a76705dc54fba69fbcb59904ae4ad768b4c768839b8ca5fdadec6dd8cfd", + "sha256:2bde6792f313f4e918caabc46532aa64aa27a0db05d75b20edfc5c6f46479de2", + "sha256:32ca72bbc673adbcfecb935bb3fb1b74e663d10a4b241aaa2f5a75fe1d1f90aa", + "sha256:356b3576ad078c89a6107caa9c50cc14e98e3a6c4874a37c3e0273e4baf33de8", + "sha256:40b951f601af999a8bf2ce8c71e8aaa4e8c6f78ff8afae7b808aae2dc50d4c40", + "sha256:572e1787d1460da79590bf44304abbc0a2da944ea64ec549188fa84d89bba7ab", + "sha256:58df5c2a0e293bf665a51f8a100d3e9956febfbf1d9aaf8c0677cf70218910c6", + "sha256:64e6175c2e53195278d7388c454e0b30997573f3f4bd63697f88d855f7a6a1fc", + "sha256:7227b47e73dedaa513cdebb98469705ef0d66eb5a1250144468e9c3097d6b59b", + "sha256:7418b6bfc7fe3331541b84bb2141c9baf1ec7132a7ecd9f375912eca810e714e", + "sha256:7cbd7574ce8e138bda9df4efc6bf2ab8572c9aff640d8ecfece1b006b68da963", + "sha256:7ff61ff178250f9bb3cd89752df0f1dd0e27316a8bd1465351652b1b4a4cdfd3", + "sha256:833e1551925ed51e6b44c800e71e77dacd7e49181fdc9ac9a0bf3714d515785d", + "sha256:8639cadfda96737427330a094476d4c7a56ac03de7265622fcf4cfe57c8ae18d", + "sha256:8c5d5b35f789a030ebb95bff352f1d27a93d81069f2adb3182d99882e095cefe", + "sha256:8c790abda465726cfb8bb08bd4ca9a5d0a7bd77c7ac1ca1b839ad823b948ea28", + "sha256:8d2f1fb53a421b410751887eb4ff21386d119ef9cde3797bf5e7ed49fb51a3b3", + "sha256:903bbd302a2378f984aef528f76d4c9b1748f318fe1294961c072bdc7f2ffa3e", + "sha256:93f81b134a165cc17123626ab8da2e30c0455441d4ab5576eed73a64c025b25c", + "sha256:95e69877983ea39b7303570fa6760f81a3eec23d0e3ab2021b7144b94d06202d", + "sha256:9633b3034d3d901f0a46b7939f8c4d64427dfba6bbc5a36b1a67364cf148a1b0", + "sha256:97e5306482182170ade15c4b0d8386ded995a07d7cc2ca8f27958d34d6736497", + "sha256:9f3cba480d3deb69f6ee2c1825060177a22c7826431458c697df88e6aeb3caee", + "sha256:aa5b467f15e78b82257319aebc78dd2915e4c1436c3c0d1ad6f53e47ba6e2713", + "sha256:abb7a75ed8b968f3061327c433a0fbd17b729947b400747c334a9c29a9af6c58", + "sha256:aec52725173bd3a7b56fe91bc56eccb26fbdff1386ef123abb63c84c5b43b63a", + "sha256:b11548073a2213d950c3f671aa88e6f83cda6e2fb97a8b6317b1b5b33d850e06", + "sha256:b1692f7d6bc45e3200844be0dba153612103db241691088626a33ff1f24a0d88", + "sha256:b336501a05e13b616ef81ce329c0e09ac5ed8c732d9ba7e3e983fcc1a9e86965", + "sha256:b8c008de9d0daba7b6666aa5bbfdc23dcd78cafc33997c9b7741ff6353bafb7f", + "sha256:b92e29e58bef6d9cfd340c72b04d74c4b4e9f70c9fa7c78b674d1fec18896dc4", + "sha256:be5f425ff1f5f4b3c1e33ad64ab994eed12fc284a6ea71c5243fd564502ecbe5", + "sha256:dd0b1e9e891f69e7675ba5c92e28b90eaa045f6ab134ffe70b52e948aa175b3c", + "sha256:e30f5ea4ae2346e62cedde8794a56858a67b878dd79f7df76a0767e356b1744a", + "sha256:e6a36bb9474218c7a5b27ae476035497a6990e21d04c279884eb10d9b290f1b1", + "sha256:e859fcb4cbe93504ea18008d1df98dee4f7766db66c435e4882ab35cf70cac43", + "sha256:eb6ea6da4c787111adf40f697b4e58732ee0942b5d3bd8f435277643329ba627", + "sha256:ec8c433b3ab0419100bd45b47c9c8551248a5aee30ca5e9d399a0b57ac04651b", + "sha256:eff9d20417ff9dcb0d25e2defc2574d10b491bf2e693b4e491914738b7908168", + "sha256:f0214eb2a23b85528310dad848ad2ac58e735612929c8072f6093f3585fd342d", + "sha256:f276df9830dba7a333544bd41070e8175762a7ac20350786b322b714b0e654f5", + "sha256:f3acda1924472472ddd60c29e5b9db0cec629fbe3c5c5accb74d6d6d14773478", + "sha256:f70a9e237bb792c7cc7e44c531fd48f5897961701cdaa06cf22fc14965c496cf", + "sha256:f9d29ca8a77117315101425ec7ec2a47a22ccf59f5593378fc4077ac5b754fce", + "sha256:fa877ca7f6b48054f847b61d6fa7bed5cebb663ebc55e018fda12db09dcc664c", + "sha256:fdcec0b8399108577ec290f55551d926d9a1fa6cad45882093a7a07ac5ec147b" ], - "markers": "python_version >= '3' and platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32')))))", - "version": "==2.0.0a2" + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==1.1.2" }, "gspread": { "hashes": [ @@ -393,11 +416,11 @@ }, "loguru": { "hashes": [ - "sha256:b28e72ac7a98be3d28ad28570299a393dfcd32e5e3f6a353dec94675767b6319", - "sha256:f8087ac396b5ee5f67c963b495d615ebbceac2796379599820e324419d53667c" + "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c", + "sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3" ], "index": "pypi", - "version": "==0.5.3" + "version": "==0.6.0" }, "lxml": { "hashes": [ @@ -588,6 +611,68 @@ "git": "https://github.com/bellingcat/polyphemus.git", "ref": "00a5123a3768a55ffe29f2c803a4181895f17890" }, + "psycopg2-binary": { + "hashes": [ + "sha256:01310cf4cf26db9aea5158c217caa92d291f0500051a6469ac52166e1a16f5b7", + "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76", + "sha256:090f3348c0ab2cceb6dfbe6bf721ef61262ddf518cd6cc6ecc7d334996d64efa", + "sha256:0a29729145aaaf1ad8bafe663131890e2111f13416b60e460dae0a96af5905c9", + "sha256:0c9d5450c566c80c396b7402895c4369a410cab5a82707b11aee1e624da7d004", + "sha256:10bb90fb4d523a2aa67773d4ff2b833ec00857f5912bafcfd5f5414e45280fb1", + "sha256:12b11322ea00ad8db8c46f18b7dfc47ae215e4df55b46c67a94b4effbaec7094", + "sha256:152f09f57417b831418304c7f30d727dc83a12761627bb826951692cc6491e57", + "sha256:15803fa813ea05bef089fa78835118b5434204f3a17cb9f1e5dbfd0b9deea5af", + "sha256:15c4e4cfa45f5a60599d9cec5f46cd7b1b29d86a6390ec23e8eebaae84e64554", + "sha256:183a517a3a63503f70f808b58bfbf962f23d73b6dccddae5aa56152ef2bcb232", + "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c", + "sha256:1f6b813106a3abdf7b03640d36e24669234120c72e91d5cbaeb87c5f7c36c65b", + "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834", + "sha256:2d872e3c9d5d075a2e104540965a1cf898b52274a5923936e5bfddb58c59c7c2", + "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71", + "sha256:3303f8807f342641851578ee7ed1f3efc9802d00a6f83c101d21c608cb864460", + "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e", + "sha256:3a79d622f5206d695d7824cbf609a4f5b88ea6d6dab5f7c147fc6d333a8787e4", + "sha256:404224e5fef3b193f892abdbf8961ce20e0b6642886cfe1fe1923f41aaa75c9d", + "sha256:46f0e0a6b5fa5851bbd9ab1bc805eef362d3a230fbdfbc209f4a236d0a7a990d", + "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9", + "sha256:526ea0378246d9b080148f2d6681229f4b5964543c170dd10bf4faaab6e0d27f", + "sha256:53293533fcbb94c202b7c800a12c873cfe24599656b341f56e71dd2b557be063", + "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478", + "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092", + "sha256:63638d875be8c2784cfc952c9ac34e2b50e43f9f0a0660b65e2a87d656b3116c", + "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce", + "sha256:68641a34023d306be959101b345732360fc2ea4938982309b786f7be1b43a4a1", + "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65", + "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e", + "sha256:7af0dd86ddb2f8af5da57a976d27cd2cd15510518d582b478fbb2292428710b4", + "sha256:7b1e9b80afca7b7a386ef087db614faebbf8839b7f4db5eb107d0f1a53225029", + "sha256:874a52ecab70af13e899f7847b3e074eeb16ebac5615665db33bce8a1009cf33", + "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39", + "sha256:8b344adbb9a862de0c635f4f0425b7958bf5a4b927c8594e6e8d261775796d53", + "sha256:8fc53f9af09426a61db9ba357865c77f26076d48669f2e1bb24d85a22fb52307", + "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42", + "sha256:93cd1967a18aa0edd4b95b1dfd554cf15af657cb606280996d393dadc88c3c35", + "sha256:99485cab9ba0fa9b84f1f9e1fef106f44a46ef6afdeec8885e0b88d0772b49e8", + "sha256:9d29409b625a143649d03d0fd7b57e4b92e0ecad9726ba682244b73be91d2fdb", + "sha256:a29b3ca4ec9defec6d42bf5feb36bb5817ba3c0230dd83b4edf4bf02684cd0ae", + "sha256:a9e1f75f96ea388fbcef36c70640c4efbe4650658f3d6a2967b4cc70e907352e", + "sha256:accfe7e982411da3178ec690baaceaad3c278652998b2c45828aaac66cd8285f", + "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba", + "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24", + "sha256:b1c8068513f5b158cf7e29c43a77eb34b407db29aca749d3eb9293ee0d3103ca", + "sha256:bda845b664bb6c91446ca9609fc69f7db6c334ec5e4adc87571c34e4f47b7ddb", + "sha256:c381bda330ddf2fccbafab789d83ebc6c53db126e4383e73794c74eedce855ef", + "sha256:c3ae8e75eb7160851e59adc77b3a19a976e50622e44fd4fd47b8b18208189d42", + "sha256:d1c1b569ecafe3a69380a94e6ae09a4789bbb23666f3d3a08d06bbd2451f5ef1", + "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667", + "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272", + "sha256:e3699852e22aa68c10de06524a3721ade969abf382da95884e6a10ff798f9281", + "sha256:e847774f8ffd5b398a75bc1c18fbb56564cda3d629fe68fd81971fece2d3c67e", + "sha256:ffb7a888a047696e7f8240d649b43fb3644f14f0ee229077e7f6b9f9081635bd" + ], + "index": "pypi", + "version": "==2.9.3" + }, "pyaes": { "hashes": [ "sha256:02c1b1405c38d3c370b085fb952dd8bea3fadcee6411ad99f312cc129c536d8f" @@ -711,6 +796,7 @@ "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7", "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c" ], + "markers": "python_version < '3.9'", "version": "==2022.1" }, "pytz-deprecation-shim": { @@ -725,6 +811,7 @@ "hashes": [ "sha256:af8a9b64b821529aca09ebaf6d8d279100d766f19e90b5059ac6a718ca6dee42" ], + "index": "pypi", "version": "==2.2.1" }, "regex": { @@ -808,6 +895,9 @@ "version": "==2022.3.2" }, "requests": { + "extras": [ + "socks" + ], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" @@ -828,7 +918,7 @@ "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb" ], - "markers": "python_version >= '3.6' and python_version < '4'", + "markers": "python_version >= '3.6'", "version": "==4.8" }, "s3transfer": { @@ -849,7 +939,7 @@ }, "snscrape": { "git": "https://github.com/bellingcat/snscrape.git", - "ref": "fb8d73ac95011b7ad848a6048d3eed1880e80f21" + "ref": "d32c9add8a3691c81c9091dc1a7d079e9871379f" }, "soupsieve": { "hashes": [ @@ -913,7 +1003,7 @@ "sha256:4230a49119a416c88cc47d0d2d32d5d90f1a282d5e497d49801950704e49863d", "sha256:6461b009d6792008d0000e1b0c7ca50195ec78c0e808a3a6b668a56a3236c3a5" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "index": "pypi", "version": "==4.63.1" }, "tzdata": { @@ -1073,11 +1163,11 @@ }, "click": { "hashes": [ - "sha256:19a4baa64da924c5e0cd889aba8e947f280309f1a2ce0947a3e3a7bcb7cc72d6", - "sha256:977c213473c7665d3aa092b41ff12063227751c41d7b17165013e10069cc5cd2" + "sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b", + "sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976" ], "markers": "python_version >= '3.7'", - "version": "==8.1.0" + "version": "==8.1.1" }, "coverage": { "extras": [ @@ -1321,9 +1411,13 @@ "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7", "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c" ], + "markers": "python_version < '3.9'", "version": "==2022.1" }, "requests": { + "extras": [ + "socks" + ], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" diff --git a/app.py b/app.py new file mode 100644 index 0000000..a55faab --- /dev/null +++ b/app.py @@ -0,0 +1,135 @@ +import argparse +from loguru import logger +import gspread +from sqlalchemy import create_engine, func +from sqlalchemy.orm import sessionmaker +import os +import time + +from cisticola.base import Channel, RawChannelInfo, mapper_registry +from cisticola.scraper import ( + ScraperController, + BitchuteScraper, + GabScraper, + GettrScraper, + OdyseeScraper, + RumbleScraper, + TelegramSnscrapeScraper, + TelegramTelethonScraper, + TwitterScraper) + +def sync_channels(args): + logger.info("Synchronizing channels") + + session = get_db_session() + + gc = gspread.service_account(filename='service_account.json') + + # Open a sheet from a spreadsheet in one go + wks = gc.open_by_url(args.gsheet).worksheet("channels") + channels = wks.get_all_records() + row = 2 + + for c in channels: + # only adding channels, so skip everything with an ID + if c['id'] == '': + del c['id'] + del c['followers'] + + if c['public'] == '': c['public'] = False + if c['chat'] == '': c['chat'] = False + + for k in c.keys(): + if c[k] == 'TRUE' or c[k] == 'yes': c[k] = True + if c[k] == 'FALSE' or c[k] == 'no': c[k] = False + + if c[k] == '': c[k] = None + + # check to see if this already exists, + platform_id = None + if c['platform_id'] != '': + platform_id = c['platform_id'] + + channel = session.query(Channel).filter_by(platform_id=platform_id, platform=c['platform'], url=c['url']).first() + + if not channel: + channel = Channel(**c, source='researcher') + logger.debug(f"{channel} does not exist, adding") + session.add(channel) + session.flush() + session.commit() + + wks.update_cell(row, 1, channel.id) + time.sleep(1) + + row += 1 + + session.commit() + +def get_db_session(): + engine = create_engine(os.environ['DB']) + + session_generator = sessionmaker() + session_generator.configure(bind=engine) + session = session_generator() + + return session + +def get_scraper_controller(): + engine = create_engine(os.environ['DB']) + + controller = ScraperController() + controller.connect_to_db(engine) + + scrapers = [ + TelegramTelethonScraper(), + TwitterScraper()] + + controller.register_scrapers(scrapers) + + return controller + +def scrape_channels(args): + logger.info(f"Scraping channels, media: {args.media}") + + controller = get_scraper_controller() + controller.scrape_all_channels(archive_media = args.media) + +def scrape_channel_info(args): + logger.info(f"Scraping channel info") + + controller = get_scraper_controller() + controller.scrape_all_channel_info() + +def archive_media(args): + logger.info(f"Archiving unarchived media") + + controller = get_scraper_controller() + controller.archive_unarchived_media() + +def init_db(): + engine = create_engine(os.environ['DB']) + mapper_registry.metadata.create_all(bind=engine) + +if __name__ == '__main__': + logger.add("./test.log", level="TRACE") + + parser = argparse.ArgumentParser(description = 'Cisticola command line tools') + parser.add_argument('command', type=str, help='Command to run: "sync-channels", "scrape-channels", or "archive-media"') + parser.add_argument('--gsheet', type=str, help='[sync-channels] URL of Google Sheet to synchronize') + parser.add_argument('--media', action='store_true', help='[scrape-channels] Add this flag to media') + + args = parser.parse_args() + + if args.command == 'init-db': + init_db() + elif args.command == 'sync-channels': + sync_channels(args) + elif args.command == 'scrape-channels': + scrape_channels(args) + elif args.command == 'archive-media': + archive_media(args) + elif args.command == 'channel-info': + scrape_channel_info(args) + else: + logger.error(f"Unrecognized command {args.command}") diff --git a/cisticola/base.py b/cisticola/base.py index ff7f136..37c897b 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -34,7 +34,7 @@ class ScraperResult: date: datetime #: JSON dump of dict that contains all data scraped for the post. - raw_data: str + raw_posts: str #: Datetime (relative to UTC) that the scraped post was archived at. date_archived: datetime @@ -44,7 +44,7 @@ class ScraperResult: #: Has the media in this post been archived? media_archived: bool - + @dataclass class Channel: """Information about a specific channel to be scraped. @@ -89,11 +89,31 @@ class Channel: def hydrate(self): pass +@dataclass +class RawChannelInfo: + """A minimally processed result from a scraper + """ + + #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``. + scraper: str + + #: Name of platform from which result was scraped, e.g. ``"Twitter"``. + platform: str + + #: Foreign key of channel ID that this was scraped from + channel: int + + #: JSON dump of dict that contains all data scraped for the post. + raw_data: str + + #: Datetime (relative to UTC) that the scraped post was archived at. + date_archived: datetime + @dataclass class Post: """An object with fields for columns in the analysis table""" - #: ID number of the scraped post in the ``raw_data`` table + #: ID number of the scraped post in the ``raw_posts`` table raw_id: int #: Platform specific post ID @@ -144,7 +164,7 @@ class Media: """Base class for organizing information about a media file. """ - #: ID number of the media's corresponding scraped post in the ``raw_data`` table. + #: ID number of the media's corresponding scraped post in the ``raw_posts`` table. raw_id: int #: ID number of the media's corresponging scraped post in the ``analysis`` table. @@ -221,7 +241,7 @@ class Video(Media): mapper_registry = registry() -raw_data_table = Table('raw_data', mapper_registry.metadata, +raw_posts_table = Table('raw_posts', mapper_registry.metadata, Column('id', Integer, primary_key=True, autoincrement=True), Column('scraper', String), @@ -229,15 +249,23 @@ raw_data_table = Table('raw_data', mapper_registry.metadata, Column('channel', Integer, ForeignKey('channels.id')), Column('platform_id', String), Column('date', DateTime), - Column('raw_data', String), + Column('raw_posts', String), Column('date_archived', DateTime), Column('archived_urls', JSON), Column('media_archived', Boolean)) +raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata, + Column('id', Integer, primary_key=True), + Column('scraper', String), + Column('platform', String), + Column('channel', Integer, ForeignKey('channels.id')), + Column('raw_data', String), + Column('date_archived', DateTime)) + channel_table = Table('channels', mapper_registry.metadata, Column('id', Integer, primary_key=True, autoincrement=True), Column('name', String), - Column('platform_id', Integer), + Column('platform_id', String), Column('category', String), Column('platform', String), Column('url', String), @@ -253,7 +281,7 @@ channel_table = Table('channels', mapper_registry.metadata, post_table = Table('posts', mapper_registry.metadata, Column('id', Integer, primary_key=True, autoincrement=True), - Column('raw_id', Integer, ForeignKey('raw_data.id')), + Column('raw_id', Integer, ForeignKey('raw_posts.id')), Column('platform_id', Integer), Column('scraper', String), Column('transformer', String), @@ -273,7 +301,7 @@ media_table = Table('media', mapper_registry.metadata, Column('id', Integer, primary_key=True, autoincrement=True), Column('type', String), - Column('raw_id', Integer, ForeignKey('raw_data.id')), + Column('raw_id', Integer, ForeignKey('raw_posts.id')), Column('post', Integer, ForeignKey('posts.id')), Column('url', String), Column('original_url', String), @@ -282,7 +310,8 @@ media_table = Table('media', mapper_registry.metadata, mapper_registry.map_imperatively(Post, post_table) mapper_registry.map_imperatively(Channel, channel_table) -mapper_registry.map_imperatively(ScraperResult, raw_data_table) +mapper_registry.map_imperatively(ScraperResult, raw_posts_table) +mapper_registry.map_imperatively(RawChannelInfo, raw_channel_info_table) mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media') mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image') mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video') \ No newline at end of file diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 023fa3c..fb25b58 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -303,6 +303,9 @@ class ScraperController: """ self.scrapers.extend(scraper) + def remove_all_scrapers(self): + self.scrapers = [] + def scrape_all_channels(self, archive_media: bool = True): if self.session is None: logger.error("No DB session") @@ -313,6 +316,17 @@ class ScraperController: channels = session.query(Channel).where(Channel.source=='researcher').all() return self.scrape_channels(channels, archive_media=archive_media) + + def scrape_all_channel_info(self): + if self.session is None: + logger.error("No DB session") + return + + session = self.session() + + channels = session.query(Channel).where(Channel.source=='researcher').all() + + return self.scrape_channel_info(channels) @logger.catch(reraise = True) def scrape_channels(self, channels: List[Channel], archive_media: bool = True): @@ -336,6 +350,7 @@ class ScraperController: for scraper in self.scrapers: if scraper.can_handle(channel): + logger.debug(f"{scraper} is handling {channel}") handled = True added = 0 @@ -382,7 +397,7 @@ class ScraperController: for scraper in self.scrapers: if scraper.__version__ == post.scraper: handled = True - logger.info(f"{scraper} is archiving media for {post}") + logger.debug(f"{scraper} is archiving media for ID {post.id}") post = scraper.archive_files(post) if post: @@ -396,6 +411,48 @@ class ScraperController: session.commit() + @logger.catch(reraise = True) + def scrape_channel_info(self, channels: List[Channel]): + """Scrape channel info for specified channels. + + Parameters + ---------- + channels: list + List of Channel instances to be scraped + archive_media: bool + If ``True``, any media files (images, video, etc.) from posts are archived. + If ``False``, media files are not archived. + """ + + if self.session is None: + logger.error("No DB session") + return + + for channel in channels: + handled = False + + for scraper in self.scrapers: + if scraper.can_handle(channel): + logger.debug(f"{scraper} is getting channel info for {channel}") + handled = True + + # get most recent post + session = self.session() + + try: + info = scraper.get_profile(channel) + session.add(info) + + session.commit() + logger.info( + f"{scraper} found {info}") + break + except ChannelDoesNotExistError: + logger.warning(f"ChannelDoesNotExist {channel}") + + if not handled: + logger.warning(f"No handler found for Channel {channel}") + def connect_to_db(self, engine): """Connect the specified SQLAlchemy engine to the controller. """ diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index e9a9770..034e3ac 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -9,7 +9,7 @@ from typing import Generator import requests from bs4 import BeautifulSoup -from cisticola.base import Channel, ScraperResult +from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.scraper.base import Scraper class BitchuteScraper(Scraper): @@ -57,7 +57,7 @@ class BitchuteScraper(Scraper): platform_id=post['id'], date=datetime.fromtimestamp(post['timestamp']), date_archived=datetime.now(timezone.utc), - raw_data=json.dumps(post), + raw_posts=json.dumps(post), archived_urls=archived_urls, media_archived=archive_media) @@ -65,7 +65,7 @@ class BitchuteScraper(Scraper): if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None: return True - def get_profile(self, channel: Channel) -> dict: + def get_profile(self, channel: Channel) -> RawChannelInfo: base_url = channel.url @@ -106,8 +106,12 @@ class BitchuteScraper(Scraper): 'subscribers': counts['subscriber_count'], 'views': int(counts['about_view_count'].split(' ')[0])} - return profile - + + return RawChannelInfo(scraper=self.__version__, + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(profile), + date_archived=datetime.now(timezone.utc)) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# def strip_tags(html, convert_newlines=True): diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index d1b6fbb..f66d562 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -5,7 +5,7 @@ import os from gabber.client import Client, GAB_API_BASE_URL -from cisticola.base import Channel, ScraperResult +from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.scraper.base import Scraper class GabScraper(Scraper): @@ -80,7 +80,7 @@ class GabScraper(Scraper): platform_id=post['id'], date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), - raw_data=json.dumps(post), + raw_posts=json.dumps(post), archived_urls=archived_urls, media_archived=archive_media) @@ -88,7 +88,7 @@ class GabScraper(Scraper): if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None: return True - def get_profile(self, channel: Channel) -> dict: + def get_profile(self, channel: Channel) -> RawChannelInfo: client = Client( username = os.environ['GAB_USER'], @@ -106,4 +106,8 @@ class GabScraper(Scraper): profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json() - return profile \ No newline at end of file + return RawChannelInfo(scraper=self.__version__, + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(profile), + date_archived=datetime.now(timezone.utc)) diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index c8e63f9..a5088cd 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -5,7 +5,7 @@ from urllib.parse import urlparse from gogettr import PublicClient -from cisticola.base import Channel, ScraperResult +from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.scraper.base import Scraper class GettrScraper(Scraper): @@ -58,7 +58,7 @@ class GettrScraper(Scraper): platform_id=post['_id'], date=datetime.fromtimestamp(post['cdate']/1000.), date_archived=datetime.now(timezone.utc), - raw_data=json.dumps(post), + raw_posts=json.dumps(post), archived_urls=archived_urls, media_archived=archive_media) @@ -71,9 +71,13 @@ class GettrScraper(Scraper): key = urlparse(url).path.split('/')[-2] + ext return key - def get_profile(self, channel: Channel) -> dict: + def get_profile(self, channel: Channel) -> RawChannelInfo: client = client = PublicClient() username = self.get_username_from_url(channel.url) profile = client.user_info(username) - return profile \ No newline at end of file + return RawChannelInfo(scraper=self.__version__, + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(profile), + date_archived=datetime.now(timezone.utc)) diff --git a/cisticola/scraper/instagram.py b/cisticola/scraper/instagram.py index f045011..dfe0304 100644 --- a/cisticola/scraper/instagram.py +++ b/cisticola/scraper/instagram.py @@ -8,7 +8,7 @@ from pathlib import Path from loguru import logger import instaloader -from cisticola.base import Channel, ScraperResult +from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.scraper.base import Scraper BASE_URL = 'https://www.instagram.com/' @@ -79,7 +79,7 @@ class InstagramScraper(Scraper): platform_id=post.mediaid, date=post.date_utc, date_archived=datetime.now(timezone.utc), - raw_data=json.dumps(post._asdict(), default=str), + raw_posts=json.dumps(post._asdict(), default=str), archived_urls=archived_urls, media_archived=archive_media) @@ -96,7 +96,7 @@ class InstagramScraper(Scraper): platform_id=post.mediaid, date=comment.created_at_utc, date_archived=datetime.now(timezone.utc), - raw_data=json.dumps(comment_dict, default=str), + raw_posts=json.dumps(comment_dict, default=str), archived_urls={}, media_archived=archive_media) @@ -104,7 +104,7 @@ class InstagramScraper(Scraper): if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None: return True - def get_profile(self, channel: Channel) -> dict: + def get_profile(self, channel: Channel) -> RawChannelInfo: username = self.get_username_from_url(channel.url) @@ -125,4 +125,8 @@ class InstagramScraper(Scraper): profile['followers'] = user_profile.followers profile['followees'] = user_profile.followees - return profile \ No newline at end of file + return RawChannelInfo(scraper=self.__version__, + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(profile), + date_archived=datetime.now(timezone.utc)) diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index 4ff80e0..0f5db65 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -8,7 +8,7 @@ from loguru import logger from polyphemus.base import OdyseeChannel from polyphemus.api import get_auth_token -from cisticola.base import Channel, ScraperResult +from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.scraper.base import Scraper class OdyseeScraper(Scraper): @@ -60,7 +60,7 @@ class OdyseeScraper(Scraper): platform_id=video.info['claim_id'], date=datetime.fromtimestamp(video.info['created']), date_archived=datetime.now(timezone.utc), - raw_data=json.dumps(video.info), + raw_posts=json.dumps(video.info), archived_urls=archived_urls, media_archived=archive_media) @@ -73,7 +73,7 @@ class OdyseeScraper(Scraper): platform_id=comment.info['claim_id'], date=datetime.fromtimestamp(comment.info['created']), date_archived=datetime.now(), - raw_data=json.dumps(comment.info), + raw_posts=json.dumps(comment.info), archived_urls={}, media_archived=True) @@ -87,10 +87,14 @@ class OdyseeScraper(Scraper): return f'{key}.{ext}' - def get_profile(self, channel: Channel) -> dict: + def get_profile(self, channel: Channel) -> RawChannelInfo: username = self.get_username_from_url(channel.url) odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token) profile = odysee_channel.info - return profile \ No newline at end of file + return RawChannelInfo(scraper=self.__version__, + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(profile), + date_archived=datetime.now(timezone.utc)) \ No newline at end of file diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index 39a29ba..cb24c57 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -5,7 +5,7 @@ from urllib.parse import urlparse from bs4 import BeautifulSoup -from cisticola.base import Channel, ScraperResult +from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.scraper import Scraper, make_request BASE_URL = 'https://rumble.com' @@ -39,7 +39,7 @@ class RumbleScraper(Scraper): platform_id=post['media_url'].split('/')[-2], date=post['datetime'].replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), - raw_data=json.dumps(post, default = str), + raw_posts=json.dumps(post, default = str), archived_urls=archived_urls, media_archived=archive_media) @@ -52,11 +52,15 @@ class RumbleScraper(Scraper): if channel.platform == "Rumble" and channel.url is not None: return True - def get_profile(self, channel: Channel) -> dict: + def get_profile(self, channel: Channel) -> RawChannelInfo: profile = get_channel_profile(url = channel.url) - return profile + return RawChannelInfo(scraper=self.__version__, + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(profile), + date_archived=datetime.now(timezone.utc)) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# @@ -128,6 +132,7 @@ def get_channel_profile(url): 'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None, 'cover': cover_soup.get('src') if cover_soup else None, 'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text} + return profile #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index d181609..e683296 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -1,10 +1,10 @@ from typing import Generator from datetime import datetime, timezone - +import json import snscrape.modules from loguru import logger -from cisticola.base import Channel, ScraperResult +from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.scraper.base import Scraper class TelegramSnscrapeScraper(Scraper): @@ -49,15 +49,20 @@ class TelegramSnscrapeScraper(Scraper): platform_id=post.url, date=post.date, date_archived=datetime.now(timezone.utc), - raw_data=post.json(), + raw_posts=post.json(), archived_urls=archived_urls, media_archived=archive_media ) - def get_profile(self, channel: Channel) -> dict: + def get_profile(self, channel: Channel) -> RawChannelInfo: scr = snscrape.modules.telegram.TelegramChannelScraper( channel.screenname) profile = scr._get_entity().__dict__ - return profile \ No newline at end of file + + return RawChannelInfo(scraper=self.__version__, + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(profile), + date_archived=datetime.now(timezone.utc)) diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py index b300551..fb9a58f 100644 --- a/cisticola/scraper/telegram_telethon.py +++ b/cisticola/scraper/telegram_telethon.py @@ -11,7 +11,7 @@ from telethon.sync import TelegramClient from telethon.tl.functions.channels import GetFullChannelRequest from telethon.tl import types -from cisticola.base import Channel, ScraperResult +from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.scraper.base import Scraper MEDIA_TYPES = ['photo', 'video', 'document', 'webpage'] @@ -44,7 +44,7 @@ class TelegramTelethonScraper(Scraper): key = list(result.archived_urls.keys())[0] if result.archived_urls[key] is None: - raw = json.loads(result.raw_data) + raw = json.loads(result.raw_posts) message = client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']]) @@ -66,13 +66,10 @@ class TelegramTelethonScraper(Scraper): return result def archive_post_media(self, post : types.Message, client : TelegramClient = None): - logger.debug(f"Archiving post {post}") - if post.media is None: + logger.debug("No media for post") return None, None - logger.debug(f"Archiving media {post.media}") - if client is None: api_id = os.environ['TELEGRAM_API_ID'] api_hash = os.environ['TELEGRAM_API_HASH'] @@ -81,6 +78,11 @@ class TelegramTelethonScraper(Scraper): with TelegramClient(phone, api_id, api_hash) as client: return self.archive_post_media(post, client=client) + if type(post.media) == types.MessageMediaDocument: + logger.debug(f"Archiving {type(post.media)} with size {post.media.document.size/(1024*1024)} MB") + else: + logger.debug(f"Archiving {type(post.media)}") + key = f'{post.peer_id.channel_id}_{post.id}' with tempfile.TemporaryDirectory() as temp_dir: @@ -88,6 +90,10 @@ class TelegramTelethonScraper(Scraper): client.download_media(post.media, output_file) + if len(os.listdir(temp_dir)) == 0: + logger.warning(f"No file present. Could not archive {post.media}") + return None, None + output_file_with_ext = os.listdir(temp_dir)[0] filename = Path(temp_dir, output_file_with_ext) @@ -96,11 +102,13 @@ class TelegramTelethonScraper(Scraper): return (blob, output_file_with_ext) def can_handle(self, channel): - if channel.platform == "Telegram" and channel.public and not channel.chat: + if channel.platform == "Telegram" and channel.public: return True def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: - username = self.get_username_from_url(channel.url) + username = channel.screenname + if username is None: + username = self.get_username_from_url(channel.url) api_id = os.environ['TELEGRAM_API_ID'] api_hash = os.environ['TELEGRAM_API_HASH'] @@ -110,14 +118,13 @@ class TelegramTelethonScraper(Scraper): for post in client.iter_messages(username): post_url = f'{channel.url}/{post.id}' - logger.info(f"Archiving post {post_url} from {post.date}") + logger.trace(f"Archiving post {post_url} from {post.date}") if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}') break archived_urls = {} - logger.info(f"Archiving post {post_url}") if post.media is not None: archived_urls[post_url] = None @@ -136,13 +143,14 @@ class TelegramTelethonScraper(Scraper): platform_id=post_url, date=post.date.replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), - raw_data=json.dumps(post.to_dict(), default=str), + raw_posts=json.dumps(post.to_dict(), default=str), archived_urls=archived_urls, media_archived=archive_media) - def get_profile(self, channel: Channel) -> dict: - - username = self.get_username_from_url(channel.url) + def get_profile(self, channel: Channel) -> RawChannelInfo: + username = channel.screenname + if username is None: + username = self.get_username_from_url(channel.url) api_id = os.environ['TELEGRAM_API_ID'] api_hash = os.environ['TELEGRAM_API_HASH'] @@ -150,6 +158,10 @@ class TelegramTelethonScraper(Scraper): with TelegramClient(phone, api_id, api_hash) as client: full_channel = client(GetFullChannelRequest(channel = username)) - profile = full_channel.__dict__ + profile = full_channel.to_dict() - return profile + return RawChannelInfo(scraper=self.__version__, + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(profile, default=str), + date_archived=datetime.now(timezone.utc)) diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index cc0afb9..6ed37db 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -1,11 +1,11 @@ from datetime import datetime, timezone from typing import Generator from urllib.parse import urlparse, parse_qs - from snscrape.modules.twitter import TwitterProfileScraper, TwitterUserScraper, Video, Gif, Photo from loguru import logger +import json -from cisticola.base import Channel, ScraperResult +from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.scraper.base import Scraper, ChannelDoesNotExistError class TwitterScraper(Scraper): @@ -13,7 +13,12 @@ class TwitterScraper(Scraper): __version__ = "TwitterScraper 0.0.1" def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: - scraper = TwitterProfileScraper(channel.platform_id) + if channel.platform_id: + identifier = channel.platform_id + else: + identifier = channel.screenname + + scraper = TwitterProfileScraper(identifier) first = True @@ -32,10 +37,10 @@ class TwitterScraper(Scraper): if tweet.media: media_list += tweet.media - if tweet.retweetedTweet and tweet.retweetedTweet.media: + if tweet.retweetedTweet and hasattr(tweet.retweetedTweet, 'media') and tweet.retweetedTweet.media: media_list += tweet.retweetedTweet.media - if tweet.quotedTweet and tweet.quotedTweet.media: + if tweet.quotedTweet and hasattr(tweet.quotedTweet, 'media') and tweet.quotedTweet.media: media_list += tweet.quotedTweet.media for media in media_list: @@ -66,12 +71,12 @@ class TwitterScraper(Scraper): platform_id=tweet.id, date=tweet.date, date_archived=datetime.now(timezone.utc), - raw_data=tweet.json(), + raw_posts=tweet.json(), archived_urls=archived_urls, media_archived=archive_media) def can_handle(self, channel): - if channel.platform == "Twitter" and channel.platform_id: + if channel.platform == "Twitter" and (channel.platform_id or channel.screenname): return True def url_to_key(self, url: str, content_type: str) -> str: @@ -91,7 +96,7 @@ class TwitterScraper(Scraper): key = parsed_url.path.split('/')[-1] + ext return key - def get_profile(self, channel: Channel) -> dict: + def get_profile(self, channel: Channel) -> RawChannelInfo: scraper = TwitterUserScraper(channel.screenname) entity = scraper._get_entity() @@ -99,4 +104,8 @@ class TwitterScraper(Scraper): if entity is None: raise ChannelDoesNotExistError(channel.url) else: - return entity.__dict__ \ No newline at end of file + return RawChannelInfo(scraper=self.__version__, + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(entity.__dict__, default=str), + date_archived=datetime.now(timezone.utc)) diff --git a/cisticola/scraper/vkontakte.py b/cisticola/scraper/vkontakte.py index 97724c6..3f23bca 100644 --- a/cisticola/scraper/vkontakte.py +++ b/cisticola/scraper/vkontakte.py @@ -1,11 +1,10 @@ from datetime import datetime, timezone from typing import Generator from urllib.parse import urlparse - from snscrape.modules.vkontakte import VKontakteUserScraper from loguru import logger -from cisticola.base import Channel, ScraperResult +from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.scraper.base import Scraper class VkontakteScraper(Scraper): @@ -62,7 +61,7 @@ class VkontakteScraper(Scraper): platform_id=post.url.split('/')[-1], date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), - raw_data=post.json(), + raw_posts=post.json(), archived_urls=archived_urls, media_archived=archive_media) @@ -80,10 +79,15 @@ class VkontakteScraper(Scraper): return key - def get_profile(self, channel: Channel) -> dict: + def get_profile(self, channel: Channel) -> RawChannelInfo: username = self.get_username_from_url(channel.url) scraper = VKontakteUserScraper(username) profile = scraper._get_entity().__dict__ - return profile \ No newline at end of file + + return RawChannelInfo(scraper=self.__version__, + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(profile), + date_archived=datetime.now(timezone.utc)) diff --git a/cisticola/scraper/youtube.py b/cisticola/scraper/youtube.py index 1e2346b..0d4c8e3 100644 --- a/cisticola/scraper/youtube.py +++ b/cisticola/scraper/youtube.py @@ -2,10 +2,9 @@ from datetime import datetime, timezone import json from typing import Generator import tempfile - import yt_dlp -from cisticola.base import Channel, ScraperResult +from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.scraper import Scraper class YoutubeScraper(Scraper): @@ -71,7 +70,7 @@ class YoutubeScraper(Scraper): platform_id=video_id, date=datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), - raw_data=json.dumps(video, default = str), + raw_posts=json.dumps(video, default = str), archived_urls=archived_urls, media_archived=archive_media) @@ -79,8 +78,7 @@ class YoutubeScraper(Scraper): if channel.platform == "Youtube" and channel.url: return True - def get_profile(self, channel: Channel) -> dict: - + def get_profile(self, channel: Channel) -> RawChannelInfo: ydl_opts = {} ydl = yt_dlp.YoutubeDL(ydl_opts) @@ -89,7 +87,12 @@ class YoutubeScraper(Scraper): meta = ydl.extract_info( channel.url, process=False) + + return RawChannelInfo(scraper=self.__version__, + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(meta), + date_archived=datetime.now(timezone.utc)) + except yt_dlp.utils.DownloadError as e: raise e - - return meta \ No newline at end of file diff --git a/cisticola/transformer/bitchute.py b/cisticola/transformer/bitchute.py index d0c5fe0..61b327d 100644 --- a/cisticola/transformer/bitchute.py +++ b/cisticola/transformer/bitchute.py @@ -20,7 +20,7 @@ class BitchuteTransformer(Transformer): return False def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]: - raw = json.loads(data.raw_data) + raw = json.loads(data.raw_posts) orig = raw['video_url'] new = data.archived_urls[orig] @@ -30,7 +30,7 @@ class BitchuteTransformer(Transformer): yield m def transform(self, data: ScraperResult) -> Post: - raw = json.loads(data.raw_data) + raw = json.loads(data.raw_posts) soup = BeautifulSoup(raw['body'], features = 'html.parser') content = soup.find_all('p')[-1].text diff --git a/cisticola/transformer/twitter.py b/cisticola/transformer/twitter.py index 85ada05..8fa2e68 100644 --- a/cisticola/transformer/twitter.py +++ b/cisticola/transformer/twitter.py @@ -47,7 +47,7 @@ class TwitterTransformer(Transformer): def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]: - raw = json.loads(data.raw_data) + raw = json.loads(data.raw_posts) transformed = Post( raw_id=data.id, diff --git a/test.py b/test.py deleted file mode 100644 index 8726820..0000000 --- a/test.py +++ /dev/null @@ -1,74 +0,0 @@ -from sqlalchemy import create_engine -from loguru import logger -import gspread -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker - -from cisticola.base import Channel, Post, ScraperResult, mapper_registry -from cisticola.scraper import ( - ScraperController, - BitchuteScraper, - GabScraper, - GettrScraper, - OdyseeScraper, - RumbleScraper, - TelegramSnscrapeScraper, - TelegramTelethonScraper, - TwitterScraper) -from cisticola.transformer import ETLController -from cisticola.transformer.twitter import TwitterTransformer - -logger.add("../test.log") - -controller = ScraperController() - -scrapers = [ - BitchuteScraper(), - GabScraper(), - GettrScraper(), - OdyseeScraper(), - RumbleScraper(), - TelegramTelethonScraper(), - TwitterScraper()] - -controller.register_scrapers(scrapers) - -engine = create_engine('sqlite:///test.db') -mapper_registry.metadata.create_all(bind=engine) -session_generator = sessionmaker() -session_generator.configure(bind=engine) -session = session_generator() - -gc = gspread.service_account(filename='service_account.json') - -# Open a sheet from a spreadsheet in one go -wks = gc.open_by_url("https://docs.google.com/spreadsheets/d/1k5VgqREoA3v1r7bkVq7TOTRDtdYqTMWkQnsZpRbntpw/edit#gid=0") -channels = wks.worksheet("channels").get_all_records() - -for c in channels: - del c['followers'] - - for k in c.keys(): - if c[k] == 'TRUE' or c[k] == 'yes': c[k] = True - if c[k] == 'FALSE' or c[k] == 'no': c[k] = False - - # check to see if this already exists, - channel = session.query(Channel).filter_by(platform_id=c['platform_id'], platform=c['platform']).first() - - if not channel: - channel = Channel(**c, source='researcher') - session.add(channel) - -session.commit() - -controller.connect_to_db(engine) -controller.scrape_all_channels(archive_media = False) - -controller.archive_unarchived_media() - -# transformer = TwitterTransformer() - -# etl_controller = ETLController() -# etl_controller.register_transformer(transformer) -# etl_controller.connect_to_db(engine) -# etl_controller.transform_all_untransformed() diff --git a/tests/scraper/telegram_telethon.py b/tests/scraper/telegram_telethon.py index c6fb399..ee994eb 100644 --- a/tests/scraper/telegram_telethon.py +++ b/tests/scraper/telegram_telethon.py @@ -4,6 +4,7 @@ from cisticola.base import Channel from cisticola.scraper import TelegramTelethonScraper def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs): + controller.remove_all_scrapers() channels = [Channel(**channel_kwargs['telegram'])] controller.register_scraper(scraper = TelegramTelethonScraper()) @@ -13,6 +14,7 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs): def test_scrape_telegram_telethon_channel(controller, channel_kwargs): controller.reset_db() + controller.remove_all_scrapers() channels = [Channel(**channel_kwargs['telegram'])] controller.register_scraper(scraper = TelegramTelethonScraper())