Merge pull request #20 from bellingcat/separate-media-archiving

WIP: Separate media archiving and CLI
This commit is contained in:
Logan Williams
2022-03-31 16:28:30 +02:00
committed by GitHub
21 changed files with 540 additions and 235 deletions

3
.gitignore vendored
View File

@@ -11,10 +11,11 @@ docs/source/_*
*.session
service_account.json
.vscode/
*.log
# Unit test / coverage reports
reports
.coverage
.coverage*
.cache
.pytest_cache/
cover/

View File

@@ -22,6 +22,9 @@ instaloader = "*"
gspread = "*"
cryptg = "*"
gabber = {git = "https://github.com/stanfordio/gabber.git"}
psycopg2-binary = "*"
tqdm = "*"
ratelimit = "*"
[dev-packages]
pytest = "*"
@@ -33,7 +36,7 @@ sphinx = "*"
sphinx_rtd_theme = "*"
[requires]
python_version = "3.9"
python_version = "3.8"
[pipenv]
allow_prereleases = true

246
Pipfile.lock generated
View File

@@ -1,11 +1,11 @@
{
"_meta": {
"hash": {
"sha256": "b712e767d64e54e83e8c2d8a27a68203583ed7ad31d4ea3b4b6076a72a2150fd"
"sha256": "e57f79178ac0e05f9753a29f97e08d2ae96b7775044bb4c6ba616baae1d21183"
},
"pipfile-spec": 6,
"requires": {
"python_version": "3.9"
"python_version": "3.8"
},
"sources": [
{
@@ -16,6 +16,28 @@
]
},
"default": {
"backports.zoneinfo": {
"hashes": [
"sha256:17746bd546106fa389c51dbea67c8b7c8f0d14b5526a579ca6ccf5ed72c526cf",
"sha256:1b13e654a55cd45672cb54ed12148cd33628f672548f373963b0bff67b217328",
"sha256:1c5742112073a563c81f786e77514969acb58649bcdf6cdf0b4ed31a348d4546",
"sha256:4a0f800587060bf8880f954dbef70de6c11bbe59c673c3d818921f042f9954a6",
"sha256:5c144945a7752ca544b4b78c8c41544cdfaf9786f25fe5ffb10e838e19a27570",
"sha256:7b0a64cda4145548fed9efc10322770f929b944ce5cee6c0dfe0c87bf4c0c8c9",
"sha256:8439c030a11780786a2002261569bdf362264f605dfa4d65090b64b05c9f79a7",
"sha256:8961c0f32cd0336fb8e8ead11a1f8cd99ec07145ec2931122faaac1c8f7fd987",
"sha256:89a48c0d158a3cc3f654da4c2de1ceba85263fafb861b98b59040a5086259722",
"sha256:a76b38c52400b762e48131494ba26be363491ac4f9a04c1b7e92483d169f6582",
"sha256:da6013fd84a690242c310d77ddb8441a559e9cb3d3d59ebac9aca1a57b2e18bc",
"sha256:e55b384612d93be96506932a786bbcde5a2db7a9e6a4bb4bffe8b733f5b9036b",
"sha256:e81b76cace8eda1fca50e345242ba977f9be6ae3945af8d46326d776b4cf78d1",
"sha256:e8236383a20872c0cdf5a62b554b27538db7fa1bbec52429d8d106effbaeca08",
"sha256:f04e857b59d9d1ccc39ce2da1021d196e47234873820cbeaad210724b1ee28ac",
"sha256:fadbfe37f74051d024037f223b8e001611eac868b5c5b06144ef4d8b799862f2"
],
"markers": "python_version >= '3.6' and python_version < '3.9'",
"version": "==0.2.1"
},
"beautifulsoup4": {
"hashes": [
"sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf",
@@ -26,19 +48,19 @@
},
"boto3": {
"hashes": [
"sha256:127ebdf58c8825b53f1eff111e08c49ffffeb1f6d7a5665c9907ce8128fe14b1",
"sha256:b7ce3bf013f0f60e40c2676d5a7b620ed927cfad0aa348a606b10e9a0387f249"
"sha256:ef210f8e85cdb6d26a38ebad1cfe9cefdef2ab269207e5987653555375a7ef6b",
"sha256:f0af8f4ef5fe6353c794cd3cce627d469a618b58ace7ca75a63cfd719df615ce"
],
"index": "pypi",
"version": "==1.21.29"
"version": "==1.21.30"
},
"botocore": {
"hashes": [
"sha256:b467d64cd773dc4d49ef31b18a8dded554f284f799720bd12e989fe2138fd5b8",
"sha256:de87907d42682179946ddfa113b9334e3c4258404aef19edd8c92381ff54775c"
"sha256:af4bdc51eeecbe9fdcdadbed9ad58c5c91380ef30f3560022bbc2ee1d78f0ad6",
"sha256:c622751093e3d0bf61343e66d6d06190ef30bf42b1557d5070ca84e9efa06d4b"
],
"markers": "python_version >= '3.6'",
"version": "==1.24.29"
"version": "==1.24.30"
},
"brotli": {
"hashes": [
@@ -195,11 +217,11 @@
},
"click": {
"hashes": [
"sha256:19a4baa64da924c5e0cd889aba8e947f280309f1a2ce0947a3e3a7bcb7cc72d6",
"sha256:977c213473c7665d3aa092b41ff12063227751c41d7b17165013e10069cc5cd2"
"sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b",
"sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976"
],
"markers": "python_version >= '3.7'",
"version": "==8.1.0"
"version": "==8.1.1"
},
"cryptg": {
"hashes": [
@@ -302,63 +324,64 @@
},
"greenlet": {
"hashes": [
"sha256:004aed447382d80a56ecc354a6d807f305e6c808714ce6ccbca4839c94fae81d",
"sha256:068d68fad6bd623e29a2d36e74538c9b9d6dc6464931cd27d93da6cfc6a7f242",
"sha256:06fd4075754009c9817c6b4e1dc0af4616de52757b6ca973a81c3c1aadc28257",
"sha256:1004cb542451814b12a4f38e835a47734e2b2c683acbf463d5ae76282a3974cf",
"sha256:10c358633a8b27bfc32d27114ef2ca2ddc9f1f89f1643d1157b85e1fdd695315",
"sha256:115bc25fefbdc692c4483e9ddb9011ccd0251590ed59dbfff0f4eb7050bf99c4",
"sha256:1d987a2579336792f73ae6b106c2f087e32afc8573fbf9566f123ac6d8cfb72f",
"sha256:2128d727fd1e8afba8e68feb2cdcf88c90163b69ddc9707722a3e491c5280720",
"sha256:230132c241fe284f93f2e7b3969e9b22bbd76ef98cf93e382c945d378907f5a4",
"sha256:23558f7bd08a663386c032ab8d302d613d2d02ae0c9758ad410bab6035b58d3d",
"sha256:255d520d3e4a5f16883b182e1a94219fe455ab4f50aaaf534bfd6d64ee728397",
"sha256:2a6bc19a728f6f643cfc89b876159a1a25a8f7d8700c013d48a73691f80b4550",
"sha256:379bed346ef8ba0a0e698b3c5975a44d15dd4a5bbff40bbd7fd548b445d5550b",
"sha256:3b12d0866759db93b0a893b4e50a7d7d1681519d2346c26695bb8bb2c652230e",
"sha256:40d491944f69e350e1e8b25f6ca49459824ede1678ec0cd4b5541f41edc06614",
"sha256:471484c7b9d7b7867263051aa81cdeed6e06b455e629a7f05eb91a6cb8bd0836",
"sha256:488c557080557bc01aabb3e1bda7225c68455b853733a8652857ac0d810dad1b",
"sha256:49c2e76e7aa81ba889b3c183e2341af3cc6161ee38852085110ae49d5b5d9a40",
"sha256:52d13ec90236e5935ed6da044e78faa1371d5116cc43fe6d7ca8994dd619ef96",
"sha256:57898c69a253d81f487787bdd538629fabd671fab8a9e31b041ca30965fd9556",
"sha256:5d577eef5beb5730ef01ab39983eb852a97c359b7a546809adf70c409f4b2ecc",
"sha256:6a41987c1474c9158a0c0c96611530a8f299bc547d35bee8add981b8b2534f74",
"sha256:6ae67b7df8db3626af8e042e9c6949cfa27d1a3bbbfdff29e45b72bb6673a650",
"sha256:6c42c27e9d12e8a481aff469ffe8dd4ce0484c354a418470960f760f6ae41e7c",
"sha256:6c4a90c9f6128b4d0905a89930bd325e0491574e5cb453f606bb7094a3197587",
"sha256:6e64518e5833ac2d9359b6d9bd4df2c0cf441a0f3a4eca9e735fbea99009fa70",
"sha256:6fd3a270c23c5b42d86a9c7c6b0229f23ee4a7a4cabdaaa1693ad7a0982d13cb",
"sha256:70db73351e0fcf11a76288c47a0469d9a330bcb2e7618c5eb57432b8caa82403",
"sha256:771f401692046845626cbdf1dd0f04e999413ede0ee9ad39033fe30b5fa2e845",
"sha256:7935026ec61b967cbc6b746c0ca75c1651ea118d7fee4d259cff9e6866153374",
"sha256:7b76b1cac9baac1980210e29145800954e7b42e91ef69c4d695de1cab87ce41f",
"sha256:7e3f37c11b6699b1a1e0fcc0e88829dba4f2866546381b05ab8b3f4db645a823",
"sha256:8370fa65ad421484894f559055f951843754153b72b9bca2ebdc5288efe2e3f0",
"sha256:8ae9c443d44a4e23252632e4d7775f419f992d0df3eff923e23775f5cc551d39",
"sha256:8b31d85f2781e44f1ffaaf7ea07f484e7d42317c677c355fa77b4a1a4bea7394",
"sha256:8b450336b27f3b375cadc474c6704838eaa8dd3ca312aac3bb69d92264a8e638",
"sha256:9ce84357388a76d886febff4e50e321c212ffd3248b590960b2da6e02404a5c9",
"sha256:a23e986fb0ba8e7407286add41fa0d4207be44e3dce1b04789f4757800eca1cf",
"sha256:a81610ee00d0da9cd2c8679479b7791149365b6dfb3971b01b22ee29b04787ce",
"sha256:b4e40444975e5ab0ed3004369209c39a28e084951daaeee4919f164b6b849b14",
"sha256:b66600de16702b9dfa74bea34524b55183a2183e5fd92f20fe6c2fcae550a64c",
"sha256:ba6ee18694d3673796b7a31b7d21254e87e9e43ca5be56f323fd396111255315",
"sha256:bd03837da28293baa39bdfc3cada69e2f8807f423ae06168aa28d2b32c63a6b6",
"sha256:bd2192070f88c0778ae1d68a0980fdece3473498c1db37f3794e3454f91e3ecf",
"sha256:c1f6f1a3cc013012cd1da913c40b13e6d721046a8c8a0ea0cde94069645a75db",
"sha256:ce10a8e7e067bde3c1fbf494d2b8859db510206030b0b67bc3af90b0eb1887b9",
"sha256:d31386d208303a5a6cf0819ef9f6db6680bab9e4ca8e48adb3d4b26ead89beb7",
"sha256:d83b3af53b201970973c5574b39df226746194063bb248a53fd12b470ac34319",
"sha256:df9657b212c054ac6d803290d7c4bcd7790af0b725984fce1eeb0a1e3f2d9798",
"sha256:e576e5fd3f129e6b3595dc734ac7f2b8c548f19ef07781194bc538dc9c0cdbbc",
"sha256:e7400358558094c1bcedc75f3b3c4f400c53130b44833848890a99968dee6a64",
"sha256:eb6a385f8577d30e4cb43dd555fb134ddaae1edeb84205e09dabec332bf49fd0",
"sha256:f27f0875e0873f6bf5df09a456bfcac0667824cabac4cad30b43f36e0382ffe7",
"sha256:fcd4a6d04995f1d66bc78b503e4e59ae72fd32aaec4f661657fe5ae5c1aa4ce3"
"sha256:0051c6f1f27cb756ffc0ffbac7d2cd48cb0362ac1736871399a739b2885134d3",
"sha256:00e44c8afdbe5467e4f7b5851be223be68adb4272f44696ee71fe46b7036a711",
"sha256:013d61294b6cd8fe3242932c1c5e36e5d1db2c8afb58606c5a67efce62c1f5fd",
"sha256:049fe7579230e44daef03a259faa24511d10ebfa44f69411d99e6a184fe68073",
"sha256:14d4f3cd4e8b524ae9b8aa567858beed70c392fdec26dbdb0a8a418392e71708",
"sha256:166eac03e48784a6a6e0e5f041cfebb1ab400b394db188c48b3a84737f505b67",
"sha256:17ff94e7a83aa8671a25bf5b59326ec26da379ace2ebc4411d690d80a7fbcf23",
"sha256:1e12bdc622676ce47ae9abbf455c189e442afdde8818d9da983085df6312e7a1",
"sha256:21915eb821a6b3d9d8eefdaf57d6c345b970ad722f856cd71739493ce003ad08",
"sha256:288c6a76705dc54fba69fbcb59904ae4ad768b4c768839b8ca5fdadec6dd8cfd",
"sha256:2bde6792f313f4e918caabc46532aa64aa27a0db05d75b20edfc5c6f46479de2",
"sha256:32ca72bbc673adbcfecb935bb3fb1b74e663d10a4b241aaa2f5a75fe1d1f90aa",
"sha256:356b3576ad078c89a6107caa9c50cc14e98e3a6c4874a37c3e0273e4baf33de8",
"sha256:40b951f601af999a8bf2ce8c71e8aaa4e8c6f78ff8afae7b808aae2dc50d4c40",
"sha256:572e1787d1460da79590bf44304abbc0a2da944ea64ec549188fa84d89bba7ab",
"sha256:58df5c2a0e293bf665a51f8a100d3e9956febfbf1d9aaf8c0677cf70218910c6",
"sha256:64e6175c2e53195278d7388c454e0b30997573f3f4bd63697f88d855f7a6a1fc",
"sha256:7227b47e73dedaa513cdebb98469705ef0d66eb5a1250144468e9c3097d6b59b",
"sha256:7418b6bfc7fe3331541b84bb2141c9baf1ec7132a7ecd9f375912eca810e714e",
"sha256:7cbd7574ce8e138bda9df4efc6bf2ab8572c9aff640d8ecfece1b006b68da963",
"sha256:7ff61ff178250f9bb3cd89752df0f1dd0e27316a8bd1465351652b1b4a4cdfd3",
"sha256:833e1551925ed51e6b44c800e71e77dacd7e49181fdc9ac9a0bf3714d515785d",
"sha256:8639cadfda96737427330a094476d4c7a56ac03de7265622fcf4cfe57c8ae18d",
"sha256:8c5d5b35f789a030ebb95bff352f1d27a93d81069f2adb3182d99882e095cefe",
"sha256:8c790abda465726cfb8bb08bd4ca9a5d0a7bd77c7ac1ca1b839ad823b948ea28",
"sha256:8d2f1fb53a421b410751887eb4ff21386d119ef9cde3797bf5e7ed49fb51a3b3",
"sha256:903bbd302a2378f984aef528f76d4c9b1748f318fe1294961c072bdc7f2ffa3e",
"sha256:93f81b134a165cc17123626ab8da2e30c0455441d4ab5576eed73a64c025b25c",
"sha256:95e69877983ea39b7303570fa6760f81a3eec23d0e3ab2021b7144b94d06202d",
"sha256:9633b3034d3d901f0a46b7939f8c4d64427dfba6bbc5a36b1a67364cf148a1b0",
"sha256:97e5306482182170ade15c4b0d8386ded995a07d7cc2ca8f27958d34d6736497",
"sha256:9f3cba480d3deb69f6ee2c1825060177a22c7826431458c697df88e6aeb3caee",
"sha256:aa5b467f15e78b82257319aebc78dd2915e4c1436c3c0d1ad6f53e47ba6e2713",
"sha256:abb7a75ed8b968f3061327c433a0fbd17b729947b400747c334a9c29a9af6c58",
"sha256:aec52725173bd3a7b56fe91bc56eccb26fbdff1386ef123abb63c84c5b43b63a",
"sha256:b11548073a2213d950c3f671aa88e6f83cda6e2fb97a8b6317b1b5b33d850e06",
"sha256:b1692f7d6bc45e3200844be0dba153612103db241691088626a33ff1f24a0d88",
"sha256:b336501a05e13b616ef81ce329c0e09ac5ed8c732d9ba7e3e983fcc1a9e86965",
"sha256:b8c008de9d0daba7b6666aa5bbfdc23dcd78cafc33997c9b7741ff6353bafb7f",
"sha256:b92e29e58bef6d9cfd340c72b04d74c4b4e9f70c9fa7c78b674d1fec18896dc4",
"sha256:be5f425ff1f5f4b3c1e33ad64ab994eed12fc284a6ea71c5243fd564502ecbe5",
"sha256:dd0b1e9e891f69e7675ba5c92e28b90eaa045f6ab134ffe70b52e948aa175b3c",
"sha256:e30f5ea4ae2346e62cedde8794a56858a67b878dd79f7df76a0767e356b1744a",
"sha256:e6a36bb9474218c7a5b27ae476035497a6990e21d04c279884eb10d9b290f1b1",
"sha256:e859fcb4cbe93504ea18008d1df98dee4f7766db66c435e4882ab35cf70cac43",
"sha256:eb6ea6da4c787111adf40f697b4e58732ee0942b5d3bd8f435277643329ba627",
"sha256:ec8c433b3ab0419100bd45b47c9c8551248a5aee30ca5e9d399a0b57ac04651b",
"sha256:eff9d20417ff9dcb0d25e2defc2574d10b491bf2e693b4e491914738b7908168",
"sha256:f0214eb2a23b85528310dad848ad2ac58e735612929c8072f6093f3585fd342d",
"sha256:f276df9830dba7a333544bd41070e8175762a7ac20350786b322b714b0e654f5",
"sha256:f3acda1924472472ddd60c29e5b9db0cec629fbe3c5c5accb74d6d6d14773478",
"sha256:f70a9e237bb792c7cc7e44c531fd48f5897961701cdaa06cf22fc14965c496cf",
"sha256:f9d29ca8a77117315101425ec7ec2a47a22ccf59f5593378fc4077ac5b754fce",
"sha256:fa877ca7f6b48054f847b61d6fa7bed5cebb663ebc55e018fda12db09dcc664c",
"sha256:fdcec0b8399108577ec290f55551d926d9a1fa6cad45882093a7a07ac5ec147b"
],
"markers": "python_version >= '3' and platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32')))))",
"version": "==2.0.0a2"
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==1.1.2"
},
"gspread": {
"hashes": [
@@ -393,11 +416,11 @@
},
"loguru": {
"hashes": [
"sha256:b28e72ac7a98be3d28ad28570299a393dfcd32e5e3f6a353dec94675767b6319",
"sha256:f8087ac396b5ee5f67c963b495d615ebbceac2796379599820e324419d53667c"
"sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c",
"sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3"
],
"index": "pypi",
"version": "==0.5.3"
"version": "==0.6.0"
},
"lxml": {
"hashes": [
@@ -588,6 +611,68 @@
"git": "https://github.com/bellingcat/polyphemus.git",
"ref": "00a5123a3768a55ffe29f2c803a4181895f17890"
},
"psycopg2-binary": {
"hashes": [
"sha256:01310cf4cf26db9aea5158c217caa92d291f0500051a6469ac52166e1a16f5b7",
"sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76",
"sha256:090f3348c0ab2cceb6dfbe6bf721ef61262ddf518cd6cc6ecc7d334996d64efa",
"sha256:0a29729145aaaf1ad8bafe663131890e2111f13416b60e460dae0a96af5905c9",
"sha256:0c9d5450c566c80c396b7402895c4369a410cab5a82707b11aee1e624da7d004",
"sha256:10bb90fb4d523a2aa67773d4ff2b833ec00857f5912bafcfd5f5414e45280fb1",
"sha256:12b11322ea00ad8db8c46f18b7dfc47ae215e4df55b46c67a94b4effbaec7094",
"sha256:152f09f57417b831418304c7f30d727dc83a12761627bb826951692cc6491e57",
"sha256:15803fa813ea05bef089fa78835118b5434204f3a17cb9f1e5dbfd0b9deea5af",
"sha256:15c4e4cfa45f5a60599d9cec5f46cd7b1b29d86a6390ec23e8eebaae84e64554",
"sha256:183a517a3a63503f70f808b58bfbf962f23d73b6dccddae5aa56152ef2bcb232",
"sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c",
"sha256:1f6b813106a3abdf7b03640d36e24669234120c72e91d5cbaeb87c5f7c36c65b",
"sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834",
"sha256:2d872e3c9d5d075a2e104540965a1cf898b52274a5923936e5bfddb58c59c7c2",
"sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71",
"sha256:3303f8807f342641851578ee7ed1f3efc9802d00a6f83c101d21c608cb864460",
"sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e",
"sha256:3a79d622f5206d695d7824cbf609a4f5b88ea6d6dab5f7c147fc6d333a8787e4",
"sha256:404224e5fef3b193f892abdbf8961ce20e0b6642886cfe1fe1923f41aaa75c9d",
"sha256:46f0e0a6b5fa5851bbd9ab1bc805eef362d3a230fbdfbc209f4a236d0a7a990d",
"sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9",
"sha256:526ea0378246d9b080148f2d6681229f4b5964543c170dd10bf4faaab6e0d27f",
"sha256:53293533fcbb94c202b7c800a12c873cfe24599656b341f56e71dd2b557be063",
"sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478",
"sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092",
"sha256:63638d875be8c2784cfc952c9ac34e2b50e43f9f0a0660b65e2a87d656b3116c",
"sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce",
"sha256:68641a34023d306be959101b345732360fc2ea4938982309b786f7be1b43a4a1",
"sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65",
"sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e",
"sha256:7af0dd86ddb2f8af5da57a976d27cd2cd15510518d582b478fbb2292428710b4",
"sha256:7b1e9b80afca7b7a386ef087db614faebbf8839b7f4db5eb107d0f1a53225029",
"sha256:874a52ecab70af13e899f7847b3e074eeb16ebac5615665db33bce8a1009cf33",
"sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39",
"sha256:8b344adbb9a862de0c635f4f0425b7958bf5a4b927c8594e6e8d261775796d53",
"sha256:8fc53f9af09426a61db9ba357865c77f26076d48669f2e1bb24d85a22fb52307",
"sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42",
"sha256:93cd1967a18aa0edd4b95b1dfd554cf15af657cb606280996d393dadc88c3c35",
"sha256:99485cab9ba0fa9b84f1f9e1fef106f44a46ef6afdeec8885e0b88d0772b49e8",
"sha256:9d29409b625a143649d03d0fd7b57e4b92e0ecad9726ba682244b73be91d2fdb",
"sha256:a29b3ca4ec9defec6d42bf5feb36bb5817ba3c0230dd83b4edf4bf02684cd0ae",
"sha256:a9e1f75f96ea388fbcef36c70640c4efbe4650658f3d6a2967b4cc70e907352e",
"sha256:accfe7e982411da3178ec690baaceaad3c278652998b2c45828aaac66cd8285f",
"sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba",
"sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24",
"sha256:b1c8068513f5b158cf7e29c43a77eb34b407db29aca749d3eb9293ee0d3103ca",
"sha256:bda845b664bb6c91446ca9609fc69f7db6c334ec5e4adc87571c34e4f47b7ddb",
"sha256:c381bda330ddf2fccbafab789d83ebc6c53db126e4383e73794c74eedce855ef",
"sha256:c3ae8e75eb7160851e59adc77b3a19a976e50622e44fd4fd47b8b18208189d42",
"sha256:d1c1b569ecafe3a69380a94e6ae09a4789bbb23666f3d3a08d06bbd2451f5ef1",
"sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667",
"sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272",
"sha256:e3699852e22aa68c10de06524a3721ade969abf382da95884e6a10ff798f9281",
"sha256:e847774f8ffd5b398a75bc1c18fbb56564cda3d629fe68fd81971fece2d3c67e",
"sha256:ffb7a888a047696e7f8240d649b43fb3644f14f0ee229077e7f6b9f9081635bd"
],
"index": "pypi",
"version": "==2.9.3"
},
"pyaes": {
"hashes": [
"sha256:02c1b1405c38d3c370b085fb952dd8bea3fadcee6411ad99f312cc129c536d8f"
@@ -711,6 +796,7 @@
"sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7",
"sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"
],
"markers": "python_version < '3.9'",
"version": "==2022.1"
},
"pytz-deprecation-shim": {
@@ -725,6 +811,7 @@
"hashes": [
"sha256:af8a9b64b821529aca09ebaf6d8d279100d766f19e90b5059ac6a718ca6dee42"
],
"index": "pypi",
"version": "==2.2.1"
},
"regex": {
@@ -808,6 +895,9 @@
"version": "==2022.3.2"
},
"requests": {
"extras": [
"socks"
],
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
@@ -828,7 +918,7 @@
"sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
"sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
],
"markers": "python_version >= '3.6' and python_version < '4'",
"markers": "python_version >= '3.6'",
"version": "==4.8"
},
"s3transfer": {
@@ -849,7 +939,7 @@
},
"snscrape": {
"git": "https://github.com/bellingcat/snscrape.git",
"ref": "fb8d73ac95011b7ad848a6048d3eed1880e80f21"
"ref": "d32c9add8a3691c81c9091dc1a7d079e9871379f"
},
"soupsieve": {
"hashes": [
@@ -913,7 +1003,7 @@
"sha256:4230a49119a416c88cc47d0d2d32d5d90f1a282d5e497d49801950704e49863d",
"sha256:6461b009d6792008d0000e1b0c7ca50195ec78c0e808a3a6b668a56a3236c3a5"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"index": "pypi",
"version": "==4.63.1"
},
"tzdata": {
@@ -1073,11 +1163,11 @@
},
"click": {
"hashes": [
"sha256:19a4baa64da924c5e0cd889aba8e947f280309f1a2ce0947a3e3a7bcb7cc72d6",
"sha256:977c213473c7665d3aa092b41ff12063227751c41d7b17165013e10069cc5cd2"
"sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b",
"sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976"
],
"markers": "python_version >= '3.7'",
"version": "==8.1.0"
"version": "==8.1.1"
},
"coverage": {
"extras": [
@@ -1321,9 +1411,13 @@
"sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7",
"sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"
],
"markers": "python_version < '3.9'",
"version": "==2022.1"
},
"requests": {
"extras": [
"socks"
],
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"

135
app.py Normal file
View File

@@ -0,0 +1,135 @@
import argparse
from loguru import logger
import gspread
from sqlalchemy import create_engine, func
from sqlalchemy.orm import sessionmaker
import os
import time
from cisticola.base import Channel, RawChannelInfo, mapper_registry
from cisticola.scraper import (
ScraperController,
BitchuteScraper,
GabScraper,
GettrScraper,
OdyseeScraper,
RumbleScraper,
TelegramSnscrapeScraper,
TelegramTelethonScraper,
TwitterScraper)
def _normalize_channel_row(record):
    """Return a copy of a sheet row converted into ``Channel`` keyword arguments.

    The ``id`` and ``followers`` columns are dropped, blank ``public``/``chat``
    cells become ``False``, and every remaining cell is mapped as follows:
    ``'TRUE'``/``'yes'`` -> ``True``, ``'FALSE'``/``'no'`` -> ``False``,
    ``''`` -> ``None``. All other values are kept as they are.
    """
    fields = dict(record)
    fields.pop('id', None)
    fields.pop('followers', None)

    # Blank boolean columns default to False instead of None.
    for flag in ('public', 'chat'):
        if fields.get(flag) == '':
            fields[flag] = False

    for key, value in fields.items():
        # Assigning to existing keys while iterating is safe: the dict's size
        # never changes.
        if value in ('TRUE', 'yes'):
            fields[key] = True
        elif value in ('FALSE', 'no'):
            fields[key] = False
        elif value == '':
            fields[key] = None

    return fields


def sync_channels(args):
    """Add channels listed in a Google Sheet to the database.

    Reads the ``channels`` worksheet of the spreadsheet at ``args.gsheet``.
    Every row whose ``id`` cell is empty is looked up in the database by
    ``platform_id``, ``platform`` and ``url``; when no match exists a new
    ``Channel`` with ``source='researcher'`` is inserted. The database ID is
    then written back to column 1 of that row.

    Parameters
    ----------
    args: argparse.Namespace
        Parsed command line arguments; only ``args.gsheet`` is read.
    """
    if not args.gsheet:
        # Without a URL gspread fails with an unhelpful error further down.
        logger.error("sync-channels requires --gsheet")
        return

    logger.info("Synchronizing channels")

    session = get_db_session()
    try:
        gc = gspread.service_account(filename='service_account.json')

        # Open a sheet from a spreadsheet in one go
        wks = gc.open_by_url(args.gsheet).worksheet("channels")
        records = wks.get_all_records()

        # Data rows start at sheet row 2 (row 1 is presumably the header row).
        for row_number, record in enumerate(records, start=2):
            # only adding channels, so skip everything with an ID
            if record['id'] != '':
                continue

            fields = _normalize_channel_row(record)

            # check to see if this already exists
            channel = session.query(Channel).filter_by(
                platform_id=fields['platform_id'],
                platform=fields['platform'],
                url=fields['url']).first()

            if not channel:
                channel = Channel(**fields, source='researcher')
                logger.debug(f"{channel} does not exist, adding")
                session.add(channel)
                # commit() flushes first, so channel.id is available below.
                session.commit()

            # NOTE(review): the original nesting was lost in the flattened
            # view. Writing the ID back for channels that already exist in the
            # database as well is the safer reading; confirm against the
            # intended behaviour.
            wks.update_cell(row_number, 1, channel.id)
            # Presumably throttles writes to respect the Sheets API quota.
            time.sleep(1)

        session.commit()
    finally:
        session.close()
def get_db_session():
    """Return a new SQLAlchemy session bound to the database URL in ``$DB``.

    Raises ``KeyError`` when the ``DB`` environment variable is not set.
    """
    db_engine = create_engine(os.environ['DB'])
    make_session = sessionmaker(bind=db_engine)
    return make_session()
def get_scraper_controller():
    """Build a ``ScraperController`` connected to the database URL in ``$DB``.

    The controller is returned with a ``TelegramTelethonScraper`` and a
    ``TwitterScraper`` registered, in that order.
    """
    db_engine = create_engine(os.environ['DB'])

    scraper_controller = ScraperController()
    scraper_controller.connect_to_db(db_engine)
    scraper_controller.register_scrapers([
        TelegramTelethonScraper(),
        TwitterScraper()])

    return scraper_controller
def scrape_channels(args):
    """Scrape all channels through a freshly built scraper controller.

    ``args.media`` is forwarded as the controller's ``archive_media`` flag.
    """
    logger.info(f"Scraping channels, media: {args.media}")
    get_scraper_controller().scrape_all_channels(archive_media=args.media)
def scrape_channel_info(args):
    """Scrape channel info via ``ScraperController.scrape_all_channel_info``.

    ``args`` is currently unused.
    """
    # Plain string: the message has no placeholders, so no f-prefix is needed.
    logger.info("Scraping channel info")
    controller = get_scraper_controller()
    controller.scrape_all_channel_info()
def archive_media(args):
    """Archive media via ``ScraperController.archive_unarchived_media``.

    ``args`` is currently unused.
    """
    # Plain string: the message has no placeholders, so no f-prefix is needed.
    logger.info("Archiving unarchived media")
    controller = get_scraper_controller()
    controller.archive_unarchived_media()
def init_db():
    """Create the tables registered on ``mapper_registry`` in the ``$DB`` database."""
    db_engine = create_engine(os.environ['DB'])
    metadata = mapper_registry.metadata
    metadata.create_all(bind=db_engine)
if __name__ == '__main__':
    logger.add("./test.log", level="TRACE")

    # Command name -> handler. Every handler is called with the parsed args;
    # init_db takes none, so it is wrapped.
    commands = {
        'init-db': lambda _args: init_db(),
        'sync-channels': sync_channels,
        'scrape-channels': scrape_channels,
        'archive-media': archive_media,
        'channel-info': scrape_channel_info,
    }
    command_list = ', '.join(f'"{name}"' for name in commands)

    parser = argparse.ArgumentParser(description = 'Cisticola command line tools')
    parser.add_argument('command', type=str, help=f'Command to run, one of: {command_list}')
    parser.add_argument('--gsheet', type=str, help='[sync-channels] URL of Google Sheet to synchronize')
    parser.add_argument('--media', action='store_true', help='[scrape-channels] Add this flag to archive media while scraping')

    args = parser.parse_args()

    handler = commands.get(args.command)
    if handler is None:
        logger.error(f"Unrecognized command {args.command}")
        # Exit non-zero so shells and schedulers can detect the failure.
        raise SystemExit(1)

    if args.command == 'sync-channels' and not args.gsheet:
        # Fail with a usage message instead of an error deep inside gspread.
        parser.error('--gsheet is required for sync-channels')

    handler(args)

View File

@@ -34,7 +34,7 @@ class ScraperResult:
date: datetime
#: JSON dump of dict that contains all data scraped for the post.
raw_data: str
raw_posts: str
#: Datetime (relative to UTC) that the scraped post was archived at.
date_archived: datetime
@@ -44,7 +44,7 @@ class ScraperResult:
#: Has the media in this post been archived?
media_archived: bool
@dataclass
class Channel:
"""Information about a specific channel to be scraped.
@@ -89,11 +89,31 @@ class Channel:
def hydrate(self):
pass
@dataclass
class RawChannelInfo:
    """Minimally processed channel profile information returned by a scraper.

    The scrapers' ``get_profile`` methods build instances of this class.
    """

    #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
    scraper: str

    #: Name of platform from which result was scraped, e.g. ``"Twitter"``.
    platform: str

    #: Foreign key of channel ID that this was scraped from
    channel: int

    #: JSON dump of dict that contains all profile data scraped for the channel.
    raw_data: str

    #: Datetime (relative to UTC) that the scraped channel info was archived at.
    date_archived: datetime
@dataclass
class Post:
"""An object with fields for columns in the analysis table"""
#: ID number of the scraped post in the ``raw_data`` table
#: ID number of the scraped post in the ``raw_posts`` table
raw_id: int
#: Platform specific post ID
@@ -144,7 +164,7 @@ class Media:
"""Base class for organizing information about a media file.
"""
#: ID number of the media's corresponding scraped post in the ``raw_data`` table.
#: ID number of the media's corresponding scraped post in the ``raw_posts`` table.
raw_id: int
#: ID number of the media's corresponding scraped post in the ``analysis`` table.
@@ -221,7 +241,7 @@ class Video(Media):
mapper_registry = registry()
raw_data_table = Table('raw_data', mapper_registry.metadata,
raw_posts_table = Table('raw_posts', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('scraper', String),
@@ -229,15 +249,23 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('channel', Integer, ForeignKey('channels.id')),
Column('platform_id', String),
Column('date', DateTime),
Column('raw_data', String),
Column('raw_posts', String),
Column('date_archived', DateTime),
Column('archived_urls', JSON),
Column('media_archived', Boolean))
raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata,
Column('id', Integer, primary_key=True),
Column('scraper', String),
Column('platform', String),
Column('channel', Integer, ForeignKey('channels.id')),
Column('raw_data', String),
Column('date_archived', DateTime))
channel_table = Table('channels', mapper_registry.metadata,
Column('id', Integer, primary_key=True, autoincrement=True),
Column('name', String),
Column('platform_id', Integer),
Column('platform_id', String),
Column('category', String),
Column('platform', String),
Column('url', String),
@@ -253,7 +281,7 @@ channel_table = Table('channels', mapper_registry.metadata,
post_table = Table('posts', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('raw_id', Integer, ForeignKey('raw_data.id')),
Column('raw_id', Integer, ForeignKey('raw_posts.id')),
Column('platform_id', Integer),
Column('scraper', String),
Column('transformer', String),
@@ -273,7 +301,7 @@ media_table = Table('media', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('type', String),
Column('raw_id', Integer, ForeignKey('raw_data.id')),
Column('raw_id', Integer, ForeignKey('raw_posts.id')),
Column('post', Integer, ForeignKey('posts.id')),
Column('url', String),
Column('original_url', String),
@@ -282,7 +310,8 @@ media_table = Table('media', mapper_registry.metadata,
mapper_registry.map_imperatively(Post, post_table)
mapper_registry.map_imperatively(Channel, channel_table)
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
mapper_registry.map_imperatively(ScraperResult, raw_posts_table)
mapper_registry.map_imperatively(RawChannelInfo, raw_channel_info_table)
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')

View File

@@ -303,6 +303,9 @@ class ScraperController:
"""
self.scrapers.extend(scraper)
def remove_all_scrapers(self):
    """Unregister every scraper by rebinding ``self.scrapers`` to a new empty list."""
    self.scrapers = list()
def scrape_all_channels(self, archive_media: bool = True):
if self.session is None:
logger.error("No DB session")
@@ -313,6 +316,17 @@ class ScraperController:
channels = session.query(Channel).where(Channel.source=='researcher').all()
return self.scrape_channels(channels, archive_media=archive_media)
def scrape_all_channel_info(self):
    """Scrape channel info for every channel whose ``source`` is ``'researcher'``.

    Logs an error and returns ``None`` when no DB session has been configured;
    otherwise returns the result of ``scrape_channel_info``.
    """
    if self.session is None:
        logger.error("No DB session")
        return None

    db = self.session()
    researcher_channels = (
        db.query(Channel)
        .where(Channel.source=='researcher')
        .all()
    )

    return self.scrape_channel_info(researcher_channels)
@logger.catch(reraise = True)
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
@@ -336,6 +350,7 @@ class ScraperController:
for scraper in self.scrapers:
if scraper.can_handle(channel):
logger.debug(f"{scraper} is handling {channel}")
handled = True
added = 0
@@ -382,7 +397,7 @@ class ScraperController:
for scraper in self.scrapers:
if scraper.__version__ == post.scraper:
handled = True
logger.info(f"{scraper} is archiving media for {post}")
logger.debug(f"{scraper} is archiving media for ID {post.id}")
post = scraper.archive_files(post)
if post:
@@ -396,6 +411,48 @@ class ScraperController:
session.commit()
@logger.catch(reraise = True)
def scrape_channel_info(self, channels: List[Channel]):
    """Scrape and store channel info for the specified channels.

    For each channel the registered scrapers are tried in order. The first
    scraper that can handle the channel and returns a profile has its result
    added to the database, and no further scrapers are tried for that channel.
    If a scraper raises ``ChannelDoesNotExistError`` a warning is logged and
    the next scraper is tried.

    Parameters
    ----------
    channels: list<Channel>
        List of Channel instances whose channel info should be scraped
    """

    if self.session is None:
        logger.error("No DB session")
        return

    for channel in channels:
        handled = False

        for scraper in self.scrapers:
            if not scraper.can_handle(channel):
                continue

            logger.debug(f"{scraper} is getting channel info for {channel}")
            handled = True

            session = self.session()

            try:
                info = scraper.get_profile(channel)
                session.add(info)
                session.commit()

                logger.info(
                    f"{scraper} found {info}")
                break
            except ChannelDoesNotExistError:
                logger.warning(f"ChannelDoesNotExist {channel}")
            finally:
                # Release the session on every path (success/break, missing
                # channel, unexpected error) so sessions do not accumulate.
                session.close()

        if not handled:
            logger.warning(f"No handler found for Channel {channel}")
def connect_to_db(self, engine):
"""Connect the specified SQLAlchemy engine to the controller.
"""

View File

@@ -9,7 +9,7 @@ from typing import Generator
import requests
from bs4 import BeautifulSoup
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class BitchuteScraper(Scraper):
@@ -57,7 +57,7 @@ class BitchuteScraper(Scraper):
platform_id=post['id'],
date=datetime.fromtimestamp(post['timestamp']),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
raw_posts=json.dumps(post),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -65,7 +65,7 @@ class BitchuteScraper(Scraper):
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
return True
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
base_url = channel.url
@@ -106,8 +106,12 @@ class BitchuteScraper(Scraper):
'subscribers': counts['subscriber_count'],
'views': int(counts['about_view_count'].split(' ')[0])}
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def strip_tags(html, convert_newlines=True):

View File

@@ -5,7 +5,7 @@ import os
from gabber.client import Client, GAB_API_BASE_URL
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class GabScraper(Scraper):
@@ -80,7 +80,7 @@ class GabScraper(Scraper):
platform_id=post['id'],
date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
raw_posts=json.dumps(post),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -88,7 +88,7 @@ class GabScraper(Scraper):
if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:
return True
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
client = Client(
username = os.environ['GAB_USER'],
@@ -106,4 +106,8 @@ class GabScraper(Scraper):
profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -5,7 +5,7 @@ from urllib.parse import urlparse
from gogettr import PublicClient
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class GettrScraper(Scraper):
@@ -58,7 +58,7 @@ class GettrScraper(Scraper):
platform_id=post['_id'],
date=datetime.fromtimestamp(post['cdate']/1000.),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
raw_posts=json.dumps(post),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -71,9 +71,13 @@ class GettrScraper(Scraper):
key = urlparse(url).path.split('/')[-2] + ext
return key
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
client = client = PublicClient()
username = self.get_username_from_url(channel.url)
profile = client.user_info(username)
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -8,7 +8,7 @@ from pathlib import Path
from loguru import logger
import instaloader
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
BASE_URL = 'https://www.instagram.com/'
@@ -79,7 +79,7 @@ class InstagramScraper(Scraper):
platform_id=post.mediaid,
date=post.date_utc,
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post._asdict(), default=str),
raw_posts=json.dumps(post._asdict(), default=str),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -96,7 +96,7 @@ class InstagramScraper(Scraper):
platform_id=post.mediaid,
date=comment.created_at_utc,
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(comment_dict, default=str),
raw_posts=json.dumps(comment_dict, default=str),
archived_urls={},
media_archived=archive_media)
@@ -104,7 +104,7 @@ class InstagramScraper(Scraper):
if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None:
return True
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
username = self.get_username_from_url(channel.url)
@@ -125,4 +125,8 @@ class InstagramScraper(Scraper):
profile['followers'] = user_profile.followers
profile['followees'] = user_profile.followees
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -8,7 +8,7 @@ from loguru import logger
from polyphemus.base import OdyseeChannel
from polyphemus.api import get_auth_token
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class OdyseeScraper(Scraper):
@@ -60,7 +60,7 @@ class OdyseeScraper(Scraper):
platform_id=video.info['claim_id'],
date=datetime.fromtimestamp(video.info['created']),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(video.info),
raw_posts=json.dumps(video.info),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -73,7 +73,7 @@ class OdyseeScraper(Scraper):
platform_id=comment.info['claim_id'],
date=datetime.fromtimestamp(comment.info['created']),
date_archived=datetime.now(),
raw_data=json.dumps(comment.info),
raw_posts=json.dumps(comment.info),
archived_urls={},
media_archived=True)
@@ -87,10 +87,14 @@ class OdyseeScraper(Scraper):
return f'{key}.{ext}'
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
username = self.get_username_from_url(channel.url)
odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token)
profile = odysee_channel.info
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -5,7 +5,7 @@ from urllib.parse import urlparse
from bs4 import BeautifulSoup
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper import Scraper, make_request
BASE_URL = 'https://rumble.com'
@@ -39,7 +39,7 @@ class RumbleScraper(Scraper):
platform_id=post['media_url'].split('/')[-2],
date=post['datetime'].replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post, default = str),
raw_posts=json.dumps(post, default = str),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -52,11 +52,15 @@ class RumbleScraper(Scraper):
if channel.platform == "Rumble" and channel.url is not None:
return True
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
profile = get_channel_profile(url = channel.url)
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
@@ -128,6 +132,7 @@ def get_channel_profile(url):
'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
'cover': cover_soup.get('src') if cover_soup else None,
'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text}
return profile
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -1,10 +1,10 @@
from typing import Generator
from datetime import datetime, timezone
import json
import snscrape.modules
from loguru import logger
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class TelegramSnscrapeScraper(Scraper):
@@ -49,15 +49,20 @@ class TelegramSnscrapeScraper(Scraper):
platform_id=post.url,
date=post.date,
date_archived=datetime.now(timezone.utc),
raw_data=post.json(),
raw_posts=post.json(),
archived_urls=archived_urls,
media_archived=archive_media
)
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
scr = snscrape.modules.telegram.TelegramChannelScraper(
channel.screenname)
profile = scr._get_entity().__dict__
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -11,7 +11,7 @@ from telethon.sync import TelegramClient
from telethon.tl.functions.channels import GetFullChannelRequest
from telethon.tl import types
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
@@ -44,7 +44,7 @@ class TelegramTelethonScraper(Scraper):
key = list(result.archived_urls.keys())[0]
if result.archived_urls[key] is None:
raw = json.loads(result.raw_data)
raw = json.loads(result.raw_posts)
message = client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']])
@@ -66,13 +66,10 @@ class TelegramTelethonScraper(Scraper):
return result
def archive_post_media(self, post : types.Message, client : TelegramClient = None):
logger.debug(f"Archiving post {post}")
if post.media is None:
logger.debug("No media for post")
return None, None
logger.debug(f"Archiving media {post.media}")
if client is None:
api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH']
@@ -81,6 +78,11 @@ class TelegramTelethonScraper(Scraper):
with TelegramClient(phone, api_id, api_hash) as client:
return self.archive_post_media(post, client=client)
if type(post.media) == types.MessageMediaDocument:
logger.debug(f"Archiving {type(post.media)} with size {post.media.document.size/(1024*1024)} MB")
else:
logger.debug(f"Archiving {type(post.media)}")
key = f'{post.peer_id.channel_id}_{post.id}'
with tempfile.TemporaryDirectory() as temp_dir:
@@ -88,6 +90,10 @@ class TelegramTelethonScraper(Scraper):
client.download_media(post.media, output_file)
if len(os.listdir(temp_dir)) == 0:
logger.warning(f"No file present. Could not archive {post.media}")
return None, None
output_file_with_ext = os.listdir(temp_dir)[0]
filename = Path(temp_dir, output_file_with_ext)
@@ -96,11 +102,13 @@ class TelegramTelethonScraper(Scraper):
return (blob, output_file_with_ext)
def can_handle(self, channel):
if channel.platform == "Telegram" and channel.public and not channel.chat:
if channel.platform == "Telegram" and channel.public:
return True
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = self.get_username_from_url(channel.url)
username = channel.screenname
if username is None:
username = self.get_username_from_url(channel.url)
api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH']
@@ -110,14 +118,13 @@ class TelegramTelethonScraper(Scraper):
for post in client.iter_messages(username):
post_url = f'{channel.url}/{post.id}'
logger.info(f"Archiving post {post_url} from {post.date}")
logger.trace(f"Archiving post {post_url} from {post.date}")
if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}')
break
archived_urls = {}
logger.info(f"Archiving post {post_url}")
if post.media is not None:
archived_urls[post_url] = None
@@ -136,13 +143,14 @@ class TelegramTelethonScraper(Scraper):
platform_id=post_url,
date=post.date.replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post.to_dict(), default=str),
raw_posts=json.dumps(post.to_dict(), default=str),
archived_urls=archived_urls,
media_archived=archive_media)
def get_profile(self, channel: Channel) -> dict:
username = self.get_username_from_url(channel.url)
def get_profile(self, channel: Channel) -> RawChannelInfo:
username = channel.screenname
if username is None:
username = self.get_username_from_url(channel.url)
api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH']
@@ -150,6 +158,10 @@ class TelegramTelethonScraper(Scraper):
with TelegramClient(phone, api_id, api_hash) as client:
full_channel = client(GetFullChannelRequest(channel = username))
profile = full_channel.__dict__
profile = full_channel.to_dict()
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile, default=str),
date_archived=datetime.now(timezone.utc))

View File

@@ -1,11 +1,11 @@
from datetime import datetime, timezone
from typing import Generator
from urllib.parse import urlparse, parse_qs
from snscrape.modules.twitter import TwitterProfileScraper, TwitterUserScraper, Video, Gif, Photo
from loguru import logger
import json
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper, ChannelDoesNotExistError
class TwitterScraper(Scraper):
@@ -13,7 +13,12 @@ class TwitterScraper(Scraper):
__version__ = "TwitterScraper 0.0.1"
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
scraper = TwitterProfileScraper(channel.platform_id)
if channel.platform_id:
identifier = channel.platform_id
else:
identifier = channel.screenname
scraper = TwitterProfileScraper(identifier)
first = True
@@ -32,10 +37,10 @@ class TwitterScraper(Scraper):
if tweet.media:
media_list += tweet.media
if tweet.retweetedTweet and tweet.retweetedTweet.media:
if tweet.retweetedTweet and hasattr(tweet.retweetedTweet, 'media') and tweet.retweetedTweet.media:
media_list += tweet.retweetedTweet.media
if tweet.quotedTweet and tweet.quotedTweet.media:
if tweet.quotedTweet and hasattr(tweet.quotedTweet, 'media') and tweet.quotedTweet.media:
media_list += tweet.quotedTweet.media
for media in media_list:
@@ -66,12 +71,12 @@ class TwitterScraper(Scraper):
platform_id=tweet.id,
date=tweet.date,
date_archived=datetime.now(timezone.utc),
raw_data=tweet.json(),
raw_posts=tweet.json(),
archived_urls=archived_urls,
media_archived=archive_media)
def can_handle(self, channel):
if channel.platform == "Twitter" and channel.platform_id:
if channel.platform == "Twitter" and (channel.platform_id or channel.screenname):
return True
def url_to_key(self, url: str, content_type: str) -> str:
@@ -91,7 +96,7 @@ class TwitterScraper(Scraper):
key = parsed_url.path.split('/')[-1] + ext
return key
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
scraper = TwitterUserScraper(channel.screenname)
entity = scraper._get_entity()
@@ -99,4 +104,8 @@ class TwitterScraper(Scraper):
if entity is None:
raise ChannelDoesNotExistError(channel.url)
else:
return entity.__dict__
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(entity.__dict__, default=str),
date_archived=datetime.now(timezone.utc))

View File

@@ -1,11 +1,10 @@
from datetime import datetime, timezone
from typing import Generator
from urllib.parse import urlparse
from snscrape.modules.vkontakte import VKontakteUserScraper
from loguru import logger
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class VkontakteScraper(Scraper):
@@ -62,7 +61,7 @@ class VkontakteScraper(Scraper):
platform_id=post.url.split('/')[-1],
date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=post.json(),
raw_posts=post.json(),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -80,10 +79,15 @@ class VkontakteScraper(Scraper):
return key
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
username = self.get_username_from_url(channel.url)
scraper = VKontakteUserScraper(username)
profile = scraper._get_entity().__dict__
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -2,10 +2,9 @@ from datetime import datetime, timezone
import json
from typing import Generator
import tempfile
import yt_dlp
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper import Scraper
class YoutubeScraper(Scraper):
@@ -71,7 +70,7 @@ class YoutubeScraper(Scraper):
platform_id=video_id,
date=datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(video, default = str),
raw_posts=json.dumps(video, default = str),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -79,8 +78,7 @@ class YoutubeScraper(Scraper):
if channel.platform == "Youtube" and channel.url:
return True
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
ydl_opts = {}
ydl = yt_dlp.YoutubeDL(ydl_opts)
@@ -89,7 +87,12 @@ class YoutubeScraper(Scraper):
meta = ydl.extract_info(
channel.url,
process=False)
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(meta),
date_archived=datetime.now(timezone.utc))
except yt_dlp.utils.DownloadError as e:
raise e
return meta

View File

@@ -20,7 +20,7 @@ class BitchuteTransformer(Transformer):
return False
def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]:
raw = json.loads(data.raw_data)
raw = json.loads(data.raw_posts)
orig = raw['video_url']
new = data.archived_urls[orig]
@@ -30,7 +30,7 @@ class BitchuteTransformer(Transformer):
yield m
def transform(self, data: ScraperResult) -> Post:
raw = json.loads(data.raw_data)
raw = json.loads(data.raw_posts)
soup = BeautifulSoup(raw['body'], features = 'html.parser')
content = soup.find_all('p')[-1].text

View File

@@ -47,7 +47,7 @@ class TwitterTransformer(Transformer):
def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
raw = json.loads(data.raw_posts)
transformed = Post(
raw_id=data.id,

74
test.py
View File

@@ -1,74 +0,0 @@
from sqlalchemy import create_engine
from loguru import logger
import gspread
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from cisticola.base import Channel, Post, ScraperResult, mapper_registry
from cisticola.scraper import (
ScraperController,
BitchuteScraper,
GabScraper,
GettrScraper,
OdyseeScraper,
RumbleScraper,
TelegramSnscrapeScraper,
TelegramTelethonScraper,
TwitterScraper)
from cisticola.transformer import ETLController
from cisticola.transformer.twitter import TwitterTransformer
logger.add("../test.log")
controller = ScraperController()
scrapers = [
BitchuteScraper(),
GabScraper(),
GettrScraper(),
OdyseeScraper(),
RumbleScraper(),
TelegramTelethonScraper(),
TwitterScraper()]
controller.register_scrapers(scrapers)
engine = create_engine('sqlite:///test.db')
mapper_registry.metadata.create_all(bind=engine)
session_generator = sessionmaker()
session_generator.configure(bind=engine)
session = session_generator()
gc = gspread.service_account(filename='service_account.json')
# Open a sheet from a spreadsheet in one go
wks = gc.open_by_url("https://docs.google.com/spreadsheets/d/1k5VgqREoA3v1r7bkVq7TOTRDtdYqTMWkQnsZpRbntpw/edit#gid=0")
channels = wks.worksheet("channels").get_all_records()
for c in channels:
del c['followers']
for k in c.keys():
if c[k] == 'TRUE' or c[k] == 'yes': c[k] = True
if c[k] == 'FALSE' or c[k] == 'no': c[k] = False
# check to see if this already exists,
channel = session.query(Channel).filter_by(platform_id=c['platform_id'], platform=c['platform']).first()
if not channel:
channel = Channel(**c, source='researcher')
session.add(channel)
session.commit()
controller.connect_to_db(engine)
controller.scrape_all_channels(archive_media = False)
controller.archive_unarchived_media()
# transformer = TwitterTransformer()
# etl_controller = ETLController()
# etl_controller.register_transformer(transformer)
# etl_controller.connect_to_db(engine)
# etl_controller.transform_all_untransformed()

View File

@@ -4,6 +4,7 @@ from cisticola.base import Channel
from cisticola.scraper import TelegramTelethonScraper
def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
controller.remove_all_scrapers()
channels = [Channel(**channel_kwargs['telegram'])]
controller.register_scraper(scraper = TelegramTelethonScraper())
@@ -13,6 +14,7 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
controller.reset_db()
controller.remove_all_scrapers()
channels = [Channel(**channel_kwargs['telegram'])]
controller.register_scraper(scraper = TelegramTelethonScraper())