From 16870d7daa9e696d746a22ce361b8093b84a826c Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Mon, 28 Mar 2022 20:16:59 -0500 Subject: [PATCH 1/5] implemented methods for extracting profile metadata (still need to test) --- Pipfile.lock | 362 +++++++++++++++++-------- cisticola/scraper/bitchute.py | 63 +++-- cisticola/scraper/gab.py | 11 +- cisticola/scraper/gettr.py | 9 +- cisticola/scraper/instagram.py | 25 +- cisticola/scraper/odysee.py | 10 +- cisticola/scraper/rumble.py | 28 +- cisticola/scraper/telegram_snscrape.py | 8 + cisticola/scraper/telegram_telethon.py | 15 + cisticola/scraper/twitter.py | 11 +- cisticola/scraper/vkontakte.py | 10 +- cisticola/scraper/youtube.py | 17 +- tests/scraper/bitchute.py | 9 + 13 files changed, 424 insertions(+), 154 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index b83b155..79b1b1c 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "e3b96b0ac8c80d4817f9adac4ab171bf4b7e07e80927c7b152a24e8bbdbf7faa" + "sha256": "26955249044f1cd4bb4504c14f00f0c50508192338026227fc7b889e9f4fc11c" }, "pipfile-spec": 6, "requires": { @@ -34,19 +34,19 @@ }, "boto3": { "hashes": [ - "sha256:76d5b90400c54b25278150768e946edf166acce2c1597c0ecfbebb1dbe9acf2c", - "sha256:7bb2e6506a6ad44d111dd20a5d510374b6958fe989b4ef887109c79d812f926f" + "sha256:788aa3281e91413bc201268a251c9d4ca2e9deb3a4af74daea2389cf66e5132e", + "sha256:ca37b9b4ade72f6d4fa2b7bee584dd5b1c7585f07f22ff1edbc9ecc0c4173b1f" ], "index": "pypi", - "version": "==1.21.19" + "version": "==1.21.28" }, "botocore": { "hashes": [ - "sha256:5ed2be0e413961134f4c17eab16396d41a5b4b73a637588260c04d20806d52ea", - "sha256:d0d77bce152ca51f3c2cd0f9bf05cb3b623e719406ad58b4c20444e237fe82eb" + "sha256:03c41d26d1e765380b8175d4b136d3144aa051f17a86eebfdf9a885a5a9a6a72", + "sha256:102eb24b44d473adea6bb8728b20fb9547fa5858c3293df7cad67ef17ea736a7" ], "markers": "python_version >= '3.6'", - "version": "==1.24.19" + "version": "==1.24.28" }, "brotli": { "hashes": [ @@ -123,6 +123,14 @@ "index": "pypi", "version": "==0.0.1" }, + "cachetools": { + "hashes": [ + "sha256:486471dfa8799eb7ec503a8059e263db000cdda20075ce5e48903087f79d5fd6", + "sha256:8fecd4203a38af17928be7b90689d8083603073622229ca7077b72d8e5a976e4" + ], + "markers": "python_version ~= '3.7'", + "version": "==5.0.0" + }, "certifi": { "hashes": [ "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872", @@ -140,19 +148,19 @@ }, "click": { "hashes": [ - "sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1", - "sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb" + "sha256:19a4baa64da924c5e0cd889aba8e947f280309f1a2ce0947a3e3a7bcb7cc72d6", + "sha256:977c213473c7665d3aa092b41ff12063227751c41d7b17165013e10069cc5cd2" ], - "markers": "python_version >= '3.6'", - "version": "==8.0.4" + "markers": "python_version >= '3.7'", + "version": "==8.1.0" }, "dateparser": { "hashes": [ - "sha256:faa2b97f51f3b5ff1ba2f17be90de2b733fb6191f89b4058787473e8202f3044", - "sha256:fec344db1f73d005182e214c0ff27313c748bbe0c1638ce9d48a809ddfdab2a0" + "sha256:038196b1f12c7397e38aad3d61588833257f6f552baa63a1499e6987fa8d42d9", + "sha256:9600874312ff28a41f96ec7ccdc73be1d1c44435719da47fea3339d55ff5a628" ], "index": "pypi", - "version": "==1.1.0" + "version": "==1.1.1" }, "ffmpeg-python": { "hashes": [ @@ -192,66 +200,89 @@ "index": "pypi", "version": "==0.8.0" }, + "google-auth": { + "hashes": [ + "sha256:3ba4d63cb29c1e6d5ffcc1c0623c03cf02ede6240a072f213084749574e691ab", + "sha256:60d449f8142c742db760f4c0be39121bc8d9be855555d784c252deaca1ced3f5" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", + "version": "==2.6.2" + }, + "google-auth-oauthlib": { + "hashes": [ + "sha256:24f67735513c4c7134dbde2f1dee5a1deb6acc8dfcb577d7bff30d213a28e7b0", + "sha256:30596b824fc6808fdaca2f048e4998cc40fb4b3599eaea66d28dc7085b36c5b8" + ], + "markers": "python_version >= '3.6'", + "version": "==0.5.1" + }, "greenlet": { "hashes": [ - "sha256:0051c6f1f27cb756ffc0ffbac7d2cd48cb0362ac1736871399a739b2885134d3", - "sha256:00e44c8afdbe5467e4f7b5851be223be68adb4272f44696ee71fe46b7036a711", - "sha256:013d61294b6cd8fe3242932c1c5e36e5d1db2c8afb58606c5a67efce62c1f5fd", - "sha256:049fe7579230e44daef03a259faa24511d10ebfa44f69411d99e6a184fe68073", - "sha256:14d4f3cd4e8b524ae9b8aa567858beed70c392fdec26dbdb0a8a418392e71708", - "sha256:166eac03e48784a6a6e0e5f041cfebb1ab400b394db188c48b3a84737f505b67", - "sha256:17ff94e7a83aa8671a25bf5b59326ec26da379ace2ebc4411d690d80a7fbcf23", - "sha256:1e12bdc622676ce47ae9abbf455c189e442afdde8818d9da983085df6312e7a1", - "sha256:21915eb821a6b3d9d8eefdaf57d6c345b970ad722f856cd71739493ce003ad08", - "sha256:288c6a76705dc54fba69fbcb59904ae4ad768b4c768839b8ca5fdadec6dd8cfd", - "sha256:2bde6792f313f4e918caabc46532aa64aa27a0db05d75b20edfc5c6f46479de2", - "sha256:32ca72bbc673adbcfecb935bb3fb1b74e663d10a4b241aaa2f5a75fe1d1f90aa", - "sha256:356b3576ad078c89a6107caa9c50cc14e98e3a6c4874a37c3e0273e4baf33de8", - "sha256:40b951f601af999a8bf2ce8c71e8aaa4e8c6f78ff8afae7b808aae2dc50d4c40", - "sha256:572e1787d1460da79590bf44304abbc0a2da944ea64ec549188fa84d89bba7ab", - "sha256:58df5c2a0e293bf665a51f8a100d3e9956febfbf1d9aaf8c0677cf70218910c6", - "sha256:64e6175c2e53195278d7388c454e0b30997573f3f4bd63697f88d855f7a6a1fc", - "sha256:7227b47e73dedaa513cdebb98469705ef0d66eb5a1250144468e9c3097d6b59b", - "sha256:7418b6bfc7fe3331541b84bb2141c9baf1ec7132a7ecd9f375912eca810e714e", - "sha256:7cbd7574ce8e138bda9df4efc6bf2ab8572c9aff640d8ecfece1b006b68da963", - "sha256:7ff61ff178250f9bb3cd89752df0f1dd0e27316a8bd1465351652b1b4a4cdfd3", - "sha256:833e1551925ed51e6b44c800e71e77dacd7e49181fdc9ac9a0bf3714d515785d", - "sha256:8639cadfda96737427330a094476d4c7a56ac03de7265622fcf4cfe57c8ae18d", - "sha256:8c5d5b35f789a030ebb95bff352f1d27a93d81069f2adb3182d99882e095cefe", - "sha256:8c790abda465726cfb8bb08bd4ca9a5d0a7bd77c7ac1ca1b839ad823b948ea28", - "sha256:8d2f1fb53a421b410751887eb4ff21386d119ef9cde3797bf5e7ed49fb51a3b3", - "sha256:903bbd302a2378f984aef528f76d4c9b1748f318fe1294961c072bdc7f2ffa3e", - "sha256:93f81b134a165cc17123626ab8da2e30c0455441d4ab5576eed73a64c025b25c", - "sha256:95e69877983ea39b7303570fa6760f81a3eec23d0e3ab2021b7144b94d06202d", - "sha256:9633b3034d3d901f0a46b7939f8c4d64427dfba6bbc5a36b1a67364cf148a1b0", - "sha256:97e5306482182170ade15c4b0d8386ded995a07d7cc2ca8f27958d34d6736497", - "sha256:9f3cba480d3deb69f6ee2c1825060177a22c7826431458c697df88e6aeb3caee", - "sha256:aa5b467f15e78b82257319aebc78dd2915e4c1436c3c0d1ad6f53e47ba6e2713", - "sha256:abb7a75ed8b968f3061327c433a0fbd17b729947b400747c334a9c29a9af6c58", - "sha256:aec52725173bd3a7b56fe91bc56eccb26fbdff1386ef123abb63c84c5b43b63a", - "sha256:b11548073a2213d950c3f671aa88e6f83cda6e2fb97a8b6317b1b5b33d850e06", - "sha256:b1692f7d6bc45e3200844be0dba153612103db241691088626a33ff1f24a0d88", - "sha256:b336501a05e13b616ef81ce329c0e09ac5ed8c732d9ba7e3e983fcc1a9e86965", - "sha256:b8c008de9d0daba7b6666aa5bbfdc23dcd78cafc33997c9b7741ff6353bafb7f", - "sha256:b92e29e58bef6d9cfd340c72b04d74c4b4e9f70c9fa7c78b674d1fec18896dc4", - "sha256:be5f425ff1f5f4b3c1e33ad64ab994eed12fc284a6ea71c5243fd564502ecbe5", - "sha256:dd0b1e9e891f69e7675ba5c92e28b90eaa045f6ab134ffe70b52e948aa175b3c", - "sha256:e30f5ea4ae2346e62cedde8794a56858a67b878dd79f7df76a0767e356b1744a", - "sha256:e6a36bb9474218c7a5b27ae476035497a6990e21d04c279884eb10d9b290f1b1", - "sha256:e859fcb4cbe93504ea18008d1df98dee4f7766db66c435e4882ab35cf70cac43", - "sha256:eb6ea6da4c787111adf40f697b4e58732ee0942b5d3bd8f435277643329ba627", - "sha256:ec8c433b3ab0419100bd45b47c9c8551248a5aee30ca5e9d399a0b57ac04651b", - "sha256:eff9d20417ff9dcb0d25e2defc2574d10b491bf2e693b4e491914738b7908168", - "sha256:f0214eb2a23b85528310dad848ad2ac58e735612929c8072f6093f3585fd342d", - "sha256:f276df9830dba7a333544bd41070e8175762a7ac20350786b322b714b0e654f5", - "sha256:f3acda1924472472ddd60c29e5b9db0cec629fbe3c5c5accb74d6d6d14773478", - "sha256:f70a9e237bb792c7cc7e44c531fd48f5897961701cdaa06cf22fc14965c496cf", - "sha256:f9d29ca8a77117315101425ec7ec2a47a22ccf59f5593378fc4077ac5b754fce", - "sha256:fa877ca7f6b48054f847b61d6fa7bed5cebb663ebc55e018fda12db09dcc664c", - "sha256:fdcec0b8399108577ec290f55551d926d9a1fa6cad45882093a7a07ac5ec147b" + "sha256:004aed447382d80a56ecc354a6d807f305e6c808714ce6ccbca4839c94fae81d", + "sha256:068d68fad6bd623e29a2d36e74538c9b9d6dc6464931cd27d93da6cfc6a7f242", + "sha256:06fd4075754009c9817c6b4e1dc0af4616de52757b6ca973a81c3c1aadc28257", + "sha256:1004cb542451814b12a4f38e835a47734e2b2c683acbf463d5ae76282a3974cf", + "sha256:10c358633a8b27bfc32d27114ef2ca2ddc9f1f89f1643d1157b85e1fdd695315", + "sha256:115bc25fefbdc692c4483e9ddb9011ccd0251590ed59dbfff0f4eb7050bf99c4", + "sha256:1d987a2579336792f73ae6b106c2f087e32afc8573fbf9566f123ac6d8cfb72f", + "sha256:2128d727fd1e8afba8e68feb2cdcf88c90163b69ddc9707722a3e491c5280720", + "sha256:230132c241fe284f93f2e7b3969e9b22bbd76ef98cf93e382c945d378907f5a4", + "sha256:23558f7bd08a663386c032ab8d302d613d2d02ae0c9758ad410bab6035b58d3d", + "sha256:255d520d3e4a5f16883b182e1a94219fe455ab4f50aaaf534bfd6d64ee728397", + "sha256:2a6bc19a728f6f643cfc89b876159a1a25a8f7d8700c013d48a73691f80b4550", + "sha256:379bed346ef8ba0a0e698b3c5975a44d15dd4a5bbff40bbd7fd548b445d5550b", + "sha256:3b12d0866759db93b0a893b4e50a7d7d1681519d2346c26695bb8bb2c652230e", + "sha256:40d491944f69e350e1e8b25f6ca49459824ede1678ec0cd4b5541f41edc06614", + "sha256:471484c7b9d7b7867263051aa81cdeed6e06b455e629a7f05eb91a6cb8bd0836", + "sha256:488c557080557bc01aabb3e1bda7225c68455b853733a8652857ac0d810dad1b", + "sha256:49c2e76e7aa81ba889b3c183e2341af3cc6161ee38852085110ae49d5b5d9a40", + "sha256:52d13ec90236e5935ed6da044e78faa1371d5116cc43fe6d7ca8994dd619ef96", + "sha256:57898c69a253d81f487787bdd538629fabd671fab8a9e31b041ca30965fd9556", + "sha256:5d577eef5beb5730ef01ab39983eb852a97c359b7a546809adf70c409f4b2ecc", + "sha256:6a41987c1474c9158a0c0c96611530a8f299bc547d35bee8add981b8b2534f74", + "sha256:6ae67b7df8db3626af8e042e9c6949cfa27d1a3bbbfdff29e45b72bb6673a650", + "sha256:6c42c27e9d12e8a481aff469ffe8dd4ce0484c354a418470960f760f6ae41e7c", + "sha256:6c4a90c9f6128b4d0905a89930bd325e0491574e5cb453f606bb7094a3197587", + "sha256:6e64518e5833ac2d9359b6d9bd4df2c0cf441a0f3a4eca9e735fbea99009fa70", + "sha256:6fd3a270c23c5b42d86a9c7c6b0229f23ee4a7a4cabdaaa1693ad7a0982d13cb", + "sha256:70db73351e0fcf11a76288c47a0469d9a330bcb2e7618c5eb57432b8caa82403", + "sha256:771f401692046845626cbdf1dd0f04e999413ede0ee9ad39033fe30b5fa2e845", + "sha256:7935026ec61b967cbc6b746c0ca75c1651ea118d7fee4d259cff9e6866153374", + "sha256:7b76b1cac9baac1980210e29145800954e7b42e91ef69c4d695de1cab87ce41f", + "sha256:7e3f37c11b6699b1a1e0fcc0e88829dba4f2866546381b05ab8b3f4db645a823", + "sha256:8370fa65ad421484894f559055f951843754153b72b9bca2ebdc5288efe2e3f0", + "sha256:8ae9c443d44a4e23252632e4d7775f419f992d0df3eff923e23775f5cc551d39", + "sha256:8b31d85f2781e44f1ffaaf7ea07f484e7d42317c677c355fa77b4a1a4bea7394", + "sha256:8b450336b27f3b375cadc474c6704838eaa8dd3ca312aac3bb69d92264a8e638", + "sha256:9ce84357388a76d886febff4e50e321c212ffd3248b590960b2da6e02404a5c9", + "sha256:a23e986fb0ba8e7407286add41fa0d4207be44e3dce1b04789f4757800eca1cf", + "sha256:a81610ee00d0da9cd2c8679479b7791149365b6dfb3971b01b22ee29b04787ce", + "sha256:b4e40444975e5ab0ed3004369209c39a28e084951daaeee4919f164b6b849b14", + "sha256:b66600de16702b9dfa74bea34524b55183a2183e5fd92f20fe6c2fcae550a64c", + "sha256:ba6ee18694d3673796b7a31b7d21254e87e9e43ca5be56f323fd396111255315", + "sha256:bd03837da28293baa39bdfc3cada69e2f8807f423ae06168aa28d2b32c63a6b6", + "sha256:bd2192070f88c0778ae1d68a0980fdece3473498c1db37f3794e3454f91e3ecf", + "sha256:c1f6f1a3cc013012cd1da913c40b13e6d721046a8c8a0ea0cde94069645a75db", + "sha256:ce10a8e7e067bde3c1fbf494d2b8859db510206030b0b67bc3af90b0eb1887b9", + "sha256:d31386d208303a5a6cf0819ef9f6db6680bab9e4ca8e48adb3d4b26ead89beb7", + "sha256:d83b3af53b201970973c5574b39df226746194063bb248a53fd12b470ac34319", + "sha256:df9657b212c054ac6d803290d7c4bcd7790af0b725984fce1eeb0a1e3f2d9798", + "sha256:e576e5fd3f129e6b3595dc734ac7f2b8c548f19ef07781194bc538dc9c0cdbbc", + "sha256:e7400358558094c1bcedc75f3b3c4f400c53130b44833848890a99968dee6a64", + "sha256:eb6a385f8577d30e4cb43dd555fb134ddaae1edeb84205e09dabec332bf49fd0", + "sha256:f27f0875e0873f6bf5df09a456bfcac0667824cabac4cad30b43f36e0382ffe7", + "sha256:fcd4a6d04995f1d66bc78b503e4e59ae72fd32aaec4f661657fe5ae5c1aa4ce3" ], "markers": "python_version >= '3' and platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32')))))", - "version": "==1.1.2" + "version": "==2.0.0a2" + }, + "gspread": { + "hashes": [ + "sha256:a347197628fa1885dcc860701fb1b3f5471386aa863a71cfe232b6473c6fea1b", + "sha256:be2220e19723570ed98e8b8eb6a5b6e04afa0f08ec1f08b89e217c354488a047" + ], + "index": "pypi", + "version": "==5.3.0" }, "idna": { "hashes": [ @@ -270,18 +301,18 @@ }, "instaloader": { "hashes": [ - "sha256:9615a12a5a01a8b6c9d99a2a047b21d81b341cfd77656b9261bda30ece0cd562" + "sha256:7fa6147810eedcc1dedcdec8cfa1f220c9379ab8faeab6a336a7c181d944e2e4" ], "index": "pypi", - "version": "==4.8.4" + "version": "==4.9" }, "jmespath": { "hashes": [ - "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9", - "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f" + "sha256:a490e280edd1f57d6de88636992d05b71e97d69a26a19f058ecf7d304474bf5e", + "sha256:e8dcd576ed616f14ec02eed0005c85973b5890083313860136657e24784e4c04" ], - "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==0.10.0" + "markers": "python_version >= '3.7'", + "version": "==1.0.0" }, "loguru": { "hashes": [ @@ -363,7 +394,7 @@ "sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1", "sha256:9c9f243fcec7f410f138cb12c21c84c64fde4195481a30c9bfb05b5f003adfed" ], - "markers": "python_version >= '3.5' and python_version < '4'", + "markers": "python_version >= '3.5' and python_version < '4.0'", "version": "==1.45.1" }, "numpy": { @@ -392,6 +423,14 @@ "markers": "python_version < '3.10' and platform_machine != 'aarch64' and platform_machine != 'arm64'", "version": "==1.22.3" }, + "oauthlib": { + "hashes": [ + "sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2", + "sha256:6db33440354787f9b7f3a6dbd4febf5d0f93758354060e802f6c06cb493022fe" + ], + "markers": "python_version >= '3.6'", + "version": "==3.2.0" + }, "packaging": { "hashes": [ "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", @@ -512,6 +551,24 @@ ], "version": "==0.4.8" }, + "pyasn1-modules": { + "hashes": [ + "sha256:0845a5582f6a02bb3e1bde9ecfc4bfcae6ec3210dd270522fee602365430c3f8", + "sha256:0fe1b68d1e486a1ed5473f1302bd991c1611d319bba158e98b106ff86e1d7199", + "sha256:15b7c67fabc7fc240d87fb9aabf999cf82311a6d6fb2c70d00d3d0604878c811", + "sha256:426edb7a5e8879f1ec54a1864f16b882c2837bfd06eee62f2c982315ee2473ed", + "sha256:65cebbaffc913f4fe9e4808735c95ea22d7a7775646ab690518c056784bc21b4", + "sha256:905f84c712230b2c592c19470d3ca8d552de726050d1d1716282a1f6146be65e", + "sha256:a50b808ffeb97cb3601dd25981f6b016cbb3d31fbf57a8b8a87428e6158d0c74", + "sha256:a99324196732f53093a84c4369c996713eb8c89d360a496b599fb1a9c47fc3eb", + "sha256:b80486a6c77252ea3a3e9b1e360bc9cf28eaac41263d173c032581ad2f20fe45", + "sha256:c29a5e5cc7a3f05926aff34e097e84f8589cd790ce0ed41b67aed6857b26aafd", + "sha256:cbac4bc38d117f2a49aeedec4407d23e8866ea4ac27ff2cf7fb3e5b570df19e0", + "sha256:f39edd8c4ecaa4556e989147ebf219227e2cd2e8a43c7e7fcb1f1c18c5fd6a3d", + "sha256:fe0644d9ab041506b62782e92b06b8c68cca799e1a9636ec398675459e031405" + ], + "version": "==0.2.8" + }, "pycryptodomex": { "hashes": [ "sha256:1ca8e1b4c62038bb2da55451385246f51f412c5f5eabd64812c01766a5989b4a", @@ -575,11 +632,11 @@ }, "pytest": { "hashes": [ - "sha256:b555252a95bbb2a37a97b5ac2eb050c436f7989993565f5e0c9128fcaacadd0e", - "sha256:f1089d218cfcc63a212c42896f1b7fbf096874d045e1988186861a1a87d27b47" + "sha256:841132caef6b1ad17a9afde46dc4f6cfa59a05f9555aae5151f73bdf2820ca63", + "sha256:92f723789a8fdd7180b6b06483874feca4c48a5c76968e03bb3e7f806a1869ea" ], "markers": "python_version >= '3.7'", - "version": "==7.1.0" + "version": "==7.1.1" }, "python-dateutil": { "hashes": [ @@ -591,10 +648,10 @@ }, "pytz": { "hashes": [ - "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c", - "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326" + "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7", + "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c" ], - "version": "==2021.3" + "version": "==2022.1" }, "pytz-deprecation-shim": { "hashes": [ @@ -685,9 +742,7 @@ "version": "==2022.3.2" }, "requests": { - "extras": [ - "socks" - ], + "extras": [], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" @@ -695,12 +750,20 @@ "index": "pypi", "version": "==2.27.1" }, + "requests-oauthlib": { + "hashes": [ + "sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5", + "sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.3.1" + }, "rsa": { "hashes": [ "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb" ], - "markers": "python_version >= '3.6' and python_version < '4'", + "markers": "python_version >= '3.6' and python_version < '4.0'", "version": "==4.8" }, "s3transfer": { @@ -790,11 +853,11 @@ }, "tzdata": { "hashes": [ - "sha256:3eee491e22ebfe1e5cfcc97a4137cd70f092ce59144d81f8924a844de05ba8f5", - "sha256:68dbe41afd01b867894bbdfd54fa03f468cfa4f0086bfb4adcd8de8f24f3ee21" + "sha256:238e70234214138ed7b4e8a0fab0e5e13872edab3be586ab8198c407620e2ab9", + "sha256:8b536a8ec63dc0751342b3984193a3118f8fca2afe25752bb9b7fffd398552d3" ], "markers": "python_version >= '3.6'", - "version": "==2021.5" + "version": "==2022.1" }, "tzlocal": { "hashes": [ @@ -806,11 +869,11 @@ }, "urllib3": { "hashes": [ - "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", - "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" + "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", + "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", - "version": "==1.26.8" + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", + "version": "==1.26.9" }, "websockets": { "hashes": [ @@ -899,6 +962,35 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==2.9.1" }, + "black": { + "hashes": [ + "sha256:06f9d8846f2340dfac80ceb20200ea5d1b3f181dd0556b47af4e8e0b24fa0a6b", + "sha256:10dbe6e6d2988049b4655b2b739f98785a884d4d6b85bc35133a8fb9a2233176", + "sha256:2497f9c2386572e28921fa8bec7be3e51de6801f7459dffd6e62492531c47e09", + "sha256:30d78ba6bf080eeaf0b7b875d924b15cd46fec5fd044ddfbad38c8ea9171043a", + "sha256:328efc0cc70ccb23429d6be184a15ce613f676bdfc85e5fe8ea2a9354b4e9015", + "sha256:35020b8886c022ced9282b51b5a875b6d1ab0c387b31a065b84db7c33085ca79", + "sha256:5795a0375eb87bfe902e80e0c8cfaedf8af4d49694d69161e5bd3206c18618bb", + "sha256:5891ef8abc06576985de8fa88e95ab70641de6c1fca97e2a15820a9b69e51b20", + "sha256:637a4014c63fbf42a692d22b55d8ad6968a946b4a6ebc385c5505d9625b6a464", + "sha256:67c8301ec94e3bcc8906740fe071391bce40a862b7be0b86fb5382beefecd968", + "sha256:6d2fc92002d44746d3e7db7cf9313cf4452f43e9ea77a2c939defce3b10b5c82", + "sha256:6ee227b696ca60dd1c507be80a6bc849a5a6ab57ac7352aad1ffec9e8b805f21", + "sha256:863714200ada56cbc366dc9ae5291ceb936573155f8bf8e9de92aef51f3ad0f0", + "sha256:9b542ced1ec0ceeff5b37d69838106a6348e60db7b8fdd245294dc1d26136265", + "sha256:a6342964b43a99dbc72f72812bf88cad8f0217ae9acb47c0d4f141a6416d2d7b", + "sha256:ad4efa5fad66b903b4a5f96d91461d90b9507a812b3c5de657d544215bb7877a", + "sha256:bc58025940a896d7e5356952228b68f793cf5fcb342be703c3a2669a1488cb72", + "sha256:cc1e1de68c8e5444e8f94c3670bb48a2beef0e91dddfd4fcc29595ebd90bb9ce", + "sha256:cee3e11161dde1b2a33a904b850b0899e0424cc331b7295f2a9698e79f9a69a0", + "sha256:e3556168e2e5c49629f7b0f377070240bd5511e45e25a4497bb0073d9dda776a", + "sha256:e8477ec6bbfe0312c128e74644ac8a02ca06bcdb8982d4ee06f209be28cdf163", + "sha256:ee8f1f7228cce7dffc2b464f07ce769f478968bfb3dd1254a4c2eeed84928aad", + "sha256:fd57160949179ec517d32ac2ac898b5f20d68ed1a9c977346efbac9c2f1e779d" + ], + "index": "pypi", + "version": "==22.3.0" + }, "certifi": { "hashes": [ "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872", @@ -914,6 +1006,14 @@ "markers": "python_version >= '3'", "version": "==2.0.12" }, + "click": { + "hashes": [ + "sha256:19a4baa64da924c5e0cd889aba8e947f280309f1a2ce0947a3e3a7bcb7cc72d6", + "sha256:977c213473c7665d3aa092b41ff12063227751c41d7b17165013e10069cc5cd2" + ], + "markers": "python_version >= '3.7'", + "version": "==8.1.0" + }, "coverage": { "extras": [ "toml" @@ -1005,11 +1105,11 @@ }, "jinja2": { "hashes": [ - "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8", - "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7" + "sha256:539835f51a74a69f41b848a9645dbdc35b4f20a3b601e2d9a7e22947b15ff119", + "sha256:640bed4bb501cbd17194b3cace1dc2126f5b619cf068a726b98192a0fde74ae9" ], - "markers": "python_version >= '3.6'", - "version": "==3.0.3" + "markers": "python_version >= '3.7'", + "version": "==3.1.1" }, "markupsafe": { "hashes": [ @@ -1057,6 +1157,13 @@ "markers": "python_version >= '3.7'", "version": "==2.1.1" }, + "mypy-extensions": { + "hashes": [ + "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d", + "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8" + ], + "version": "==0.4.3" + }, "packaging": { "hashes": [ "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", @@ -1065,6 +1172,21 @@ "markers": "python_version >= '3.6'", "version": "==21.3" }, + "pathspec": { + "hashes": [ + "sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a", + "sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1" + ], + "version": "==0.9.0" + }, + "platformdirs": { + "hashes": [ + "sha256:7535e70dfa32e84d4b34996ea99c5e432fa29a708d0f4e394bbcb2a8faa4f16d", + "sha256:bcae7cab893c2d310a711b70b24efb93334febe65f8de776ee320b517471e227" + ], + "markers": "python_version >= '3.7'", + "version": "==2.5.1" + }, "pluggy": { "hashes": [ "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159", @@ -1099,11 +1221,11 @@ }, "pytest": { "hashes": [ - "sha256:b555252a95bbb2a37a97b5ac2eb050c436f7989993565f5e0c9128fcaacadd0e", - "sha256:f1089d218cfcc63a212c42896f1b7fbf096874d045e1988186861a1a87d27b47" + "sha256:841132caef6b1ad17a9afde46dc4f6cfa59a05f9555aae5151f73bdf2820ca63", + "sha256:92f723789a8fdd7180b6b06483874feca4c48a5c76968e03bb3e7f806a1869ea" ], "markers": "python_version >= '3.7'", - "version": "==7.1.0" + "version": "==7.1.1" }, "pytest-cov": { "hashes": [ @@ -1123,23 +1245,21 @@ }, "pytest-metadata": { "hashes": [ - "sha256:576055b8336dd4a9006dd2a47615f76f2f8c30ab12b1b1c039d99e834583523f", - "sha256:71b506d49d34e539cc3cfdb7ce2c5f072bea5c953320002c95968e0238f8ecf1" + "sha256:141ba561a17659cda00cf74e7c7cf6103bab4550acad76a46f893339de63b1df", + "sha256:5cdb6aeea8ba9109181cf9f149c8a3ae1430ff7e44506a8f866af8a98ca46301" ], "index": "pypi", - "version": "==1.11.0" + "version": "==2.0.1" }, "pytz": { "hashes": [ - "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c", - "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326" + "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7", + "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c" ], - "version": "==2021.3" + "version": "==2022.1" }, "requests": { - "extras": [ - "socks" - ], + "extras": [], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" @@ -1156,11 +1276,11 @@ }, "sphinx": { "hashes": [ - "sha256:5da895959511473857b6d0200f56865ed62c31e8f82dd338063b84ec022701fe", - "sha256:6caad9786055cb1fa22b4a365c1775816b876f91966481765d7d50e9f0dd35cc" + "sha256:7bf8ca9637a4ee15af412d1a1d9689fec70523a68ca9bb9127c2f3eeb344e2e6", + "sha256:ebf612653238bcc8f4359627a9b7ce44ede6fdd75d9d30f68255c7383d3a6226" ], "index": "pypi", - "version": "==4.4.0" + "version": "==4.5.0" }, "sphinx-rtd-theme": { "hashes": [ @@ -1226,13 +1346,21 @@ "markers": "python_version >= '3.7'", "version": "==2.0.1" }, + "typing-extensions": { + "hashes": [ + "sha256:1a9462dcc3347a79b1f1c0271fbe79e844580bb598bafa1ed208b94da3cdcd42", + "sha256:21c85e0fe4b9a155d0799430b0ad741cdce7e359660ccbd8b530613e8df88ce2" + ], + "markers": "python_version < '3.10'", + "version": "==4.1.1" + }, "urllib3": { "hashes": [ - "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", - "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" + "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", + "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", - "version": "==1.26.8" + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", + "version": "==1.26.9" }, "zipp": { "hashes": [ diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index b11d27a..a3103a6 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -64,6 +64,43 @@ class BitchuteScraper(Scraper): if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None: return True + def get_profile(self, channel: Channel) -> dict: + + base_url = "https://www.bitchute.com/channel/%s/" % channel.url + + session = requests.session() + response = session.get(base_url) + soup = BeautifulSoup(response.content, 'html.parser') + + csrftoken = session.cookies['csrftoken'] + csrfmiddlewaretoken = soup.find('input', {'name' : 'csrfmiddlewaretoken'})['value'] + + about_soup = soup.find('div', {'id' : 'channel-about'}) + info_list = about_soup.find('div', {'class' : 'channel-about-details'}).find_all('p') + description_soup = about_soup.find('div', {'id' : 'channel-description'}) + + headers = {'Referer': base_url} + data = { + 'csrftoken': csrftoken, + 'csrfmiddlewaretoken': csrfmiddlewaretoken} + + response = session.post(base_url + 'counts/', data = data, headers = headers) + counts = json.loads(response.text) + + profile = { + 'description' : description_soup.text.strip(), + 'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)], + 'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')), + 'videos' : int(info_list[1].text.split('videos')[0].strip()), + 'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'], + 'owner_name' : soup.find('p', {'class' : 'owner'}).text, + 'category' : info_list[-1].text.split('Category')[1].strip(), + 'image' : about_soup.find('img', {'alt' : 'Channel Image'})['data-src'], + 'subscribers': counts['subscriber_count'], + 'views': int(counts['about_view_count'].split(' ')[0])} + + return profile + #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# def strip_tags(html, convert_newlines=True): @@ -420,29 +457,3 @@ def get_videos_user(session, user, csrftoken, detail): # before the video, which is weird yield comment #-----------------------------------------------------------------------------# - -def get_about(user): - """ - Extract fields from channel's "About" tab - """ - base_url = "https://www.bitchute.com/channel/%s/" % user - - response = requests.get(base_url) - soup = BeautifulSoup(response.content, 'html.parser') - - about_soup = soup.find('div', {'id' : 'channel-about'}) - info_list = about_soup.find('div', {'class' : 'channel-about-details'}).find_all('p') - description_soup = about_soup.find('div', {'id' : 'channel-description'}) - - about = { - 'description' : description_soup.text, - 'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)], - 'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')), - 'videos' : int(info_list[1].text.split('videos')[0].strip()), - 'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'], - 'owner_name' : soup.find('p', {'class' : 'owner'}).text, - 'category' : info_list[-1].text.split('Category')[1].strip(), - 'image' : about_soup.find('img', {'alt' : 'Channel Image'})['data-src'] - } - - return about diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index f90f2a3..aa698ad 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -51,6 +51,13 @@ class GabScraper(Scraper): raw_data=json.dumps(post), archived_urls=archived_urls) - def can_handle(self, channel): + def can_handle(self, channel: Channel) -> bool: if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None: - return True \ No newline at end of file + return True + + def get_profile(self, channel: Channel) -> dict: + client = Garc(profile = 'main') + username = self.get_username_from_url(channel.url) + profile = list(client.user(username))[0] + + return profile \ No newline at end of file diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index 4fb15cc..5e8d7ac 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -68,4 +68,11 @@ class GettrScraper(Scraper): def url_to_key(self, url: str, content_type: str) -> str: ext = '.' + content_type.split('/')[-1] key = urlparse(url).path.split('/')[-2] + ext - return key \ No newline at end of file + return key + + def get_profile(self, channel: Channel) -> dict: + client = client = PublicClient() + username = self.get_username_from_url(channel.url) + profile = client.user_info(username) + + return profile \ No newline at end of file diff --git a/cisticola/scraper/instagram.py b/cisticola/scraper/instagram.py index f9ae76e..a5613a4 100644 --- a/cisticola/scraper/instagram.py +++ b/cisticola/scraper/instagram.py @@ -100,4 +100,27 @@ class InstagramScraper(Scraper): def can_handle(self, channel): if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None: - return True \ No newline at end of file + return True + + def get_profile(self, channel: Channel) -> dict: + + username = self.get_username_from_url(channel.url) + + loader = instaloader.Instaloader( + quiet = True, + download_comments = False, + save_metadata = False) + + loader.login( + user = os.environ['INSTAGRAM_USERNAME'], + passwd = os.environ['INSTAGRAM_PASSWORD']) + + user_profile = instaloader.Profile.from_username( + context = loader.context, + username = username) + + profile = user_profile._asdict() + profile['followers'] = user_profile.followers + profile['followees'] = user_profile.followees + + return profile \ No newline at end of file diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index eb7ec04..980653c 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -77,4 +77,12 @@ class OdyseeScraper(Scraper): key = urlparse(url).path.split('/')[-2] ext = content_type.split('/')[-1] - return f'{key}.{ext}' \ No newline at end of file + return f'{key}.{ext}' + + def get_profile(self, channel: Channel) -> dict: + + username = self.get_username_from_url(channel.url) + odysee_channel = OdyseeChannel(channel_name = username) + profile = odysee_channel.info + + return profile \ No newline at end of file diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index 9863fb0..252239e 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -57,6 +57,13 @@ class RumbleScraper(Scraper): if channel.platform == "Rumble" and self.get_username_from_url(channel.url) is not None: return True + def get_profile(self, channel: Channel) -> dict: + + username = self.get_username_from_url(channel.url) + profile = get_channel_profile(username = username) + + return profile + #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# def get_media_url(url): @@ -90,10 +97,10 @@ def process_video(video): return info -def get_channel_videos(channel): +def get_channel_videos(username): page = 1 - channel_url = f'{BASE_URL}/c/{channel}?page=' + channel_url = f'{BASE_URL}/c/{username}?page=' while True: url = channel_url + str(page) @@ -111,4 +118,21 @@ def get_channel_videos(channel): page += 1 +def get_channel_profile(username): + + channel_url = f'{BASE_URL}/c/{username}' + r = make_request(url = channel_url) + soup = BeautifulSoup(r.content, features = 'lxml') + + verified_svg = soup.find('h1').find('svg', {'class' : 'listing-header--verified'}) + + profile = { + 'name': soup.find('h1').text, + 'verified': verified_svg is not None, + 'thumbnail': soup.find('img', {'class' : 'listing-header--thumb'})['src'], + 'cover': soup.find('img', {'class' : 'listing-header--backsplash-img'})['src'], + 'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text} + + return profile + #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index ec5b292..5a99951 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -54,3 +54,11 @@ class TelegramSnscrapeScraper(Scraper): raw_data=post.json(), archived_urls=archived_urls ) + + def get_profile(self, channel: Channel) -> dict: + + scr = snscrape.modules.telegram.TelegramChannelScraper( + channel.screenname) + + profile = scr._get_entity().__dict__ + return profile \ No newline at end of file diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py index b8231bc..c7d9b52 100644 --- a/cisticola/scraper/telegram_telethon.py +++ b/cisticola/scraper/telegram_telethon.py @@ -7,6 +7,7 @@ from pathlib import Path from loguru import logger from telethon.sync import TelegramClient +from telethon.tl.functions.channels import GetFullChannelRequest from cisticola.base import Channel, ScraperResult from cisticola.scraper.base import Scraper @@ -74,3 +75,17 @@ class TelegramTelethonScraper(Scraper): date_archived=datetime.now(timezone.utc), raw_data=json.dumps(post.to_dict(), default=str), archived_urls=archived_urls) + + def get_profile(self, channel: Channel) -> dict: + + username = self.get_username_from_url(channel.url) + + api_id = os.environ['TELEGRAM_API_ID'] + api_hash = os.environ['TELEGRAM_API_HASH'] + phone = os.environ['TELEGRAM_PHONE'] + + with TelegramClient(phone, api_id, api_hash) as client: + full_channel = client(GetFullChannelRequest(channel = username)) + profile = full_channel.__dict__ + + return profile \ No newline at end of file diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index 8209282..d8a8c9d 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -2,7 +2,7 @@ from datetime import datetime, timezone from typing import Generator from urllib.parse import urlparse, parse_qs -from snscrape.modules.twitter import TwitterProfileScraper, Video, Gif, Photo +from snscrape.modules.twitter import TwitterProfileScraper, TwitterUserScraper, Video, Gif, Photo from loguru import logger from cisticola.base import Channel, ScraperResult @@ -86,4 +86,11 @@ class TwitterScraper(Scraper): ext = '' key = parsed_url.path.split('/')[-1] + ext - return key \ No newline at end of file + return key + + def get_profile(self, channel: Channel) -> dict: + + scraper = TwitterUserScraper(channel.platform_id) + + profile = scraper._get_entity().__dict__ + return profile \ No newline at end of file diff --git a/cisticola/scraper/vkontakte.py b/cisticola/scraper/vkontakte.py index 7ca5659..78dfe98 100644 --- a/cisticola/scraper/vkontakte.py +++ b/cisticola/scraper/vkontakte.py @@ -77,4 +77,12 @@ class VkontakteScraper(Scraper): ext = '.mp4' key = path.split('/')[-1] + ext - return key \ No newline at end of file + return key + + def get_profile(self, channel: Channel) -> dict: + + username = self.get_username_from_url(channel.url) + scraper = VKontakteUserScraper(username) + + profile = scraper._get_entity().__dict__ + return profile \ No newline at end of file diff --git a/cisticola/scraper/youtube.py b/cisticola/scraper/youtube.py index 20ae6a3..88c75a4 100644 --- a/cisticola/scraper/youtube.py +++ b/cisticola/scraper/youtube.py @@ -76,4 +76,19 @@ class YoutubeScraper(Scraper): def can_handle(self, channel): if channel.platform == "Youtube" and channel.url: - return True \ No newline at end of file + return True + + def get_profile(self, channel: Channel) -> dict: + + ydl_opts = {} + ydl = yt_dlp.YoutubeDL(ydl_opts) + + meta = None + try: + meta = ydl.extract_info( + channel.url, + process=False) + except yt_dlp.utils.DownloadError as e: + raise e + + return meta \ No newline at end of file diff --git a/tests/scraper/bitchute.py b/tests/scraper/bitchute.py index c32e840..687a6b0 100644 --- a/tests/scraper/bitchute.py +++ b/tests/scraper/bitchute.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import BitchuteScraper @@ -14,3 +16,10 @@ def test_scrape_bitchute_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['bitchute'])] controller.register_scraper(scraper = BitchuteScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_bitchute_profile(channel_kwargs): + + scraper = BitchuteScraper() + channel = Channel(**channel_kwargs['bitchute']) + scraper.get_profile(channel=channel) \ No newline at end of file From 67d1abf024d3d498f9994f9a1a93f820014b8b53 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Mon, 28 Mar 2022 21:11:34 -0500 Subject: [PATCH 2/5] added methods for extracting channel profile metadata, and tests --- cisticola/scraper/bitchute.py | 25 +++++++++++++++++++++---- cisticola/scraper/rumble.py | 9 +++++---- cisticola/scraper/telegram_telethon.py | 2 +- pytest.ini | 3 +++ tests/scraper/gab.py | 9 +++++++++ tests/scraper/gettr.py | 9 +++++++++ tests/scraper/instagram.py | 9 +++++++++ tests/scraper/odysee.py | 9 +++++++++ tests/scraper/rumble.py | 9 +++++++++ tests/scraper/telegram_snscrape.py | 9 +++++++++ tests/scraper/telegram_telethon.py | 9 +++++++++ tests/scraper/twitter.py | 9 +++++++++ tests/scraper/vkontakte.py | 9 +++++++++ tests/scraper/youtube.py | 9 +++++++++ 14 files changed, 120 insertions(+), 9 deletions(-) diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index a3103a6..d8d3f0b 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -66,12 +66,13 @@ class BitchuteScraper(Scraper): def get_profile(self, channel: Channel) -> dict: - base_url = "https://www.bitchute.com/channel/%s/" % channel.url + base_url = channel.url session = requests.session() response = session.get(base_url) soup = BeautifulSoup(response.content, 'html.parser') + canonical_url = soup.find('link', {'id' : 'canonical'})['href'] csrftoken = session.cookies['csrftoken'] csrfmiddlewaretoken = soup.find('input', {'name' : 'csrfmiddlewaretoken'})['value'] @@ -84,7 +85,7 @@ class BitchuteScraper(Scraper): 'csrftoken': csrftoken, 'csrfmiddlewaretoken': csrfmiddlewaretoken} - response = session.post(base_url + 'counts/', data = data, headers = headers) + response = session.post(canonical_url + 'counts/', data = data, headers = headers) counts = json.loads(response.text) profile = { @@ -93,9 +94,9 @@ class BitchuteScraper(Scraper): 'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')), 'videos' : int(info_list[1].text.split('videos')[0].strip()), 'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'], - 'owner_name' : soup.find('p', {'class' : 'owner'}).text, + 'owner_name' : decode_cfemail(soup.find('p', {'class' : 'owner'}).find('span', {'class': "__cf_email__"})['data-cfemail']), 'category' : info_list[-1].text.split('Category')[1].strip(), - 'image' : about_soup.find('img', {'alt' : 'Channel Image'})['data-src'], + 'image' : about_soup.find('img', {'alt' : 'Channel Image'}).get('data-src'), 'subscribers': counts['subscriber_count'], 'views': int(counts['about_view_count'].split(' ')[0])} @@ -456,4 +457,20 @@ def get_videos_user(session, user, csrftoken, detail): # these need to be yielded *after* the video because else the result file will have the comments # before the video, which is weird yield comment + #-----------------------------------------------------------------------------# + +def decode_cfemail(cfemail): + + """https://stackoverflow.com/questions/36911296/scraping-of-protected-email + """ + + email = "" + k = int(cfemail[:2], 16) + + for i in range(2, len(cfemail)-1, 2): + email += chr(int(cfemail[i:i+2], 16)^k) + + return email + +#---------------------------------------------------------------------------# \ No newline at end of file diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index 252239e..32e40e8 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -125,14 +125,15 @@ def get_channel_profile(username): soup = BeautifulSoup(r.content, features = 'lxml') verified_svg = soup.find('h1').find('svg', {'class' : 'listing-header--verified'}) + thumbnail_soup = soup.find('img', {'class' : 'listing-header--thumb'}) + cover_soup = soup.find('img', {'class' : 'listing-header--backsplash-img'}) profile = { 'name': soup.find('h1').text, 'verified': verified_svg is not None, - 'thumbnail': soup.find('img', {'class' : 'listing-header--thumb'})['src'], - 'cover': soup.find('img', {'class' : 'listing-header--backsplash-img'})['src'], - 'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text} - + 'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None, + 'cover': cover_soup.get('src') if cover_soup else None, + 'subscribers': int(soup.find('span', {'class' : 'subscribe-button-count'}).text)} return profile #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py index e02ccde..b300551 100644 --- a/cisticola/scraper/telegram_telethon.py +++ b/cisticola/scraper/telegram_telethon.py @@ -138,7 +138,7 @@ class TelegramTelethonScraper(Scraper): date_archived=datetime.now(timezone.utc), raw_data=json.dumps(post.to_dict(), default=str), archived_urls=archived_urls, - media_archived=archive_media)) + media_archived=archive_media) def get_profile(self, channel: Channel) -> dict: diff --git a/pytest.ini b/pytest.ini index f3545f6..8d9973f 100644 --- a/pytest.ini +++ b/pytest.ini @@ -11,6 +11,9 @@ addopts = --cov-report html:reports/coverage --html='reports/tests.html' --self-contained-html +markers = + profile: marks tests for only extracting channel metadata (deselect with '-m + "not profile"') filterwarnings = ignore:the imp module is deprecated:DeprecationWarning ignore:The localize method is no longer necessary, as this time zone supports the fold attribute diff --git a/tests/scraper/gab.py b/tests/scraper/gab.py index c864c37..943f40f 100644 --- a/tests/scraper/gab.py +++ b/tests/scraper/gab.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import GabScraper @@ -14,3 +16,10 @@ def test_scrape_gab_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['gab'])] controller.register_scraper(scraper = GabScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_gab_profile(channel_kwargs): + + scraper = GabScraper() + channel = Channel(**channel_kwargs['gab']) + scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/gettr.py b/tests/scraper/gettr.py index 7dd2f24..6a3b70e 100644 --- a/tests/scraper/gettr.py +++ b/tests/scraper/gettr.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import GettrScraper @@ -14,3 +16,10 @@ def test_scrape_gettr_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['gettr'])] controller.register_scraper(scraper = GettrScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_gettr_profile(channel_kwargs): + + scraper = GettrScraper() + channel = Channel(**channel_kwargs['gettr']) + scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/instagram.py b/tests/scraper/instagram.py index 0beb546..840d6fa 100644 --- a/tests/scraper/instagram.py +++ b/tests/scraper/instagram.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import InstagramScraper @@ -14,3 +16,10 @@ def test_scrape_instagram_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['instagram'])] controller.register_scraper(scraper = InstagramScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_instagram_profile(channel_kwargs): + + scraper = InstagramScraper() + channel = Channel(**channel_kwargs['instagram']) + scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/odysee.py b/tests/scraper/odysee.py index f97700e..8eba07d 100644 --- a/tests/scraper/odysee.py +++ b/tests/scraper/odysee.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import OdyseeScraper @@ -14,3 +16,10 @@ def test_scrape_odysee_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['odysee'])] controller.register_scraper(scraper = OdyseeScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_odysee_profile(channel_kwargs): + + scraper = OdyseeScraper() + channel = Channel(**channel_kwargs['odysee']) + scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/rumble.py b/tests/scraper/rumble.py index 5f640e5..f64b24f 100644 --- a/tests/scraper/rumble.py +++ b/tests/scraper/rumble.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import RumbleScraper @@ -14,3 +16,10 @@ def test_scrape_rumble_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['rumble'])] controller.register_scraper(scraper = RumbleScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_rumble_profile(channel_kwargs): + + scraper = RumbleScraper() + channel = Channel(**channel_kwargs['rumble']) + scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/telegram_snscrape.py b/tests/scraper/telegram_snscrape.py index 3848780..420b917 100644 --- a/tests/scraper/telegram_snscrape.py +++ b/tests/scraper/telegram_snscrape.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import TelegramSnscrapeScraper @@ -14,3 +16,10 @@ def test_scrape_telegram_snscrape_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['telegram'])] controller.register_scraper(scraper = TelegramSnscrapeScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_telegram_snscrape_profile(channel_kwargs): + + scraper = TelegramSnscrapeScraper() + channel = Channel(**channel_kwargs['telegram']) + scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/telegram_telethon.py b/tests/scraper/telegram_telethon.py index c015631..1942fca 100644 --- a/tests/scraper/telegram_telethon.py +++ b/tests/scraper/telegram_telethon.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import TelegramTelethonScraper @@ -14,3 +16,10 @@ def test_scrape_telegram_telethon_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['telegram'])] controller.register_scraper(scraper = TelegramTelethonScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_telegram_telethon_profile(channel_kwargs): + + scraper = TelegramTelethonScraper() + channel = Channel(**channel_kwargs['telegram']) + scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/twitter.py b/tests/scraper/twitter.py index bd79a6a..7512b6a 100644 --- a/tests/scraper/twitter.py +++ b/tests/scraper/twitter.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import TwitterScraper @@ -14,3 +16,10 @@ def test_scrape_twitter_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['twitter'])] controller.register_scraper(scraper = TwitterScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_twitter_profile(channel_kwargs): + + scraper = TwitterScraper() + channel = Channel(**channel_kwargs['twitter']) + scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/vkontakte.py b/tests/scraper/vkontakte.py index ef7cfa1..8b0b757 100644 --- a/tests/scraper/vkontakte.py +++ b/tests/scraper/vkontakte.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import VkontakteScraper @@ -14,3 +16,10 @@ def test_scrape_vkontakte_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['vkontakte'])] controller.register_scraper(scraper = VkontakteScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_vkontakte_profile(channel_kwargs): + + scraper = VkontakteScraper() + channel = Channel(**channel_kwargs['vkontakte']) + scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/youtube.py b/tests/scraper/youtube.py index 9d14760..e987cb8 100644 --- a/tests/scraper/youtube.py +++ b/tests/scraper/youtube.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import YoutubeScraper @@ -14,3 +16,10 @@ def test_scrape_youtube_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['youtube'])] controller.register_scraper(scraper = YoutubeScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_youtube_profile(channel_kwargs): + + scraper = YoutubeScraper() + channel = Channel(**channel_kwargs['youtube']) + scraper.get_profile(channel=channel) \ No newline at end of file From b805d501329086a4fdf9000f4a83d38c73c5cb7c Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Tue, 29 Mar 2022 16:09:51 -0500 Subject: [PATCH 3/5] made tesets work, fixed several issues with Rumble scraper --- Pipfile.lock | 130 +++++++++++++++++++++++++++++---- cisticola/scraper/__init__.py | 2 +- cisticola/scraper/base.py | 5 +- cisticola/scraper/bitchute.py | 12 ++- cisticola/scraper/gab.py | 3 +- cisticola/scraper/gettr.py | 3 +- cisticola/scraper/instagram.py | 6 +- cisticola/scraper/odysee.py | 20 +++-- cisticola/scraper/rumble.py | 36 ++++----- cisticola/scraper/twitter.py | 14 ++-- cisticola/scraper/vkontakte.py | 5 +- cisticola/scraper/youtube.py | 3 +- 12 files changed, 180 insertions(+), 59 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index 79b1b1c..d78928c 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "26955249044f1cd4bb4504c14f00f0c50508192338026227fc7b889e9f4fc11c" + "sha256": "3fb247a6b9b76ed811db7636b02ad848365d38dadb0da6a27c090e559e5540ec" }, "pipfile-spec": 6, "requires": { @@ -34,19 +34,19 @@ }, "boto3": { "hashes": [ - "sha256:788aa3281e91413bc201268a251c9d4ca2e9deb3a4af74daea2389cf66e5132e", - "sha256:ca37b9b4ade72f6d4fa2b7bee584dd5b1c7585f07f22ff1edbc9ecc0c4173b1f" + "sha256:127ebdf58c8825b53f1eff111e08c49ffffeb1f6d7a5665c9907ce8128fe14b1", + "sha256:b7ce3bf013f0f60e40c2676d5a7b620ed927cfad0aa348a606b10e9a0387f249" ], "index": "pypi", - "version": "==1.21.28" + "version": "==1.21.29" }, "botocore": { "hashes": [ - "sha256:03c41d26d1e765380b8175d4b136d3144aa051f17a86eebfdf9a885a5a9a6a72", - "sha256:102eb24b44d473adea6bb8728b20fb9547fa5858c3293df7cad67ef17ea736a7" + "sha256:b467d64cd773dc4d49ef31b18a8dded554f284f799720bd12e989fe2138fd5b8", + "sha256:de87907d42682179946ddfa113b9334e3c4258404aef19edd8c92381ff54775c" ], "markers": "python_version >= '3.6'", - "version": "==1.24.28" + "version": "==1.24.29" }, "brotli": { "hashes": [ @@ -138,6 +138,61 @@ ], "version": "==2021.10.8" }, + "cffi": { + "hashes": [ + "sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3", + "sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2", + "sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636", + "sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20", + "sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728", + "sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27", + "sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66", + "sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443", + "sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0", + "sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7", + "sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39", + "sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605", + "sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a", + "sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37", + "sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029", + "sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139", + "sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc", + "sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df", + "sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14", + "sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880", + "sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2", + "sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a", + "sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e", + "sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474", + "sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024", + "sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8", + "sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0", + "sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e", + "sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a", + "sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e", + "sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032", + "sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6", + "sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e", + "sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b", + "sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e", + "sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954", + "sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962", + "sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c", + "sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4", + "sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55", + "sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962", + "sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023", + "sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c", + "sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6", + "sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8", + "sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382", + "sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7", + "sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc", + "sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997", + "sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796" + ], + "version": "==1.15.0" + }, "charset-normalizer": { "hashes": [ "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597", @@ -154,6 +209,46 @@ "markers": "python_version >= '3.7'", "version": "==8.1.0" }, + "cryptg": { + "hashes": [ + "sha256:02b31622a75a49a5dcd25e589c85faae54575f018e055bd21a17df97c8bb9095", + "sha256:0da1b367056e57a5c01d22608da0cd50e597b917c1b2d9631767aa3c0640a99a", + "sha256:135688c6fbda90748924c2cb047f63785ebf4397d81acc4a05357950653c5096", + "sha256:1fb6c6d4561a54406593197c1f5f23662ab320f4af4ab11834e1583e9d27a49a", + "sha256:2516557e89803637fa7342de43dbcc5f84bf68ae05b1064a354a62d423447d9f", + "sha256:29001dafd3d6a054365222b1f89b12876723c89cdd10aa0e5885a05dfd034eeb", + "sha256:2cc8115960e49a038091ffb2d09de59e0acbdc76de10d7d415b7671a06bae0a9", + "sha256:2cd8224eb64af756f45cdceab16d048494313db8acec1e38d75d97716082267b", + "sha256:307bf96a6ac9c87b44531d8da5fe3a6c5d856e1dc69b68136ef9c4fb66ad17ac", + "sha256:31cf7682de69022c9a77739cdcf7116b06522b128b9b51c7593f277f38c38dbf", + "sha256:3bc2f372dec3a7753c0c0d72c69fcbe44af5473f870a3406978e07e8560a1aa6", + "sha256:46960979542155c9d903656a3a39770061b09a3691a23296f06dc168fe4ff962", + "sha256:47ad5916be4558f4d674c12800e8d9663ce938b0046f19cdc869ba3a7ca280ec", + "sha256:5faed49d972c7f44ce4d6fa1a64169c85a11209fa1fbe1c8a333fb1454888725", + "sha256:695636cca0ee938bd7113658ee60bfaf89afa19708c40ecae5f4a222c2ec544a", + "sha256:6c5d66975fc59adca203fa91e2a104240457114468162d30e9213661239ac1d6", + "sha256:72a5485ece10a70160170ceb658b1836db82dccab08a1f7029c54d81cf6b1d43", + "sha256:7fc8e1893775c6f53dceda1959f19833cc27a67a80492c10e2415dc601b36650", + "sha256:890584db41c8e1e046ae40dee0074614470d36ebd6b7e57bb91303300066601f", + "sha256:a1fb178702730b59267f1e6c6dfe16c7bb9c1350cee4183221982ad2dba4e7f5", + "sha256:a4de1730ca56aa8a945f176c25586901ed5e9f15ffb70c6459eedf466eb6299b", + "sha256:b6352555e47f389ed502269bdb537233d0a928b12d9f4caa57e8c707151acd30", + "sha256:b8896394b72ff7dbf38072ad4c2cd59abdd9e388bb55e1c369102beb8e569f9d", + "sha256:bbd05b52d09e78bdc595f229c0481f4f2e1daf3959847322a6b2c1f76119305f", + "sha256:bf00943924cddb0838f8a65f5aae31f6fe2ad64a5d7e6f10a6b900b3f01b0ae0", + "sha256:bf15aae0fa01aeec728ab16b920cf4c6b2793099c71f62f30ff100d6fe8c9859", + "sha256:c09a5b14494532fc3226f5c5f57ef2a651c935ed6a1d2d0f9eff110046725524", + "sha256:c4812802ce4cd6f08189ce0fa8b79e9a96ac941e69e6b3032bb6908baefde2ba", + "sha256:c69c1e19884108e508697919de0cd43e2ca4e9af418962aa235273b3c51a0e37", + "sha256:ce08c04ebb06ce1ac417597c1bb514a3c1b36cf5c286b8c60f23df2e65703bf3", + "sha256:e29b0d944176cf88fe52d1c58f46017b5bddc9cc54ec0fc6fac20043febefc32", + "sha256:e48ab84e0ed364436d5e449c59762c5963f08ad87f6508f4cb7644745b5559a8", + "sha256:eff15f0a1eee678dd9ec747b58ce86edb78b608036ac4e02d8349f5f35202495", + "sha256:fdd62c2be23eeabb9ebd2ad41bf153f5ec48b968885ef14e676515407cd56339" + ], + "index": "pypi", + "version": "==0.2.post4" + }, "dateparser": { "hashes": [ "sha256:038196b1f12c7397e38aad3d61588833257f6f552baa63a1499e6987fa8d42d9", @@ -394,7 +489,7 @@ "sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1", "sha256:9c9f243fcec7f410f138cb12c21c84c64fde4195481a30c9bfb05b5f003adfed" ], - "markers": "python_version >= '3.5' and python_version < '4.0'", + "markers": "python_version >= '3.5' and python_version < '4'", "version": "==1.45.1" }, "numpy": { @@ -517,7 +612,7 @@ }, "polyphemus": { "git": "https://github.com/bellingcat/polyphemus.git", - "ref": "c85dea215ae720e3df71d2ed1aaa82f7b8a6a2ed" + "ref": "00a5123a3768a55ffe29f2c803a4181895f17890" }, "py": { "hashes": [ @@ -569,6 +664,13 @@ ], "version": "==0.2.8" }, + "pycparser": { + "hashes": [ + "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9", + "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206" + ], + "version": "==2.21" + }, "pycryptodomex": { "hashes": [ "sha256:1ca8e1b4c62038bb2da55451385246f51f412c5f5eabd64812c01766a5989b4a", @@ -742,7 +844,6 @@ "version": "==2022.3.2" }, "requests": { - "extras": [], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" @@ -763,7 +864,7 @@ "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb" ], - "markers": "python_version >= '3.6' and python_version < '4.0'", + "markers": "python_version >= '3.6' and python_version < '4'", "version": "==4.8" }, "s3transfer": { @@ -784,7 +885,7 @@ }, "snscrape": { "git": "https://github.com/bellingcat/snscrape.git", - "ref": "de4ebed81f3f6a4bb4c65630daab6ec63784959b" + "ref": "fb8d73ac95011b7ad848a6048d3eed1880e80f21" }, "soupsieve": { "hashes": [ @@ -872,7 +973,7 @@ "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", "version": "==1.26.9" }, "websockets": { @@ -1259,7 +1360,6 @@ "version": "==2022.1" }, "requests": { - "extras": [], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" @@ -1359,7 +1459,7 @@ "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", "version": "==1.26.9" }, "zipp": { diff --git a/cisticola/scraper/__init__.py b/cisticola/scraper/__init__.py index 36e6cd5..8796633 100644 --- a/cisticola/scraper/__init__.py +++ b/cisticola/scraper/__init__.py @@ -1,5 +1,5 @@ from cisticola.utils import make_request -from .base import Scraper, ScraperController +from .base import Scraper, ScraperController, ChannelDoesNotExistError from .bitchute import BitchuteScraper from .gab import GabScraper from .gettr import GettrScraper diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index c887ee1..023fa3c 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -412,4 +412,7 @@ class ScraperController: """ mapper_registry.metadata.drop_all(bind=self.engine) - self.connect_to_db(self.engine) \ No newline at end of file + self.connect_to_db(self.engine) + +class ChannelDoesNotExistError(Exception): + """The specified channel does not exist or has been deleted.""" \ No newline at end of file diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index d8d3f0b..e9a9770 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -58,7 +58,8 @@ class BitchuteScraper(Scraper): date=datetime.fromtimestamp(post['timestamp']), date_archived=datetime.now(timezone.utc), raw_data=json.dumps(post), - archived_urls=archived_urls) + archived_urls=archived_urls, + media_archived=archive_media) def can_handle(self, channel): if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None: @@ -88,14 +89,19 @@ class BitchuteScraper(Scraper): response = session.post(canonical_url + 'counts/', data = data, headers = headers) counts = json.loads(response.text) + owner_soup = soup.find('p', {'class' : 'owner'}) + if owner_soup.text == '[email\xa0protected]': + owner_name = decode_cfemail(owner_soup.find('span', {'class': "__cf_email__"})['data-cfemail']) + else: + owner_name = owner_soup.text + profile = { 'description' : description_soup.text.strip(), 'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)], 'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')), 'videos' : int(info_list[1].text.split('videos')[0].strip()), 'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'], - 'owner_name' : decode_cfemail(soup.find('p', {'class' : 'owner'}).find('span', {'class': "__cf_email__"})['data-cfemail']), - 'category' : info_list[-1].text.split('Category')[1].strip(), + 'owner_name' : owner_name, 'image' : about_soup.find('img', {'alt' : 'Channel Image'}).get('data-src'), 'subscribers': counts['subscriber_count'], 'views': int(counts['about_view_count'].split(' ')[0])} diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index aa698ad..126f500 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -49,7 +49,8 @@ class GabScraper(Scraper): date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), raw_data=json.dumps(post), - archived_urls=archived_urls) + archived_urls=archived_urls, + media_archived=archive_media) def can_handle(self, channel: Channel) -> bool: if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None: diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index 5e8d7ac..c8e63f9 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -59,7 +59,8 @@ class GettrScraper(Scraper): date=datetime.fromtimestamp(post['cdate']/1000.), date_archived=datetime.now(timezone.utc), raw_data=json.dumps(post), - archived_urls=archived_urls) + archived_urls=archived_urls, + media_archived=archive_media) def can_handle(self, channel): if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None: diff --git a/cisticola/scraper/instagram.py b/cisticola/scraper/instagram.py index a5613a4..f045011 100644 --- a/cisticola/scraper/instagram.py +++ b/cisticola/scraper/instagram.py @@ -80,7 +80,8 @@ class InstagramScraper(Scraper): date=post.date_utc, date_archived=datetime.now(timezone.utc), raw_data=json.dumps(post._asdict(), default=str), - archived_urls=archived_urls) + archived_urls=archived_urls, + media_archived=archive_media) for comment in post.get_comments(): @@ -96,7 +97,8 @@ class InstagramScraper(Scraper): date=comment.created_at_utc, date_archived=datetime.now(timezone.utc), raw_data=json.dumps(comment_dict, default=str), - archived_urls={}) + archived_urls={}, + media_archived=archive_media) def can_handle(self, channel): if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None: diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index 980653c..4ff80e0 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -3,9 +3,11 @@ import json from typing import Generator from urllib.parse import urlparse -from polyphemus.base import OdyseeChannel import requests +from loguru import logger +from polyphemus.base import OdyseeChannel +from polyphemus.api import get_auth_token from cisticola.base import Channel, ScraperResult from cisticola.scraper.base import Scraper @@ -13,6 +15,10 @@ class OdyseeScraper(Scraper): """An implementation of a Scraper for Odysee, using polyphemus library""" __version__ = "OdyseeScraper 0.0.1" + def __init__(self): + super().__init__() + self.auth_token = get_auth_token() + def get_username_from_url(self, url): username = url.split('odysee.com/')[-1].strip('@').split(':')[0] @@ -22,12 +28,12 @@ class OdyseeScraper(Scraper): def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: username = self.get_username_from_url(channel.url) - odysee_channel = OdyseeChannel(channel_name = username) + odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token) all_videos = odysee_channel.get_all_videos() for video in all_videos: - if since is not None and datetime.fromtimestamp(video['created']) <= since.date: + if since is not None and datetime.fromtimestamp(video.info['created']) <= since.date: break archived_urls = {} @@ -55,7 +61,8 @@ class OdyseeScraper(Scraper): date=datetime.fromtimestamp(video.info['created']), date_archived=datetime.now(timezone.utc), raw_data=json.dumps(video.info), - archived_urls=archived_urls) + archived_urls=archived_urls, + media_archived=archive_media) for comment in all_comments: @@ -67,7 +74,8 @@ class OdyseeScraper(Scraper): date=datetime.fromtimestamp(comment.info['created']), date_archived=datetime.now(), raw_data=json.dumps(comment.info), - archived_urls={}) + archived_urls={}, + media_archived=True) def can_handle(self, channel): if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None: @@ -82,7 +90,7 @@ class OdyseeScraper(Scraper): def get_profile(self, channel: Channel) -> dict: username = self.get_username_from_url(channel.url) - odysee_channel = OdyseeChannel(channel_name = username) + odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token) profile = odysee_channel.info return profile \ No newline at end of file diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index 32e40e8..39a29ba 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -14,18 +14,12 @@ class RumbleScraper(Scraper): """An implementation of a Scraper for Rumble, using custom functions""" __version__ = "RumbleScraper 0.0.1" - def get_username_from_url(self, url): - username = url.split('https://rumble.com/c/')[1] - - return username - def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: - username = self.get_username_from_url(channel.url) - scraper = get_channel_videos(username) + scraper = get_channel_videos(channel.url) for post in scraper: - if since is not None and datetime.fromtimestamp(post['cdate']*0.001) <= since.date: + if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc): break archived_urls = {} @@ -43,10 +37,11 @@ class RumbleScraper(Scraper): platform="Rumble", channel=channel.id, platform_id=post['media_url'].split('/')[-2], - date=datetime.fromisoformat(post['datetime']).replace(tzinfo=timezone.utc), + date=post['datetime'].replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), - raw_data=json.dumps(post), - archived_urls=archived_urls) + raw_data=json.dumps(post, default = str), + archived_urls=archived_urls, + media_archived=archive_media) def url_to_key(self, url: str, content_type: str) -> str: ext = '.' + content_type.split('/')[-1] @@ -54,13 +49,12 @@ class RumbleScraper(Scraper): return key def can_handle(self, channel): - if channel.platform == "Rumble" and self.get_username_from_url(channel.url) is not None: + if channel.platform == "Rumble" and channel.url is not None: return True def get_profile(self, channel: Channel) -> dict: - username = self.get_username_from_url(channel.url) - profile = get_channel_profile(username = username) + profile = get_channel_profile(url = channel.url) return profile @@ -69,7 +63,7 @@ class RumbleScraper(Scraper): def get_media_url(url): r = make_request(url = url) - soup = BeautifulSoup(r.content, features = 'lxml') + soup = BeautifulSoup(r.content, features = 'html.parser') script = json.loads(''.join(soup.find('script', {'type':'application/ld+json'}).text)) media_url = script[0]['embedUrl'] @@ -91,16 +85,16 @@ def process_video(video): 'views' : video.find('span', {'class' : 'video-item--views'})['data-value'], 'rumbles' : rumbles, 'duration' : video.find('span', {'class' : 'video-item--duration'})['data-value'], - 'datetime' : video.find('time')['datetime']} + 'datetime' : datetime.fromisoformat(video.find('time')['datetime'])} info['media_url'] = get_media_url(info['link']) return info -def get_channel_videos(username): +def get_channel_videos(url): page = 1 - channel_url = f'{BASE_URL}/c/{username}?page=' + channel_url = f'{url}?page=' while True: url = channel_url + str(page) @@ -118,9 +112,9 @@ def get_channel_videos(username): page += 1 -def get_channel_profile(username): +def get_channel_profile(url): - channel_url = f'{BASE_URL}/c/{username}' + channel_url = f'{url}' r = make_request(url = channel_url) soup = BeautifulSoup(r.content, features = 'lxml') @@ -133,7 +127,7 @@ def get_channel_profile(username): 'verified': verified_svg is not None, 'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None, 'cover': cover_soup.get('src') if cover_soup else None, - 'subscribers': int(soup.find('span', {'class' : 'subscribe-button-count'}).text)} + 'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text} return profile #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index 126d75d..cc0afb9 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -6,7 +6,7 @@ from snscrape.modules.twitter import TwitterProfileScraper, TwitterUserScraper, from loguru import logger from cisticola.base import Channel, ScraperResult -from cisticola.scraper.base import Scraper +from cisticola.scraper.base import Scraper, ChannelDoesNotExistError class TwitterScraper(Scraper): """An implementation of a Scraper for Twitter, using snscrape library""" @@ -67,7 +67,8 @@ class TwitterScraper(Scraper): date=tweet.date, date_archived=datetime.now(timezone.utc), raw_data=tweet.json(), - archived_urls=archived_urls) + archived_urls=archived_urls, + media_archived=archive_media) def can_handle(self, channel): if channel.platform == "Twitter" and channel.platform_id: @@ -92,7 +93,10 @@ class TwitterScraper(Scraper): def get_profile(self, channel: Channel) -> dict: - scraper = TwitterUserScraper(channel.platform_id) + scraper = TwitterUserScraper(channel.screenname) + entity = scraper._get_entity() - profile = scraper._get_entity().__dict__ - return profile \ No newline at end of file + if entity is None: + raise ChannelDoesNotExistError(channel.url) + else: + return entity.__dict__ \ No newline at end of file diff --git a/cisticola/scraper/vkontakte.py b/cisticola/scraper/vkontakte.py index 78dfe98..97724c6 100644 --- a/cisticola/scraper/vkontakte.py +++ b/cisticola/scraper/vkontakte.py @@ -25,7 +25,7 @@ class VkontakteScraper(Scraper): first = True for post in scraper.get_items(): - if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc): + if since is not None and datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc): # with VKontakteUserScraper, the first tweet could be an old pinned tweet if first: first = False @@ -63,7 +63,8 @@ class VkontakteScraper(Scraper): date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), raw_data=post.json(), - archived_urls=archived_urls) + archived_urls=archived_urls, + media_archived=archive_media) def can_handle(self, channel): if channel.platform == "Vkontakte" and channel.platform_id: diff --git a/cisticola/scraper/youtube.py b/cisticola/scraper/youtube.py index 88c75a4..1e2346b 100644 --- a/cisticola/scraper/youtube.py +++ b/cisticola/scraper/youtube.py @@ -72,7 +72,8 @@ class YoutubeScraper(Scraper): date=datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc), date_archived=datetime.now(timezone.utc), raw_data=json.dumps(video, default = str), - archived_urls=archived_urls) + archived_urls=archived_urls, + media_archived=archive_media) def can_handle(self, channel): if channel.platform == "Youtube" and channel.url: From 1f99e524365f619c90410515b47eaeed2190721f Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Wed, 30 Mar 2022 08:05:10 -0500 Subject: [PATCH 4/5] refactored Gab scraper to use gabber instead of garc --- Pipfile | 2 +- Pipfile.lock | 76 ++++++++---------------------- cisticola/scraper/gab.py | 51 +++++++++++++++----- docs/source/quickstart.rst | 8 +++- pytest.ini | 2 + tests/scraper/bitchute.py | 1 + tests/scraper/gab.py | 1 + tests/scraper/gettr.py | 1 + tests/scraper/instagram.py | 1 + tests/scraper/odysee.py | 1 + tests/scraper/rumble.py | 1 + tests/scraper/telegram_snscrape.py | 1 + tests/scraper/telegram_telethon.py | 1 + tests/scraper/twitter.py | 1 + tests/scraper/vkontakte.py | 1 + tests/scraper/youtube.py | 1 + tests/transformer/twitter.py | 3 ++ 17 files changed, 82 insertions(+), 71 deletions(-) diff --git a/Pipfile b/Pipfile index 3be4b27..d57b5a9 100644 --- a/Pipfile +++ b/Pipfile @@ -14,7 +14,6 @@ boto3 = "*" snscrape = {git = "https://github.com/bellingcat/snscrape.git"} ffmpeg-python = "*" polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"} -garc = "*" yt-dlp = "*" telethon = "*" pytesseract = "*" @@ -22,6 +21,7 @@ pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"} instaloader = "*" gspread = "*" cryptg = "*" +gabber = {git = "https://github.com/stanfordio/gabber.git"} [dev-packages] pytest = "*" diff --git a/Pipfile.lock b/Pipfile.lock index d78928c..bb0e2a2 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "3fb247a6b9b76ed811db7636b02ad848365d38dadb0da6a27c090e559e5540ec" + "sha256": "b712e767d64e54e83e8c2d8a27a68203583ed7ad31d4ea3b4b6076a72a2150fd" }, "pipfile-spec": 6, "requires": { @@ -16,14 +16,6 @@ ] }, "default": { - "attrs": { - "hashes": [ - "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4", - "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==21.4.0" - }, "beautifulsoup4": { "hashes": [ "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf", @@ -280,12 +272,9 @@ "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==0.18.2" }, - "garc": { - "hashes": [ - "sha256:6f1da8ccdb30b165b8d9247314b73d1002f60381480e61fdbf108dc9abf3c216" - ], - "index": "pypi", - "version": "==2.1" + "gabber": { + "git": "https://github.com/stanfordio/gabber.git", + "ref": "d80c44c488ad4e087ba4c8f033802fe2071843bd" }, "gogettr": { "hashes": [ @@ -387,13 +376,6 @@ "markers": "python_version >= '3'", "version": "==3.3" }, - "iniconfig": { - "hashes": [ - "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", - "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32" - ], - "version": "==1.1.1" - }, "instaloader": { "hashes": [ "sha256:7fa6147810eedcc1dedcdec8cfa1f220c9379ab8faeab6a336a7c181d944e2e4" @@ -411,11 +393,11 @@ }, "loguru": { "hashes": [ - "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c", - "sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3" + "sha256:b28e72ac7a98be3d28ad28570299a393dfcd32e5e3f6a353dec94675767b6319", + "sha256:f8087ac396b5ee5f67c963b495d615ebbceac2796379599820e324419d53667c" ], "index": "pypi", - "version": "==0.6.0" + "version": "==0.5.3" }, "lxml": { "hashes": [ @@ -602,26 +584,10 @@ "markers": "python_version >= '3.7'", "version": "==9.0.1" }, - "pluggy": { - "hashes": [ - "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159", - "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3" - ], - "markers": "python_version >= '3.6'", - "version": "==1.0.0" - }, "polyphemus": { "git": "https://github.com/bellingcat/polyphemus.git", "ref": "00a5123a3768a55ffe29f2c803a4181895f17890" }, - "py": { - "hashes": [ - "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719", - "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==1.11.0" - }, "pyaes": { "hashes": [ "sha256:02c1b1405c38d3c370b085fb952dd8bea3fadcee6411ad99f312cc129c536d8f" @@ -732,14 +698,6 @@ "index": "pypi", "version": "==0.3.9" }, - "pytest": { - "hashes": [ - "sha256:841132caef6b1ad17a9afde46dc4f6cfa59a05f9555aae5151f73bdf2820ca63", - "sha256:92f723789a8fdd7180b6b06483874feca4c48a5c76968e03bb3e7f806a1869ea" - ], - "markers": "python_version >= '3.7'", - "version": "==7.1.1" - }, "python-dateutil": { "hashes": [ "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", @@ -763,6 +721,12 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", "version": "==0.1.0.post0" }, + "ratelimit": { + "hashes": [ + "sha256:af8a9b64b821529aca09ebaf6d8d279100d766f19e90b5059ac6a718ca6dee42" + ], + "version": "==2.2.1" + }, "regex": { "hashes": [ "sha256:0008650041531d0eadecc96a73d37c2dc4821cf51b0766e374cb4f1ddc4e1c14", @@ -944,13 +908,13 @@ "index": "pypi", "version": "==1.24.0" }, - "tomli": { + "tqdm": { "hashes": [ - "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", - "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" + "sha256:4230a49119a416c88cc47d0d2d32d5d90f1a282d5e497d49801950704e49863d", + "sha256:6461b009d6792008d0000e1b0c7ca50195ec78c0e808a3a6b668a56a3236c3a5" ], - "markers": "python_version >= '3.7'", - "version": "==2.0.1" + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==4.63.1" }, "tzdata": { "hashes": [ @@ -1325,7 +1289,7 @@ "sha256:841132caef6b1ad17a9afde46dc4f6cfa59a05f9555aae5151f73bdf2820ca63", "sha256:92f723789a8fdd7180b6b06483874feca4c48a5c76968e03bb3e7f806a1869ea" ], - "markers": "python_version >= '3.7'", + "index": "pypi", "version": "==7.1.1" }, "pytest-cov": { @@ -1443,7 +1407,7 @@ "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" ], - "markers": "python_version >= '3.7'", + "markers": "python_full_version < '3.11.0'", "version": "==2.0.1" }, "typing-extensions": { diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index 126f500..2307ca5 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -1,15 +1,16 @@ -from datetime import datetime, timezone +from datetime import datetime, timezone, date import json from typing import Generator +import os -from garc import Garc +from gabber.client import Client, GAB_API_BASE_URL from cisticola.base import Channel, ScraperResult from cisticola.scraper.base import Scraper class GabScraper(Scraper): - """An implementation of a Scraper for Gab, using GARC library""" - __version__ = "GabScraper 0.0.1" + """An implementation of a Scraper for Gab, using gabber library""" + __version__ = "GabScraper 0.0.2" def get_username_from_url(self, url): username = url.split('https://gab.com/')[-1] @@ -17,13 +18,23 @@ class GabScraper(Scraper): return username def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: - client = Garc(profile = 'main') + client = Client( + username = os.environ['GAB_USER'], + password = os.environ['GAB_PASS'], + threads = 25) + username = self.get_username_from_url(channel.url) - scraper = client.userposts(username) + result = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json() + user_id = int(result['id']) + + scraper = client.pull_statuses( + id = user_id, + created_after = date.min, + replies = False) for post in scraper: - if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")) <= since.date: + if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): break media_urls = [] @@ -31,10 +42,18 @@ class GabScraper(Scraper): if archive_media: - media_urls.extend([p['url'] for p in post['media_attachments']]) - - if post.get('repost') is not None: - media_urls.extend([p['url'] for p in post['repost']['media_attachments']]) + for attachment in post.get('media_attachments'): + if attachment.get('type') == 'video': + media_urls.append(attachment['source_mp4']) + else: + media_urls.append(attachment['url']) + + if post.get('reblog') is not None: + for attachment in post['reblog'].get('media_attachments'): + if attachment.get('type') == 'video': + media_urls.append(attachment['source_mp4']) + else: + media_urls.append(attachment['url']) for url in media_urls: media_blob, content_type, key = self.url_to_blob(url) @@ -57,8 +76,14 @@ class GabScraper(Scraper): return True def get_profile(self, channel: Channel) -> dict: - client = Garc(profile = 'main') + + client = Client( + username = os.environ['GAB_USER'], + password = os.environ['GAB_PASS'], + threads = 25) + username = self.get_username_from_url(channel.url) - profile = list(client.user(username))[0] + + profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json() return profile \ No newline at end of file diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index a6c5643..4dd87ce 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -75,12 +75,18 @@ For developers, if changes are made to the package structure or additional modul Testing ------- -The *cisticola* application uses pytest_ for unit testing. To run the test suite, run the following command from the package root directory: +The *cisticola* application uses pytest_ for unit testing. To run the full test suite, run the following command from the package root directory: .. code-block:: pipenv run pytest +To run the test suite without archiving media (which can take a long time), run the following command from the package root directory: + +.. code-block:: + + pipenv run pytest -m "not media" + Examples -------- diff --git a/pytest.ini b/pytest.ini index 8d9973f..744f87d 100644 --- a/pytest.ini +++ b/pytest.ini @@ -14,6 +14,8 @@ addopts = markers = profile: marks tests for only extracting channel metadata (deselect with '-m "not profile"') + media: marks tests for archiving all media attachments (deselect with '-m + "not media"') filterwarnings = ignore:the imp module is deprecated:DeprecationWarning ignore:The localize method is no longer necessary, as this time zone supports the fold attribute diff --git a/tests/scraper/bitchute.py b/tests/scraper/bitchute.py index 687a6b0..94707ec 100644 --- a/tests/scraper/bitchute.py +++ b/tests/scraper/bitchute.py @@ -9,6 +9,7 @@ def test_scrape_bitchute_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = BitchuteScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_bitchute_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/gab.py b/tests/scraper/gab.py index 943f40f..ed9d32a 100644 --- a/tests/scraper/gab.py +++ b/tests/scraper/gab.py @@ -9,6 +9,7 @@ def test_scrape_gab_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = GabScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_gab_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/gettr.py b/tests/scraper/gettr.py index 6a3b70e..81a8bb8 100644 --- a/tests/scraper/gettr.py +++ b/tests/scraper/gettr.py @@ -9,6 +9,7 @@ def test_scrape_gettr_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = GettrScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_gettr_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/instagram.py b/tests/scraper/instagram.py index 840d6fa..98a0684 100644 --- a/tests/scraper/instagram.py +++ b/tests/scraper/instagram.py @@ -9,6 +9,7 @@ def test_scrape_instagram_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = InstagramScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_instagram_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/odysee.py b/tests/scraper/odysee.py index 8eba07d..84a45f8 100644 --- a/tests/scraper/odysee.py +++ b/tests/scraper/odysee.py @@ -9,6 +9,7 @@ def test_scrape_odysee_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = OdyseeScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_odysee_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/rumble.py b/tests/scraper/rumble.py index f64b24f..18c8749 100644 --- a/tests/scraper/rumble.py +++ b/tests/scraper/rumble.py @@ -9,6 +9,7 @@ def test_scrape_rumble_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = RumbleScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_rumble_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/telegram_snscrape.py b/tests/scraper/telegram_snscrape.py index 420b917..dbaed43 100644 --- a/tests/scraper/telegram_snscrape.py +++ b/tests/scraper/telegram_snscrape.py @@ -9,6 +9,7 @@ def test_scrape_telegram_snscrape_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = TelegramSnscrapeScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_telegram_snscrape_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/telegram_telethon.py b/tests/scraper/telegram_telethon.py index 1942fca..c6fb399 100644 --- a/tests/scraper/telegram_telethon.py +++ b/tests/scraper/telegram_telethon.py @@ -9,6 +9,7 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = TelegramTelethonScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_telegram_telethon_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/twitter.py b/tests/scraper/twitter.py index 7512b6a..97765aa 100644 --- a/tests/scraper/twitter.py +++ b/tests/scraper/twitter.py @@ -9,6 +9,7 @@ def test_scrape_twitter_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = TwitterScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_twitter_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/vkontakte.py b/tests/scraper/vkontakte.py index 8b0b757..4209c30 100644 --- a/tests/scraper/vkontakte.py +++ b/tests/scraper/vkontakte.py @@ -9,6 +9,7 @@ def test_scrape_vkontakte_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = VkontakteScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_vkontakte_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/youtube.py b/tests/scraper/youtube.py index e987cb8..1750b08 100644 --- a/tests/scraper/youtube.py +++ b/tests/scraper/youtube.py @@ -9,6 +9,7 @@ def test_scrape_youtube_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = YoutubeScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_youtube_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/transformer/twitter.py b/tests/transformer/twitter.py index fd95bbe..3c50d1c 100644 --- a/tests/transformer/twitter.py +++ b/tests/transformer/twitter.py @@ -1,11 +1,14 @@ from sqlalchemy.orm import sessionmaker, with_polymorphic import json +import pytest + from cisticola.base import Channel from cisticola.scraper import TwitterScraper from cisticola.transformer import TwitterTransformer from cisticola.base import Post, Media +@pytest.mark.media def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs): controller.reset_db() From b7871b060dc9dc843dfb50c77daf3c5e0d5fe7b5 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Wed, 30 Mar 2022 09:11:07 -0500 Subject: [PATCH 5/5] added capability to scrape Gab group posts --- cisticola/scraper/gab.py | 38 +++++++++++++++++++++++++++++--------- tests/conftest.py | 14 ++++++++++++++ tests/scraper/gab.py | 22 ++++++++++++++++++++++ 3 files changed, 65 insertions(+), 9 deletions(-) diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index 2307ca5..d1b6fbb 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -17,21 +17,34 @@ class GabScraper(Scraper): return username + def get_group_id_from_url(self, url): + group_id = int(url.split('/')[-1]) + + return group_id + def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: client = Client( username = os.environ['GAB_USER'], password = os.environ['GAB_PASS'], threads = 25) - username = self.get_username_from_url(channel.url) + if channel.url.split('/')[-2] == 'groups': - result = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json() - user_id = int(result['id']) + group_id = self.get_group_id_from_url(url = channel.url) + scraper = client.pull_group_posts( + id = group_id, + depth = float('inf')) + else: - scraper = client.pull_statuses( - id = user_id, - created_after = date.min, - replies = False) + username = self.get_username_from_url(channel.url) + + result = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json() + user_id = int(result['id']) + + scraper = client.pull_statuses( + id = user_id, + created_after = date.min, + replies = False) for post in scraper: if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): @@ -82,8 +95,15 @@ class GabScraper(Scraper): password = os.environ['GAB_PASS'], threads = 25) - username = self.get_username_from_url(channel.url) + if channel.url.split('/')[-2] == 'groups': - profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json() + group_id = self.get_group_id_from_url(url = channel.url) + profile = client.pull_group(id = group_id) + + else: + + username = self.get_username_from_url(channel.url) + + profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json() return profile \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 962fbed..684c15d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,6 +33,19 @@ GAB_CHANNEL_KWARGS = { 'chat': False, 'notes': ''} +GAB_GROUP_KWARGS = { + 'name': 'iran group (test)', + 'platform_id': 10001, + 'category': 'test', + 'platform': 'Gab', + 'url': 'https://gab.com/groups/10001', + 'screenname': 'iran group', + 'country': 'IR', + 'influencer': None, + 'public': True, + 'chat': True, + 'notes': ''} + GETTR_CHANNEL_KWARGS = { 'name': 'LizardRepublic (test)', 'platform_id': 'lizardrepublic', @@ -178,6 +191,7 @@ def channel_kwargs(): return { 'bitchute' : BITCHUTE_CHANNEL_KWARGS, 'gab' : GAB_CHANNEL_KWARGS, + 'gab_group' : GAB_GROUP_KWARGS, 'gettr' : GETTR_CHANNEL_KWARGS, 'instagram' : INSTAGRAM_CHANNEL_KWARGS, 'odysee' : ODYSEE_CHANNEL_KWARGS, diff --git a/tests/scraper/gab.py b/tests/scraper/gab.py index ed9d32a..d600429 100644 --- a/tests/scraper/gab.py +++ b/tests/scraper/gab.py @@ -23,4 +23,26 @@ def test_scrape_gab_profile(channel_kwargs): scraper = GabScraper() channel = Channel(**channel_kwargs['gab']) + scraper.get_profile(channel=channel) + +def test_scrape_gab_group_no_media(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['gab_group'])] + controller.register_scraper(scraper = GabScraper()) + controller.scrape_channels(channels = channels, archive_media = False) + +@pytest.mark.media +def test_scrape_gab_group(controller, channel_kwargs): + + controller.reset_db() + + channels = [Channel(**channel_kwargs['gab_group'])] + controller.register_scraper(scraper = GabScraper()) + controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_gab_group_profile(channel_kwargs): + + scraper = GabScraper() + channel = Channel(**channel_kwargs['gab_group']) scraper.get_profile(channel=channel) \ No newline at end of file