Merge pull request #3 from bellingcat/media

Expanding media archiving, implementing Odysee scraper
This commit is contained in:
Logan Williams
2022-03-07 11:47:08 +01:00
committed by GitHub
29 changed files with 1358 additions and 282 deletions

16
.gitignore vendored
View File

@@ -1,7 +1,17 @@
.DS_Store
# Sphinx documentation
docs/build/
docs/source/_*
# Miscellaneous files
**/.DS_Store
*.pyc
*.ipynb
*.db
docs/build/
docs/source/_*
.env
# Unit test / coverage reports
reports
.coverage
.cache
.pytest_cache/
cover/

View File

@@ -13,8 +13,16 @@ dateparser = "*"
sphinx = "*"
boto3 = "*"
snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
ffmpeg-python = "*"
polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"}
garc = "*"
youtube-dl = "*"
[dev-packages]
pytest = "*"
pytest-cov = "*"
pytest-html = "*"
pytest-metadata = "*"
[requires]
python_version = "3.9"

454
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "d3ee112521273c2b0b9df074b4eb9a20649a2854bfffa433171749019acf8561"
"sha256": "ea2a1f1dff68fa0bd30dab06553e913f467c3b1399388b97f0ed913ab74c6e85"
},
"pipfile-spec": 6,
"requires": {
@@ -23,6 +23,14 @@
],
"version": "==0.7.12"
},
"attrs": {
"hashes": [
"sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",
"sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==21.4.0"
},
"babel": {
"hashes": [
"sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9",
@@ -41,19 +49,19 @@
},
"boto3": {
"hashes": [
"sha256:0e8d4d814f94031947035a4c2bb2c23832d5de941a6a492fb85794a02bafc44d",
"sha256:95d9b5b6fe3383fbf8f33d58f62258d3b3ea138d4369017031339b60fd5b8887"
"sha256:75709628320cea8ce137975dc33b75213c2e4f6e7cd09e55290de7245e2c79e2",
"sha256:c92ec20a670721b5a1bc013b305a84db2b7f9c716653b3056ce7e2fbd2a180ef"
],
"index": "pypi",
"version": "==1.21.6"
"version": "==1.21.12"
},
"botocore": {
"hashes": [
"sha256:359b9ea3870a1f8264113cb0b1216baa94bf1e8cee5d5d8af63a2e7ca6e7b33c",
"sha256:69aaa5a78ac7371f573e463be51fb962213c42fab08ef82380e84b9a87386949"
"sha256:0174999a04b0a2e42457106093ace9b36fa94772a442d9bcf60750263d1d073e",
"sha256:0cd7395311a3fef4aad8df8f511b4f7d221c24ae30934bd5c03458b0fc096d0c"
],
"markers": "python_version >= '3.6'",
"version": "==1.24.6"
"version": "==1.24.12"
},
"bs4": {
"hashes": [
@@ -101,6 +109,14 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==0.17.1"
},
"ffmpeg-python": {
"hashes": [
"sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127",
"sha256:ac441a0404e053f8b6a1113a77c0f452f1cfc62f6344a769475ffdc0f56c23c5"
],
"index": "pypi",
"version": "==0.2.0"
},
"filelock": {
"hashes": [
"sha256:9cd540a9352e432c7246a48fe4e8712b10acb1df2ad1f30e8c070b82ae1fed85",
@@ -109,6 +125,20 @@
"markers": "python_version >= '3.7'",
"version": "==3.6.0"
},
"future": {
"hashes": [
"sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.18.2"
},
"garc": {
"hashes": [
"sha256:6f1da8ccdb30b165b8d9247314b73d1002f60381480e61fdbf108dc9abf3c216"
],
"index": "pypi",
"version": "==2.1"
},
"gogettr": {
"hashes": [
"sha256:9f5c90e3b1befe6eb561d4bca9ca124faddbe5787d8b429f02703c68dd51d255",
@@ -175,7 +205,7 @@
"sha256:fa877ca7f6b48054f847b61d6fa7bed5cebb663ebc55e018fda12db09dcc664c",
"sha256:fdcec0b8399108577ec290f55551d926d9a1fa6cad45882093a7a07ac5ec147b"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"markers": "python_version >= '3' and platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32')))))",
"version": "==1.1.2"
},
"idna": {
@@ -196,11 +226,18 @@
},
"importlib-metadata": {
"hashes": [
"sha256:175f4ee440a0317f6e8d81b7f8d4869f93316170a65ad2b007d2929186c8052c",
"sha256:e0bc84ff355328a4adfc5240c4f211e0ab386f80aa640d1b11f0618a1d282094"
"sha256:b36ffa925fe3139b2f6ff11d6925ffd4fa7bc47870165e3ac260ac7b4f91e6ac",
"sha256:d16e8c1deb60de41b8e8ed21c1a7b947b0bc62fab7e1d470bcdf331cea2e6735"
],
"markers": "python_version < '3.10'",
"version": "==4.11.1"
"version": "==4.11.2"
},
"iniconfig": {
"hashes": [
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
"sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
],
"version": "==1.1.1"
},
"jinja2": {
"hashes": [
@@ -339,6 +376,31 @@
"markers": "python_version >= '3.7'",
"version": "==2.1.0"
},
"numpy": {
"hashes": [
"sha256:03ae5850619abb34a879d5f2d4bb4dcd025d6d8fb72f5e461dae84edccfe129f",
"sha256:076aee5a3763d41da6bef9565fdf3cb987606f567cd8b104aded2b38b7b47abf",
"sha256:0b536b6840e84c1c6a410f3a5aa727821e6108f3454d81a5cd5900999ef04f89",
"sha256:15efb7b93806d438e3bc590ca8ef2f953b0ce4f86f337ef4559d31ec6cf9d7dd",
"sha256:168259b1b184aa83a514f307352c25c56af111c269ffc109d9704e81f72e764b",
"sha256:2638389562bda1635b564490d76713695ff497242a83d9b684d27bb4a6cc9d7a",
"sha256:3556c5550de40027d3121ebbb170f61bbe19eb639c7ad0c7b482cd9b560cd23b",
"sha256:4a176959b6e7e00b5a0d6f549a479f869829bfd8150282c590deee6d099bbb6e",
"sha256:515a8b6edbb904594685da6e176ac9fbea8f73a5ebae947281de6613e27f1956",
"sha256:55535c7c2f61e2b2fc817c5cbe1af7cb907c7f011e46ae0a52caa4be1f19afe2",
"sha256:59153979d60f5bfe9e4c00e401e24dfe0469ef8da6d68247439d3278f30a180f",
"sha256:60cb8e5933193a3cc2912ee29ca331e9c15b2da034f76159b7abc520b3d1233a",
"sha256:6767ad399e9327bfdbaa40871be4254d1995f4a3ca3806127f10cec778bd9896",
"sha256:76a4f9bce0278becc2da7da3b8ef854bed41a991f4226911a24a9711baad672c",
"sha256:8cf33634b60c9cef346663a222d9841d3bbbc0a2f00221d6bcfd0d993d5543f6",
"sha256:94dd11d9f13ea1be17bac39c1942f527cbf7065f94953cf62dfe805653da2f8f",
"sha256:aafa46b5a39a27aca566198d3312fb3bde95ce9677085efd02c86f7ef6be4ec7",
"sha256:badca914580eb46385e7f7e4e426fea6de0a37b9e06bec252e481ae7ec287082",
"sha256:d76a26c5118c4d96e264acc9e3242d72e1a2b92e739807b3b69d8d47684b6677"
],
"markers": "python_version < '3.10' and platform_machine != 'aarch64' and platform_machine != 'arm64'",
"version": "==1.22.2"
},
"packaging": {
"hashes": [
"sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb",
@@ -347,6 +409,53 @@
"markers": "python_version >= '3.6'",
"version": "==21.3"
},
"pandas": {
"hashes": [
"sha256:0259cd11e7e6125aaea3af823b80444f3adad6149ff4c97fef760093598b3e34",
"sha256:04dd15d9db538470900c851498e532ef28d4e56bfe72c9523acb32042de43dfb",
"sha256:0b1a13f647e4209ed7dbb5da3497891d0045da9785327530ab696417ef478f84",
"sha256:19f7c632436b1b4f84615c3b127bbd7bc603db95e3d4332ed259dc815c9aaa26",
"sha256:1b384516dbb4e6aae30e3464c2e77c563da5980440fbdfbd0968e3942f8f9d70",
"sha256:1d85d5f6be66dfd6d1d8d13b9535e342a2214260f1852654b19fa4d7b8d1218b",
"sha256:2e5a7a1e0ecaac652326af627a3eca84886da9e667d68286866d4e33f6547caf",
"sha256:3129a35d9dad1d80c234dd78f8f03141b914395d23f97cf92a366dcd19f8f8bf",
"sha256:358b0bc98a5ff067132d23bf7a2242ee95db9ea5b7bbc401cf79205f11502fd3",
"sha256:3dfb32ed50122fe8c5e7f2b8d97387edd742cc78f9ec36f007ee126cd3720907",
"sha256:4e1176f45981c8ccc8161bc036916c004ca51037a7ed73f2d2a9857e6dbe654f",
"sha256:508c99debccd15790d526ce6b1624b97a5e1e4ca5b871319fb0ebfd46b8f4dad",
"sha256:6105af6533f8b63a43ea9f08a2ede04e8f43e49daef0209ab0d30352bcf08bee",
"sha256:6d6ad1da00c7cc7d8dd1559a6ba59ba3973be6b15722d49738b2be0977eb8a0c",
"sha256:7ea47ba1d6f359680130bd29af497333be6110de8f4c35b9211eec5a5a9630fa",
"sha256:8db93ec98ac7cb5f8ac1420c10f5e3c43533153f253fe7fb6d891cf5aa2b80d2",
"sha256:96e9ece5759f9b47ae43794b6359bbc54805d76e573b161ae770c1ea59393106",
"sha256:bbb15ad79050e8b8d39ec40dd96a30cd09b886a2ae8848d0df1abba4d5502a67",
"sha256:c614001129b2a5add5e3677c3a213a9e6fd376204cb8d17c04e84ff7dfc02a73",
"sha256:e6a7bbbb7950063bfc942f8794bc3e31697c020a14f1cd8905fc1d28ec674a01",
"sha256:f02e85e6d832be37d7f16cf6ac8bb26b519ace3e5f3235564a91c7f658ab2a43"
],
"markers": "python_version >= '3.8'",
"version": "==1.4.1"
},
"pluggy": {
"hashes": [
"sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
"sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
],
"markers": "python_version >= '3.6'",
"version": "==1.0.0"
},
"polyphemus": {
"git": "https://github.com/bellingcat/polyphemus.git",
"ref": "8506fd43770661cdcf92c5cac2356cba74778834"
},
"py": {
"hashes": [
"sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719",
"sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==1.11.0"
},
"pygments": {
"hashes": [
"sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65",
@@ -371,6 +480,14 @@
],
"version": "==1.7.1"
},
"pytest": {
"hashes": [
"sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db",
"sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171"
],
"markers": "python_version >= '3.6'",
"version": "==7.0.1"
},
"python-dateutil": {
"hashes": [
"sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
@@ -396,82 +513,83 @@
},
"regex": {
"hashes": [
"sha256:04611cc0f627fc4a50bc4a9a2e6178a974c6a6a4aa9c1cca921635d2c47b9c87",
"sha256:0b5d6f9aed3153487252d00a18e53f19b7f52a1651bc1d0c4b5844bc286dfa52",
"sha256:0d2f5c3f7057530afd7b739ed42eb04f1011203bc5e4663e1e1d01bb50f813e3",
"sha256:11772be1eb1748e0e197a40ffb82fb8fd0d6914cd147d841d9703e2bef24d288",
"sha256:1333b3ce73269f986b1fa4d5d395643810074dc2de5b9d262eb258daf37dc98f",
"sha256:16f81025bb3556eccb0681d7946e2b35ff254f9f888cff7d2120e8826330315c",
"sha256:1a171eaac36a08964d023eeff740b18a415f79aeb212169080c170ec42dd5184",
"sha256:1d6301f5288e9bdca65fab3de6b7de17362c5016d6bf8ee4ba4cbe833b2eda0f",
"sha256:1e031899cb2bc92c0cf4d45389eff5b078d1936860a1be3aa8c94fa25fb46ed8",
"sha256:1f8c0ae0a0de4e19fddaaff036f508db175f6f03db318c80bbc239a1def62d02",
"sha256:2245441445099411b528379dee83e56eadf449db924648e5feb9b747473f42e3",
"sha256:22709d701e7037e64dae2a04855021b62efd64a66c3ceed99dfd684bfef09e38",
"sha256:24c89346734a4e4d60ecf9b27cac4c1fee3431a413f7aa00be7c4d7bbacc2c4d",
"sha256:25716aa70a0d153cd844fe861d4f3315a6ccafce22b39d8aadbf7fcadff2b633",
"sha256:2dacb3dae6b8cc579637a7b72f008bff50a94cde5e36e432352f4ca57b9e54c4",
"sha256:34316bf693b1d2d29c087ee7e4bb10cdfa39da5f9c50fa15b07489b4ab93a1b5",
"sha256:36b2d700a27e168fa96272b42d28c7ac3ff72030c67b32f37c05616ebd22a202",
"sha256:37978254d9d00cda01acc1997513f786b6b971e57b778fbe7c20e30ae81a97f3",
"sha256:38289f1690a7e27aacd049e420769b996826f3728756859420eeee21cc857118",
"sha256:385ccf6d011b97768a640e9d4de25412204fbe8d6b9ae39ff115d4ff03f6fe5d",
"sha256:3c7ea86b9ca83e30fa4d4cd0eaf01db3ebcc7b2726a25990966627e39577d729",
"sha256:49810f907dfe6de8da5da7d2b238d343e6add62f01a15d03e2195afc180059ed",
"sha256:519c0b3a6fbb68afaa0febf0d28f6c4b0a1074aefc484802ecb9709faf181607",
"sha256:51f02ca184518702975b56affde6c573ebad4e411599005ce4468b1014b4786c",
"sha256:552a39987ac6655dad4bf6f17dd2b55c7b0c6e949d933b8846d2e312ee80005a",
"sha256:596f5ae2eeddb79b595583c2e0285312b2783b0ec759930c272dbf02f851ff75",
"sha256:6014038f52b4b2ac1fa41a58d439a8a00f015b5c0735a0cd4b09afe344c94899",
"sha256:61ebbcd208d78658b09e19c78920f1ad38936a0aa0f9c459c46c197d11c580a0",
"sha256:6213713ac743b190ecbf3f316d6e41d099e774812d470422b3a0f137ea635832",
"sha256:637e27ea1ebe4a561db75a880ac659ff439dec7f55588212e71700bb1ddd5af9",
"sha256:6aa427c55a0abec450bca10b64446331b5ca8f79b648531138f357569705bc4a",
"sha256:6ca45359d7a21644793de0e29de497ef7f1ae7268e346c4faf87b421fea364e6",
"sha256:6db1b52c6f2c04fafc8da17ea506608e6be7086715dab498570c3e55e4f8fbd1",
"sha256:752e7ddfb743344d447367baa85bccd3629c2c3940f70506eb5f01abce98ee68",
"sha256:760c54ad1b8a9b81951030a7e8e7c3ec0964c1cb9fee585a03ff53d9e531bb8e",
"sha256:768632fd8172ae03852e3245f11c8a425d95f65ff444ce46b3e673ae5b057b74",
"sha256:7a0b9f6a1a15d494b35f25ed07abda03209fa76c33564c09c9e81d34f4b919d7",
"sha256:7e070d3aef50ac3856f2ef5ec7214798453da878bb5e5a16c16a61edf1817cc3",
"sha256:7e12949e5071c20ec49ef00c75121ed2b076972132fc1913ddf5f76cae8d10b4",
"sha256:7e26eac9e52e8ce86f915fd33380f1b6896a2b51994e40bb094841e5003429b4",
"sha256:85ffd6b1cb0dfb037ede50ff3bef80d9bf7fa60515d192403af6745524524f3b",
"sha256:8618d9213a863c468a865e9d2ec50221015f7abf52221bc927152ef26c484b4c",
"sha256:8acef4d8a4353f6678fd1035422a937c2170de58a2b29f7da045d5249e934101",
"sha256:8d2f355a951f60f0843f2368b39970e4667517e54e86b1508e76f92b44811a8a",
"sha256:90b6840b6448203228a9d8464a7a0d99aa8fa9f027ef95fe230579abaf8a6ee1",
"sha256:9187500d83fd0cef4669385cbb0961e227a41c0c9bc39219044e35810793edf7",
"sha256:93c20777a72cae8620203ac11c4010365706062aa13aaedd1a21bb07adbb9d5d",
"sha256:93cce7d422a0093cfb3606beae38a8e47a25232eea0f292c878af580a9dc7605",
"sha256:94c623c331a48a5ccc7d25271399aff29729fa202c737ae3b4b28b89d2b0976d",
"sha256:97f32dc03a8054a4c4a5ab5d761ed4861e828b2c200febd4e46857069a483916",
"sha256:9a2bf98ac92f58777c0fafc772bf0493e67fcf677302e0c0a630ee517a43b949",
"sha256:a602bdc8607c99eb5b391592d58c92618dcd1537fdd87df1813f03fed49957a6",
"sha256:a9d24b03daf7415f78abc2d25a208f234e2c585e5e6f92f0204d2ab7b9ab48e3",
"sha256:abfcb0ef78df0ee9df4ea81f03beea41849340ce33a4c4bd4dbb99e23ec781b6",
"sha256:b013f759cd69cb0a62de954d6d2096d648bc210034b79b1881406b07ed0a83f9",
"sha256:b02e3e72665cd02afafb933453b0c9f6c59ff6e3708bd28d0d8580450e7e88af",
"sha256:b52cc45e71657bc4743a5606d9023459de929b2a198d545868e11898ba1c3f59",
"sha256:ba37f11e1d020969e8a779c06b4af866ffb6b854d7229db63c5fdddfceaa917f",
"sha256:bb804c7d0bfbd7e3f33924ff49757de9106c44e27979e2492819c16972ec0da2",
"sha256:bf594cc7cc9d528338d66674c10a5b25e3cde7dd75c3e96784df8f371d77a298",
"sha256:c38baee6bdb7fe1b110b6b3aaa555e6e872d322206b7245aa39572d3fc991ee4",
"sha256:c73d2166e4b210b73d1429c4f1ca97cea9cc090e5302df2a7a0a96ce55373f1c",
"sha256:c9099bf89078675c372339011ccfc9ec310310bf6c292b413c013eb90ffdcafc",
"sha256:cf0db26a1f76aa6b3aa314a74b8facd586b7a5457d05b64f8082a62c9c49582a",
"sha256:d19a34f8a3429bd536996ad53597b805c10352a8561d8382e05830df389d2b43",
"sha256:da80047524eac2acf7c04c18ac7a7da05a9136241f642dd2ed94269ef0d0a45a",
"sha256:de2923886b5d3214be951bc2ce3f6b8ac0d6dfd4a0d0e2a4d2e5523d8046fdfb",
"sha256:defa0652696ff0ba48c8aff5a1fac1eef1ca6ac9c660b047fc8e7623c4eb5093",
"sha256:e54a1eb9fd38f2779e973d2f8958fd575b532fe26013405d1afb9ee2374e7ab8",
"sha256:e5c31d70a478b0ca22a9d2d76d520ae996214019d39ed7dd93af872c7f301e52",
"sha256:ebaeb93f90c0903233b11ce913a7cb8f6ee069158406e056f884854c737d2442",
"sha256:ecfe51abf7f045e0b9cdde71ca9e153d11238679ef7b5da6c82093874adf3338",
"sha256:f99112aed4fb7cee00c7f77e8b964a9b10f69488cdff626ffd797d02e2e4484f",
"sha256:fd914db437ec25bfa410f8aa0aa2f3ba87cdfc04d9919d608d02330947afaeab"
"sha256:0008650041531d0eadecc96a73d37c2dc4821cf51b0766e374cb4f1ddc4e1c14",
"sha256:03299b0bcaa7824eb7c0ebd7ef1e3663302d1b533653bfe9dc7e595d453e2ae9",
"sha256:06b1df01cf2aef3a9790858af524ae2588762c8a90e784ba00d003f045306204",
"sha256:09b4b6ccc61d4119342b26246ddd5a04accdeebe36bdfe865ad87a0784efd77f",
"sha256:0be0c34a39e5d04a62fd5342f0886d0e57592a4f4993b3f9d257c1f688b19737",
"sha256:0d96eec8550fd2fd26f8e675f6d8b61b159482ad8ffa26991b894ed5ee19038b",
"sha256:0eb0e2845e81bdea92b8281a3969632686502565abf4a0b9e4ab1471c863d8f3",
"sha256:13bbf0c9453c6d16e5867bda7f6c0c7cff1decf96c5498318bb87f8136d2abd4",
"sha256:17e51ad1e6131c496b58d317bc9abec71f44eb1957d32629d06013a21bc99cac",
"sha256:1977bb64264815d3ef016625adc9df90e6d0e27e76260280c63eca993e3f455f",
"sha256:1e30762ddddb22f7f14c4f59c34d3addabc789216d813b0f3e2788d7bcf0cf29",
"sha256:1e73652057473ad3e6934944af090852a02590c349357b79182c1b681da2c772",
"sha256:20e6a27959f162f979165e496add0d7d56d7038237092d1aba20b46de79158f1",
"sha256:286ff9ec2709d56ae7517040be0d6c502642517ce9937ab6d89b1e7d0904f863",
"sha256:297c42ede2c81f0cb6f34ea60b5cf6dc965d97fa6936c11fc3286019231f0d66",
"sha256:320c2f4106962ecea0f33d8d31b985d3c185757c49c1fb735501515f963715ed",
"sha256:35ed2f3c918a00b109157428abfc4e8d1ffabc37c8f9abc5939ebd1e95dabc47",
"sha256:3d146e5591cb67c5e836229a04723a30af795ef9b70a0bbd913572e14b7b940f",
"sha256:42bb37e2b2d25d958c25903f6125a41aaaa1ed49ca62c103331f24b8a459142f",
"sha256:42d6007722d46bd2c95cce700181570b56edc0dcbadbfe7855ec26c3f2d7e008",
"sha256:43eba5c46208deedec833663201752e865feddc840433285fbadee07b84b464d",
"sha256:452519bc4c973e961b1620c815ea6dd8944a12d68e71002be5a7aff0a8361571",
"sha256:4b9c16a807b17b17c4fa3a1d8c242467237be67ba92ad24ff51425329e7ae3d0",
"sha256:5510932596a0f33399b7fff1bd61c59c977f2b8ee987b36539ba97eb3513584a",
"sha256:55820bc631684172b9b56a991d217ec7c2e580d956591dc2144985113980f5a3",
"sha256:57484d39447f94967e83e56db1b1108c68918c44ab519b8ecfc34b790ca52bf7",
"sha256:58ba41e462653eaf68fc4a84ec4d350b26a98d030be1ab24aba1adcc78ffe447",
"sha256:5bc5f921be39ccb65fdda741e04b2555917a4bced24b4df14eddc7569be3b493",
"sha256:5dcc4168536c8f68654f014a3db49b6b4a26b226f735708be2054314ed4964f4",
"sha256:5f92a7cdc6a0ae2abd184e8dfd6ef2279989d24c85d2c85d0423206284103ede",
"sha256:67250b36edfa714ba62dc62d3f238e86db1065fccb538278804790f578253640",
"sha256:6df070a986fc064d865c381aecf0aaff914178fdf6874da2f2387e82d93cc5bd",
"sha256:729aa8ca624c42f309397c5fc9e21db90bf7e2fdd872461aabdbada33de9063c",
"sha256:72bc3a5effa5974be6d965ed8301ac1e869bc18425c8a8fac179fbe7876e3aee",
"sha256:74d86e8924835f863c34e646392ef39039405f6ce52956d8af16497af4064a30",
"sha256:79e5af1ff258bc0fe0bdd6f69bc4ae33935a898e3cbefbbccf22e88a27fa053b",
"sha256:7b103dffb9f6a47ed7ffdf352b78cfe058b1777617371226c1894e1be443afec",
"sha256:83f03f0bd88c12e63ca2d024adeee75234d69808b341e88343b0232329e1f1a1",
"sha256:86d7a68fa53688e1f612c3246044157117403c7ce19ebab7d02daf45bd63913e",
"sha256:878c626cbca3b649e14e972c14539a01191d79e58934e3f3ef4a9e17f90277f8",
"sha256:878f5d649ba1db9f52cc4ef491f7dba2d061cdc48dd444c54260eebc0b1729b9",
"sha256:87bc01226cd288f0bd9a4f9f07bf6827134dc97a96c22e2d28628e824c8de231",
"sha256:8babb2b5751105dc0aef2a2e539f4ba391e738c62038d8cb331c710f6b0f3da7",
"sha256:91e0f7e7be77250b808a5f46d90bf0032527d3c032b2131b63dee54753a4d729",
"sha256:9557545c10d52c845f270b665b52a6a972884725aa5cf12777374e18f2ea8960",
"sha256:9ccb0a4ab926016867260c24c192d9df9586e834f5db83dfa2c8fffb3a6e5056",
"sha256:9d828c5987d543d052b53c579a01a52d96b86f937b1777bbfe11ef2728929357",
"sha256:9efa41d1527b366c88f265a227b20bcec65bda879962e3fc8a2aee11e81266d7",
"sha256:aaf5317c961d93c1a200b9370fb1c6b6836cc7144fef3e5a951326912bf1f5a3",
"sha256:ab69b4fe09e296261377d209068d52402fb85ef89dc78a9ac4a29a895f4e24a7",
"sha256:ad397bc7d51d69cb07ef89e44243f971a04ce1dca9bf24c992c362406c0c6573",
"sha256:ae17fc8103f3b63345709d3e9654a274eee1c6072592aec32b026efd401931d0",
"sha256:af4d8cc28e4c7a2f6a9fed544228c567340f8258b6d7ea815b62a72817bbd178",
"sha256:b22ff939a8856a44f4822da38ef4868bd3a9ade22bb6d9062b36957c850e404f",
"sha256:b549d851f91a4efb3e65498bd4249b1447ab6035a9972f7fc215eb1f59328834",
"sha256:be319f4eb400ee567b722e9ea63d5b2bb31464e3cf1b016502e3ee2de4f86f5c",
"sha256:c0446b2871335d5a5e9fcf1462f954586b09a845832263db95059dcd01442015",
"sha256:c68d2c04f7701a418ec2e5631b7f3552efc32f6bcc1739369c6eeb1af55f62e0",
"sha256:c87ac58b9baaf50b6c1b81a18d20eda7e2883aa9a4fb4f1ca70f2e443bfcdc57",
"sha256:caa2734ada16a44ae57b229d45091f06e30a9a52ace76d7574546ab23008c635",
"sha256:cb34c2d66355fb70ae47b5595aafd7218e59bb9c00ad8cc3abd1406ca5874f07",
"sha256:cb3652bbe6720786b9137862205986f3ae54a09dec8499a995ed58292bdf77c2",
"sha256:cf668f26604e9f7aee9f8eaae4ca07a948168af90b96be97a4b7fa902a6d2ac1",
"sha256:d326ff80ed531bf2507cba93011c30fff2dd51454c85f55df0f59f2030b1687b",
"sha256:d6c2441538e4fadd4291c8420853431a229fcbefc1bf521810fbc2629d8ae8c2",
"sha256:d6ecfd1970b3380a569d7b3ecc5dd70dba295897418ed9e31ec3c16a5ab099a5",
"sha256:e5602a9b5074dcacc113bba4d2f011d2748f50e3201c8139ac5b68cf2a76bd8b",
"sha256:ef806f684f17dbd6263d72a54ad4073af42b42effa3eb42b877e750c24c76f86",
"sha256:f3356afbb301ec34a500b8ba8b47cba0b44ed4641c306e1dd981a08b416170b5",
"sha256:f6f7ee2289176cb1d2c59a24f50900f8b9580259fa9f1a739432242e7d254f93",
"sha256:f7e8f1ee28e0a05831c92dc1c0c1c94af5289963b7cf09eca5b5e3ce4f8c91b0",
"sha256:f8169ec628880bdbca67082a9196e2106060a4a5cbd486ac51881a4df805a36f",
"sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d",
"sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4"
],
"version": "==2022.1.18"
"markers": "python_version >= '3.6'",
"version": "==2022.3.2"
},
"requests": {
"extras": [
@@ -486,11 +604,11 @@
},
"s3transfer": {
"hashes": [
"sha256:25c140f5c66aa79e1ac60be50dcd45ddc59e83895f062a3aab263b870102911f",
"sha256:69d264d3e760e569b78aaa0f22c97e955891cd22e32b10c51f784eeda4d9d10a"
"sha256:7a6f4c4d1fdb9a2b640244008e142cbc2cd3ae34b386584ef044dd0f27101971",
"sha256:95c58c194ce657a5f4fb0b9e60a84968c808888aed628cd98ab8771fe1db98ed"
],
"markers": "python_version >= '3.6'",
"version": "==0.5.1"
"version": "==0.5.2"
},
"six": {
"hashes": [
@@ -617,6 +735,14 @@
"index": "pypi",
"version": "==1.4.31"
},
"tomli": {
"hashes": [
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
"sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
],
"markers": "python_version >= '3.7'",
"version": "==2.0.1"
},
"tzdata": {
"hashes": [
"sha256:3eee491e22ebfe1e5cfcc97a4137cd70f092ce59144d81f8924a844de05ba8f5",
@@ -638,9 +764,17 @@
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'",
"version": "==1.26.8"
},
"youtube-dl": {
"hashes": [
"sha256:bc59e86c5d15d887ac590454511f08ce2c47698d5a82c27bfe27b5d814bbaed2",
"sha256:f1336d5de68647e0364a47b3c0712578e59ec76f02048ff5c50ef1c69d79cd55"
],
"index": "pypi",
"version": "==2021.12.17"
},
"zipp": {
"hashes": [
"sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d",
@@ -650,5 +784,143 @@
"version": "==3.7.0"
}
},
"develop": {}
"develop": {
"attrs": {
"hashes": [
"sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",
"sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==21.4.0"
},
"coverage": {
"extras": [
"toml"
],
"hashes": [
"sha256:03e2a7826086b91ef345ff18742ee9fc47a6839ccd517061ef8fa1976e652ce9",
"sha256:07e6db90cd9686c767dcc593dff16c8c09f9814f5e9c51034066cad3373b914d",
"sha256:18d520c6860515a771708937d2f78f63cc47ab3b80cb78e86573b0a760161faf",
"sha256:1ebf730d2381158ecf3dfd4453fbca0613e16eaa547b4170e2450c9707665ce7",
"sha256:21b7745788866028adeb1e0eca3bf1101109e2dc58456cb49d2d9b99a8c516e6",
"sha256:26e2deacd414fc2f97dd9f7676ee3eaecd299ca751412d89f40bc01557a6b1b4",
"sha256:2c6dbb42f3ad25760010c45191e9757e7dce981cbfb90e42feef301d71540059",
"sha256:2fea046bfb455510e05be95e879f0e768d45c10c11509e20e06d8fcaa31d9e39",
"sha256:34626a7eee2a3da12af0507780bb51eb52dca0e1751fd1471d0810539cefb536",
"sha256:37d1141ad6b2466a7b53a22e08fe76994c2d35a5b6b469590424a9953155afac",
"sha256:46191097ebc381fbf89bdce207a6c107ac4ec0890d8d20f3360345ff5976155c",
"sha256:4dd8bafa458b5c7d061540f1ee9f18025a68e2d8471b3e858a9dad47c8d41903",
"sha256:4e21876082ed887baed0146fe222f861b5815455ada3b33b890f4105d806128d",
"sha256:58303469e9a272b4abdb9e302a780072c0633cdcc0165db7eec0f9e32f901e05",
"sha256:5ca5aeb4344b30d0bec47481536b8ba1181d50dbe783b0e4ad03c95dc1296684",
"sha256:68353fe7cdf91f109fc7d474461b46e7f1f14e533e911a2a2cbb8b0fc8613cf1",
"sha256:6f89d05e028d274ce4fa1a86887b071ae1755082ef94a6740238cd7a8178804f",
"sha256:7a15dc0a14008f1da3d1ebd44bdda3e357dbabdf5a0b5034d38fcde0b5c234b7",
"sha256:8bdde1177f2311ee552f47ae6e5aa7750c0e3291ca6b75f71f7ffe1f1dab3dca",
"sha256:8ce257cac556cb03be4a248d92ed36904a59a4a5ff55a994e92214cde15c5bad",
"sha256:8cf5cfcb1521dc3255d845d9dca3ff204b3229401994ef8d1984b32746bb45ca",
"sha256:8fbbdc8d55990eac1b0919ca69eb5a988a802b854488c34b8f37f3e2025fa90d",
"sha256:9548f10d8be799551eb3a9c74bbf2b4934ddb330e08a73320123c07f95cc2d92",
"sha256:96f8a1cb43ca1422f36492bebe63312d396491a9165ed3b9231e778d43a7fca4",
"sha256:9b27d894748475fa858f9597c0ee1d4829f44683f3813633aaf94b19cb5453cf",
"sha256:9baff2a45ae1f17c8078452e9e5962e518eab705e50a0aa8083733ea7d45f3a6",
"sha256:a2a8b8bcc399edb4347a5ca8b9b87e7524c0967b335fbb08a83c8421489ddee1",
"sha256:acf53bc2cf7282ab9b8ba346746afe703474004d9e566ad164c91a7a59f188a4",
"sha256:b0be84e5a6209858a1d3e8d1806c46214e867ce1b0fd32e4ea03f4bd8b2e3359",
"sha256:b31651d018b23ec463e95cf10070d0b2c548aa950a03d0b559eaa11c7e5a6fa3",
"sha256:b78e5afb39941572209f71866aa0b206c12f0109835aa0d601e41552f9b3e620",
"sha256:c76aeef1b95aff3905fb2ae2d96e319caca5b76fa41d3470b19d4e4a3a313512",
"sha256:dd035edafefee4d573140a76fdc785dc38829fe5a455c4bb12bac8c20cfc3d69",
"sha256:dd6fe30bd519694b356cbfcaca9bd5c1737cddd20778c6a581ae20dc8c04def2",
"sha256:e5f4e1edcf57ce94e5475fe09e5afa3e3145081318e5fd1a43a6b4539a97e518",
"sha256:ec6bc7fe73a938933d4178c9b23c4e0568e43e220aef9472c4f6044bfc6dd0f0",
"sha256:f1555ea6d6da108e1999b2463ea1003fe03f29213e459145e70edbaf3e004aaa",
"sha256:f5fa5803f47e095d7ad8443d28b01d48c0359484fec1b9d8606d0e3282084bc4",
"sha256:f7331dbf301b7289013175087636bbaf5b2405e57259dd2c42fdcc9fcc47325e",
"sha256:f9987b0354b06d4df0f4d3e0ec1ae76d7ce7cbca9a2f98c25041eb79eec766f1",
"sha256:fd9e830e9d8d89b20ab1e5af09b32d33e1a08ef4c4e14411e559556fd788e6b2"
],
"markers": "python_version >= '3.7'",
"version": "==6.3.2"
},
"iniconfig": {
"hashes": [
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
"sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
],
"version": "==1.1.1"
},
"packaging": {
"hashes": [
"sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb",
"sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"
],
"markers": "python_version >= '3.6'",
"version": "==21.3"
},
"pluggy": {
"hashes": [
"sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
"sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
],
"markers": "python_version >= '3.6'",
"version": "==1.0.0"
},
"py": {
"hashes": [
"sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719",
"sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==1.11.0"
},
"pyparsing": {
"hashes": [
"sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea",
"sha256:a6c06a88f252e6c322f65faf8f418b16213b51bdfaece0524c1c1bc30c63c484"
],
"markers": "python_version >= '3.6'",
"version": "==3.0.7"
},
"pytest": {
"hashes": [
"sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db",
"sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171"
],
"markers": "python_version >= '3.6'",
"version": "==7.0.1"
},
"pytest-cov": {
"hashes": [
"sha256:578d5d15ac4a25e5f961c938b85a05b09fdaae9deef3bb6de9a6e766622ca7a6",
"sha256:e7f0f5b1617d2210a2cabc266dfe2f4c75a8d32fb89eafb7ad9d06f6d076d470"
],
"index": "pypi",
"version": "==3.0.0"
},
"pytest-html": {
"hashes": [
"sha256:3ee1cf319c913d19fe53aeb0bc400e7b0bc2dbeb477553733db1dad12eb75ee3",
"sha256:b7f82f123936a3f4d2950bc993c2c1ca09ce262c9ae12f9ac763a2401380b455"
],
"index": "pypi",
"version": "==3.1.1"
},
"pytest-metadata": {
"hashes": [
"sha256:576055b8336dd4a9006dd2a47615f76f2f8c30ab12b1b1c039d99e834583523f",
"sha256:71b506d49d34e539cc3cfdb7ce2c5f072bea5c953320002c95968e0238f8ecf1"
],
"index": "pypi",
"version": "==1.11.0"
},
"tomli": {
"hashes": [
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
"sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
],
"markers": "python_version >= '3.7'",
"version": "==2.0.1"
}
}
}

View File

@@ -1,71 +1,3 @@
from typing import List
import cisticola.base
import cisticola.scraper.base
from sqlalchemy.orm import sessionmaker
from loguru import logger
class ScraperController:
    """Registers scrapers, uses them to generate ScraperResults. Synchronizes
    everything with database via ORM.

    ``connect_to_db`` must be called before ``scrape_channels``; without a
    configured session factory ``scrape_channels`` logs an error and returns.
    """

    def __init__(self):
        # Scrapers are consulted in registration order.
        self.scrapers = []
        # Session factory (a ``sessionmaker``); set by ``connect_to_db``.
        self.session = None
        self.mapper_registry = None

    def register_scraper(self, scraper: cisticola.scraper.base.Scraper):
        """Add ``scraper`` to the scrapers consulted by ``scrape_channels``."""
        self.scrapers.append(scraper)

    def scrape_channels(self, channels: List[cisticola.base.Channel]):
        """Scrape every channel with the first registered scraper that accepts it.

        For each channel, the most recent stored ``ScraperResult`` (by
        ``date``) is looked up and passed to the scraper as ``since``
        (``None`` when nothing is stored yet). Every post the scraper yields
        is added to the session and committed.

        Args:
            channels: Channels to scrape. A channel that no registered
                scraper can handle is reported with a warning and skipped.
        """
        if self.session is None:
            logger.error("No DB session")
            return

        for channel in channels:
            for scraper in self.scrapers:
                if not scraper.can_handle(channel):
                    continue

                added = 0

                # Exactly one session per handled channel, closed when the
                # block exits (also on error), so no connection is left open.
                with self.session() as session:
                    # get most recent post
                    since = session.query(cisticola.base.ScraperResult).where(
                        cisticola.base.ScraperResult.channel == channel.id).order_by(
                        cisticola.base.ScraperResult.date.desc()).first()

                    for post in scraper.get_posts(channel, since=since):
                        session.add(post)
                        added += 1

                    session.commit()

                logger.info(
                    f"{scraper} found {added} new posts from {channel}")
                # Only the first matching scraper handles a channel.
                break
            else:
                # The loop finished without ``break``: nothing could handle it.
                logger.warning(f"No handler found for Channel {channel}")

    def connect_to_db(self, engine):
        """Create the mapped tables on ``engine`` and bind the session factory to it."""
        # create tables
        cisticola.base.mapper_registry.metadata.create_all(bind=engine)

        self.session = sessionmaker()
        self.session.configure(bind=engine)
class ETLController:
    """This class will transform the raw_data tables into a format more conducive to analysis.

    It is currently a placeholder with no state and no behavior.
    """

    def __init__(self):
        """Create the controller; there is nothing to initialize yet."""
from . import base
from . import scraper
from . import transformer

View File

@@ -1,11 +1,12 @@
from typing import List
from dataclasses import dataclass
from datetime import datetime
from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey
mapper_registry = registry()
@dataclass
class ScraperResult:
"""A minimally processed result from a scraper"""
@@ -84,4 +85,4 @@ analysis_table = Table('analysis', mapper_registry.metadata,
Column('author_username', String)
)
mapper_registry.map_imperatively(TransformedResult, analysis_table)
mapper_registry.map_imperatively(TransformedResult, analysis_table)

View File

@@ -0,0 +1,8 @@
from .base import Scraper, ScraperController
from .bitchute import BitchuteScraper
from .gab import GabScraper
from .gettr import GettrScraper
from .odysee import OdyseeScraper
from .rumble import RumbleScraper
from .telegram_snscrape import TelegramSnscrapeScraper
from .twitter import TwitterScraper

View File

@@ -1,10 +1,16 @@
from typing import Generator
import cisticola.base
import requests
from typing import Generator, Tuple, List
import os
import boto3
from io import BytesIO
from urllib.parse import urlparse
import tempfile
import requests
import boto3
from loguru import logger
import ffmpeg
from sqlalchemy.orm import sessionmaker
from cisticola.base import Channel, ScraperResult, mapper_registry
class Scraper:
__version__ = "Scraper 0.0.0"
@@ -19,31 +25,64 @@ class Scraper:
'DO_SPACES_KEY'),
aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
self.headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}
pass
def __str__(self):
return self.__version__
def archive_media(self, url: str, key: str = None) -> str:
def url_to_key(self, url: str, content_type: str) -> str:
    """Return the last segment of the path component of ``url``.

    Only the path is considered, so any query string or fragment is left
    out. ``content_type`` is accepted but not used by this implementation.
    """
    path = urlparse(url).path
    _, _, last_segment = path.rpartition('/')
    return last_segment
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
n_retries = 0
r = requests.get(url)
r = requests.get(url, headers = self.headers)
while r.status_code != 200 and n_retries < 5:
logger.warning(f"{n_retries}/5: Request for {url} failed")
n_retries += 1
r = requests.get(url)
r = requests.get(url, headers = self.headers)
if r.status_code != 200:
logger.error(f"Could not fetch URL {url}")
return url
blob = r.content
content_type = r.headers.get('Content-Type')
if key is None:
key = url.split('/')[-1]
key = key.split('?')[0]
key = self.url_to_key(url, content_type)
return blob, content_type, key
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
    """Run ffmpeg on ``url`` and return the resulting file as a blob.

    ffmpeg writes its output (video codec copied, log level ``error``) to a
    named temporary ``.mp4`` file, which is then read back into memory.

    Args:
        url: Input passed to ``ffmpeg.input``.
        key: Storage key to use. When ``None`` the key is derived with
            ``self.url_to_key``.

    Returns:
        A ``(blob, content_type, key)`` tuple; ``content_type`` is always
        ``'video/mp4'``.
    """
    content_type = 'video/mp4'
    suffix = '.' + content_type.split('/')[-1]

    with tempfile.NamedTemporaryFile(suffix=suffix) as scratch:
        stream = ffmpeg.input(url)
        stream = stream.output(scratch.name, vcodec='copy')
        stream = stream.global_args('-loglevel', 'error')
        stream.run(overwrite_output=True)

        # Rewind before reading what ffmpeg wrote to the file.
        scratch.seek(0)
        blob = scratch.read()

    if key is not None:
        return blob, content_type, key
    return blob, content_type, self.url_to_key(url=url, content_type=content_type)
def archive_media(self, blob: bytes, content_type: str, key: str) -> str:
filename = self.__version__.replace(' ', '_') + '/' + key
@@ -54,8 +93,77 @@ class Scraper:
return archived_url
def can_handle(self, channel: cisticola.base.Channel) -> bool:
def can_handle(self, channel: Channel) -> bool:
pass
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
pass
class ScraperController:
    """Registers scrapers, uses them to generate ScraperResults. Synchronizes
    everything with database via ORM."""

    def __init__(self):
        # Scrapers are tried in registration order; the first whose
        # can_handle() accepts a channel is used for it.
        self.scrapers = []
        # Session factory, set by connect_to_db(); None until then.
        self.session = None
        self.mapper_registry = None

    def register_scraper(self, scraper: Scraper):
        """Add a single scraper to the end of the list of known scrapers."""
        self.scrapers.append(scraper)

    def register_scrapers(self, scraper: List[Scraper]):
        """Add several scrapers at once, preserving their order."""
        self.scrapers.extend(scraper)

    def scrape_channels(self, channels: List[Channel]):
        """Scrape every channel with the first scraper able to handle it.

        For each channel the most recent stored ``ScraperResult`` (by
        ``date``) is passed to the scraper as ``since``. All posts the
        scraper yields are added to the database and committed together
        once the scraper is exhausted.

        Logs an error and returns immediately when ``connect_to_db`` has not
        been called; logs a warning for channels no scraper can handle.
        """
        if self.session is None:
            logger.error("No DB session")
            return

        for channel in channels:
            scraper = next(
                (s for s in self.scrapers if s.can_handle(channel)), None)

            if scraper is None:
                logger.warning(f"No handler found for Channel {channel}")
                continue

            # One session per channel. Previously two sessions were opened
            # here (the first was never used) and none was ever closed.
            session = self.session()
            try:
                # get most recent post
                since = session.query(ScraperResult).where(
                    ScraperResult.channel == channel.id).order_by(
                        ScraperResult.date.desc()).first()

                added = 0
                for post in scraper.get_posts(channel, since=since):
                    session.add(post)
                    added += 1

                # Commit once per channel: if the scraper fails part-way,
                # nothing is stored and the next run starts from the same
                # `since` instead of skipping the posts that were missed.
                session.commit()
            finally:
                session.close()

            logger.info(
                f"{scraper} found {added} new posts from {channel}")

    def connect_to_db(self, engine):
        """Create the mapped tables on ``engine`` and bind the session factory."""
        # create tables
        mapper_registry.metadata.create_all(bind=engine)

        self.session = sessionmaker()
        self.session.configure(bind=engine)
class ETLController:
    """Placeholder for the controller that will transform the raw_data tables
    into a format more conducive to analysis.
    """

    def __init__(self) -> None:
        """Create the controller; no state is set up yet."""
        return None

View File

@@ -9,53 +9,54 @@ from typing import Generator
import requests
from bs4 import BeautifulSoup
import cisticola.base
class BitchuteScraper(cisticola.scraper.Scraper):
from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
class BitchuteScraper(Scraper):
"""An implementation of a Scraper for Bitchute, using classes from the 4cat
library"""
__version__ = "BitchuteScraper 0.0.1"
# TODO snscrape should be able to scrape from user ID alone, but there is
# currently a bug/other issue, so it is extracting the username from URL
def get_username_from_url(url):
    """Return what follows ``bitchute.com/channel/`` in ``url``.

    Leading and trailing slashes are removed. When the marker is absent the
    whole ``url`` (stripped of surrounding slashes) is returned.
    """
    _, _, tail = url.rpartition('bitchute.com/channel/')
    return tail.strip('/')
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"
session.headers.update(self.headers)
request = session.get("https://www.bitchute.com/search")
csrftoken = BeautifulSoup(request.text, 'html.parser').findAll(
"input", {"name": "csrfmiddlewaretoken"})[0].get("value")
time.sleep(0.25)
# Don't scrape comment information
#TODO implement framework for processing and storing comments
detail = 'basic'
detail = 'comments'
posts = []
username = BitchuteScraper.get_username_from_url(channel.url)
scraper = get_videos_user(session, username, csrftoken, detail)
for i, post in enumerate(scraper):
for post in scraper:
if since is not None and post['timestamp'] <= since.date_archived.timestamp():
print( f'\n\nBREAK ON VIDEO: {i}\n\n')
if since is not None and datetime.fromtimestamp(post['timestamp']) <= since.date:
break
posts.append(cisticola.base.ScraperResult(
archived_urls = {}
if 'video_url' in post:
url = post['video_url']
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
platform="Bitchute",
channel=channel.id,
platform_id=post['id'],
date=datetime.fromtimestamp(post['timestamp']),
date_archived=datetime.now(),
raw_data=json.dumps(post)))
return posts
raw_data=json.dumps(post),
archived_urls=archived_urls)
def can_handle(self, channel):
if channel.platform == "Bitchute" and BitchuteScraper.get_username_from_url(channel.url) is not None:

53
cisticola/scraper/gab.py Normal file
View File

@@ -0,0 +1,53 @@
from datetime import datetime
import json
from typing import Generator
from garc import Garc
from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
class GabScraper(Scraper):
    """An implementation of a Scraper for Gab, using GARC library"""
    __version__ = "GabScraper 0.0.1"

    @staticmethod
    def get_username_from_url(url):
        """Return the part of ``url`` that follows ``https://gab.com/``.

        When the prefix is absent the whole ``url`` is returned unchanged.
        """
        return url.split('https://gab.com/')[-1]

    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
        """Yield one ``ScraperResult`` per post of ``channel``.

        Media attached to a post (and to the post it reposts, if any) is
        downloaded and archived; the mapping of original URL to archived URL
        is stored in ``archived_urls``.

        Iteration stops at the first post whose date is not later than
        ``since.date``.
        # NOTE(review): this assumes the client yields newest posts first -- confirm.
        """
        client = Garc(profile='main')
        username = GabScraper.get_username_from_url(channel.url)

        for post in client.userposts(username):
            # Results are stored with naive datetimes (see `date=` below), so
            # the comparison must use a naive value as well. Comparing the
            # timezone-aware parse result against `since.date` raises
            # "can't compare offset-naive and offset-aware datetimes".
            created = datetime.fromisoformat(
                post['created_at'].replace("Z", "+00:00")).replace(tzinfo=None)

            if since is not None and created <= since.date:
                break

            media_urls = [p['url'] for p in post['media_attachments']]
            if post.get('repost') is not None:
                media_urls.extend(
                    p['url'] for p in post['repost']['media_attachments'])

            archived_urls = {}
            for url in media_urls:
                media_blob, content_type, key = self.url_to_blob(url)
                archived_urls[url] = self.archive_media(media_blob, content_type, key)

            yield ScraperResult(
                scraper=self.__version__,
                platform="Gab",
                channel=channel.id,
                platform_id=post['id'],
                date=created,
                date_archived=datetime.now(),
                raw_data=json.dumps(post),
                archived_urls=archived_urls)

    def can_handle(self, channel):
        """Return whether ``channel`` is a Gab channel with a usable URL."""
        return (channel.platform == "Gab"
                and GabScraper.get_username_from_url(channel.url) is not None)

View File

@@ -1,11 +1,13 @@
import cisticola.base
import cisticola.scraper.base
from datetime import datetime
import json
from typing import Generator
from typing import Generator, Tuple
from urllib.parse import urlparse
from gogettr import PublicClient
class GettrScraper(cisticola.scraper.base.Scraper):
from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
class GettrScraper(Scraper):
"""An implementation of a Scraper for Gettr, using gogettr library"""
__version__ = "GettrScraper 0.0.1"
@@ -16,7 +18,7 @@ class GettrScraper(cisticola.scraper.base.Scraper):
return username
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
client = PublicClient()
username = GettrScraper.get_username_from_url(channel.url)
scraper = client.user_activity(username=username, type="posts")
@@ -30,19 +32,23 @@ class GettrScraper(cisticola.scraper.base.Scraper):
if 'imgs' in post:
for img in post['imgs']:
url = "https://media.gettr.com/" + img
archived_url = self.archive_media(url)
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[img] = archived_url
if 'main' in post:
archived_url = self.archive_media("https://media.gettr.com/" + post['main'])
url = "https://media.gettr.com/" + post['main']
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[post['main']] = archived_url
# Videos are referenced through a playlist URL; m3u8_url_to_blob runs ffmpeg on it so the video itself is archived
if 'vid' in post:
archived_url = self.archive_media("https://media.gettr.com/" + post['vid'])
url = "https://media.gettr.com/" + post['vid']
media_blob, content_type, key = self.m3u8_url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[post['vid']] = archived_url
yield cisticola.base.ScraperResult(
yield ScraperResult(
scraper=self.__version__,
platform="Gettr",
channel=channel.id,
@@ -55,3 +61,8 @@ class GettrScraper(cisticola.scraper.base.Scraper):
def can_handle(self, channel):
    """Return ``True`` for a Gettr channel whose URL yields a username.

    Falls through to ``None`` for every other channel.
    """
    if channel.platform != "Gettr":
        return None
    if GettrScraper.get_username_from_url(channel.url) is None:
        return None
    return True
def url_to_key(self, url: str, content_type: str) -> str:
    """Build the storage key from the URL's second-to-last path segment.

    The extension is the part of ``content_type`` after the last ``/``.
    """
    subtype = content_type.split('/')[-1]
    segments = urlparse(url).path.split('/')
    return segments[-2] + '.' + subtype

View File

@@ -0,0 +1,78 @@
from datetime import datetime
import json
from typing import Generator
from urllib.parse import urlparse
from polyphemus.base import OdyseeChannel
import requests
from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
class OdyseeScraper(Scraper):
    """An implementation of a Scraper for Odysee, using polyphemus library"""
    __version__ = "OdyseeScraper 0.0.1"

    @staticmethod
    def get_username_from_url(url):
        """Return the channel name from an ``odysee.com/@name[:id]`` URL.

        Takes what follows ``odysee.com/``, strips ``@`` characters from both
        ends and drops everything from the first ``:`` on.
        """
        return url.split('odysee.com/')[-1].strip('@').split(':')[0]

    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
        """Yield a ``ScraperResult`` for each video, followed by its comments.

        The video's streaming URL is archived; comments carry no media.
        Iteration stops at the first video created at or before
        ``since.date``.
        # NOTE(review): this assumes videos are returned newest first -- confirm.
        """
        username = OdyseeScraper.get_username_from_url(channel.url)
        odysee_channel = OdyseeChannel(channel_name=username)

        for video in odysee_channel.get_all_videos():
            # Metadata lives in `video.info`; the former `video['created']`
            # subscripted the video object itself, unlike every other access.
            created = datetime.fromtimestamp(video.info['created'])

            if since is not None and created <= since.date:
                break

            url = video.info['streaming_url']

            # Check if file is a video file or an m3u8 file
            r = requests.head(url)
            if r.headers.get('Content-Type') == 'text/html; charset=utf-8':
                media_blob, content_type, key = self.m3u8_url_to_blob(url)
            else:
                media_blob, content_type, key = self.url_to_blob(url)

            archived_urls = {
                url: self.archive_media(media_blob, content_type, key)}

            all_comments = video.get_all_comments()

            yield ScraperResult(
                scraper=self.__version__,
                platform="Odysee",
                channel=channel.id,
                platform_id=video.info['claim_id'],
                date=created,
                date_archived=datetime.now(),
                raw_data=json.dumps(video.info),
                archived_urls=archived_urls)

            for comment in all_comments:
                yield ScraperResult(
                    scraper=self.__version__,
                    platform="Odysee",
                    channel=channel.id,
                    platform_id=comment.info['claim_id'],
                    date=datetime.fromtimestamp(comment.info['created']),
                    date_archived=datetime.now(),
                    raw_data=json.dumps(comment.info),
                    archived_urls={})

    def can_handle(self, channel):
        """Return whether ``channel`` is an Odysee channel with a usable URL."""
        return (channel.platform == "Odysee"
                and OdyseeScraper.get_username_from_url(channel.url) is not None)

    def url_to_key(self, url: str, content_type: str) -> str:
        """Build the key from the URL's second-to-last path segment plus the
        content type's subtype as extension."""
        key = urlparse(url).path.split('/')[-2]
        ext = content_type.split('/')[-1]
        return f'{key}.{ext}'

143
cisticola/scraper/rumble.py Normal file
View File

@@ -0,0 +1,143 @@
from datetime import datetime
import json
from typing import Generator, Tuple
import tempfile
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
import youtube_dl
from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
BASE_URL = 'https://rumble.com'
class RumbleScraper(Scraper):
    """An implementation of a Scraper for Rumble, using custom functions"""
    __version__ = "RumbleScraper 0.0.1"

    @staticmethod
    def get_username_from_url(url):
        """Return what follows ``https://rumble.com/c/`` in ``url``.

        Returns ``None`` when the prefix is absent, which is what
        ``can_handle`` tests for (indexing the split result used to raise
        ``IndexError`` instead).
        """
        prefix = 'https://rumble.com/c/'
        if prefix not in url:
            return None
        return url.split(prefix)[1]

    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
        """Yield one ``ScraperResult`` per video listed on the channel.

        Each video is downloaded through ``url_to_blob`` and archived.
        Iteration stops at the first video dated at or before ``since.date``.
        # NOTE(review): this assumes the listing is newest first -- confirm.
        """
        username = RumbleScraper.get_username_from_url(channel.url)

        for post in get_channel_videos(username):
            # The dicts built by process_video() carry an ISO 'datetime'
            # string and no 'cdate' key, so the former `post['cdate']` lookup
            # could only raise KeyError. Parse the same field that is stored.
            posted = datetime.fromisoformat(post['datetime']).replace(tzinfo=None)

            if since is not None and posted <= since.date:
                break

            url = post['media_url']
            media_blob, content_type, key = self.url_to_blob(url)
            archived_urls = {
                url: self.archive_media(media_blob, content_type, key)}

            yield ScraperResult(
                scraper=self.__version__,
                platform="Rumble",
                channel=channel.id,
                platform_id=url.split('/')[-2],
                date=posted,
                date_archived=datetime.now(),
                raw_data=json.dumps(post),
                archived_urls=archived_urls)

    def can_handle(self, channel):
        """Return whether ``channel`` is a Rumble channel with a channel URL."""
        return (channel.platform == "Rumble"
                and RumbleScraper.get_username_from_url(channel.url) is not None)

    def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
        """Download the video at ``url`` with youtube-dl and return it.

        The file is downloaded into a temporary directory and read back into
        memory before the directory is removed.

        Returns:
            A ``(blob, content_type, key)`` tuple. ``content_type`` is always
            ``'video/mp4'``; when ``key`` is ``None`` it is the URL's
            second-to-last path segment plus ``.mp4``.

        Raises:
            youtube_dl.utils.DownloadError: If the download fails.
        """
        content_type = 'video/mp4'
        ext = '.' + content_type.split('/')[-1]

        with tempfile.TemporaryDirectory() as temp_dir:
            ydl_opts = {
                "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
                "merge_output_format": "mp4",
                "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
                "noplaylist": True,
                'quiet': True,
                "verbose": False,}
            ydl = youtube_dl.YoutubeDL(ydl_opts)

            # A DownloadError propagates unchanged; the former
            # `except ... raise e` wrapper added nothing.
            meta = ydl.extract_info(url, download=True)

            # The file name mirrors the "outtmpl" pattern above.
            with open(f"{temp_dir}/{meta['id']}.{meta['ext']}", "rb") as f:
                blob = f.read()

        if key is None:
            key = urlparse(url).path.split('/')[-2] + ext

        return blob, content_type, key
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def get_media_url(url):
    """Fetch the page at ``url`` and return the ``embedUrl`` of its metadata.

    The value is read from the first entry of the JSON found in the page's
    ``<script type="application/ld+json">`` element.
    """
    response = requests.get(url)
    page = BeautifulSoup(response.content, features='lxml')
    ld_json_tag = page.find('script', {'type': 'application/ld+json'})
    metadata = json.loads(ld_json_tag.text)
    return metadata[0]['embedUrl']
def process_video(video):
    """Turn one video listing element into a dict of its metadata.

    ``video`` is searched with ``find`` for the title, thumbnail, link, view
    count, "rumbles" count, duration and timestamp. ``rumbles`` defaults to
    ``'0'`` when its element is missing. The video page behind ``link`` is
    then fetched to add ``media_url``.
    """
    rumbles_tag = video.find('span', {'class': 'video-item--rumbles'})
    rumbles = '0' if rumbles_tag is None else rumbles_tag['data-value']

    title = video.find('h3').text
    thumbnail = video.find('img')['src']
    link = BASE_URL + video.find('a', href=True)['href']
    views = video.find('span', {'class': 'video-item--views'})['data-value']
    duration = video.find('span', {'class': 'video-item--duration'})['data-value']
    posted_at = video.find('time')['datetime']

    return {
        'title': title,
        'thumbnail': thumbnail,
        'link': link,
        'views': views,
        'rumbles': rumbles,
        'duration': duration,
        'datetime': posted_at,
        'media_url': get_media_url(link)}
def get_channel_videos(channel):
    """Yield the metadata dict of every video listed for ``channel``.

    Listing pages are requested one after another, starting at page 1.
    Paging ends at the first 404 response or at the first page that lists
    no videos.

    Raises:
        requests.exceptions.HTTPError: For an error status other than 404.
    """
    page = 1
    channel_url = f'{BASE_URL}/c/{channel}?page='

    while True:
        r = requests.get(channel_url + str(page))

        if r.status_code == 404:
            break

        # Any other error status used to be parsed as if it were a listing:
        # it produced no videos and the loop went on to the next page forever.
        r.raise_for_status()

        soup = BeautifulSoup(r.content, features='lxml')
        video_list = soup.find_all('li', {'class': 'video-listing-entry'})

        # A page without entries means there is nothing further to fetch;
        # without this check such a page would also loop endlessly.
        if not video_list:
            break

        for video in video_list:
            yield process_video(video)

        page += 1
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -1,18 +1,19 @@
import cisticola.base
import cisticola.scraper.base
from typing import Generator
import snscrape.modules
from datetime import datetime, timezone
import snscrape.modules
class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
class TelegramSnscrapeScraper(Scraper):
__version__ = "TelegramSnscrapeScraper 0.0.1"
def can_handle(self, channel):
    """Return ``True`` for a public Telegram channel that is not a chat.

    Falls through to ``None`` for every other channel.
    """
    if channel.platform != "Telegram":
        return None
    if not channel.public:
        return None
    if channel.chat:
        return None
    return True
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
scr = snscrape.modules.telegram.TelegramChannelScraper(
channel.screenname)
@@ -25,14 +26,16 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
archived_urls = {}
for image_url in post.images:
archive_url = self.archive_media(image_url)
archived_urls[image_url] = archive_url
media_blob, content_type, key = self.url_to_blob(image_url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[image_url] = archived_url
if post.video:
video_archive_url = self.archive_media(post.video)
archived_urls[post.video] = video_archive_url
media_blob, content_type, key = self.url_to_blob(post.video)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[post.video] = archived_url
yield cisticola.base.ScraperResult(
yield ScraperResult(
scraper=self.__version__,
platform="Telegram",
channel=channel.id,

View File

@@ -1,17 +1,19 @@
import cisticola.base
import cisticola.scraper.base
from datetime import datetime, timezone
from typing import Generator
import snscrape.modules
from urllib.parse import urlparse, parse_qs
from snscrape.modules.twitter import TwitterProfileScraper, Video, Gif, Photo
from loguru import logger
from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
class TwitterScraper(cisticola.scraper.base.Scraper):
class TwitterScraper(Scraper):
"""An implementation of a Scraper for Twitter, using snscrape library"""
__version__ = "TwitterScraper 0.0.1"
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
scraper = snscrape.modules.twitter.TwitterProfileScraper(channel.platform_id)
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
scraper = TwitterProfileScraper(channel.platform_id)
first = True
@@ -28,23 +30,24 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
if tweet.media:
for media in tweet.media:
if type(media) == snscrape.modules.twitter.Video:
if type(media) == Video:
variant = max(
[v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
url = variant.url
elif type(media) == snscrape.modules.twitter.Gif:
elif type(media) == Gif:
url = media.variants[0].url
elif type(media) == snscrape.modules.twitter.Photo:
elif type(media) == Photo:
url = media.fullUrl
else:
logger.warning(f"Could not get media URL of {media}")
url = None
if url is not None:
archived_url = self.archive_media(url)
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[url] = archived_url
yield cisticola.base.ScraperResult(
yield ScraperResult(
scraper=self.__version__,
platform="Twitter",
channel=channel.id,
@@ -57,3 +60,16 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
def can_handle(self, channel):
    """Return ``True`` for a Twitter channel with a truthy ``platform_id``.

    Falls through to ``None`` for every other channel.
    """
    if channel.platform != "Twitter":
        return None
    if not channel.platform_id:
        return None
    return True
def url_to_key(self, url: str, content_type: str) -> str:
    """Build the storage key for a Twitter media URL.

    The key is the last path segment of ``url`` plus an extension:

    * ``.jpg`` when the query string contains ``format=jpg``;
    * nothing when the path already ends in ``.mp4``;
    * otherwise ``.<format>`` taken from the ``format`` query parameter,
      or nothing when the URL has no such parameter.

    ``content_type`` is not used.
    """
    parsed_url = urlparse(url)
    formats = parse_qs(parsed_url.query).get('format', [])

    # TODO might require additional statements for other media formats
    if 'jpg' in formats:
        ext = '.jpg'
    elif parsed_url.path.endswith('.mp4'):
        ext = ''
    elif formats:
        # e.g. ?format=png; previously `ext` was never assigned for these
        # URLs and building the key raised UnboundLocalError.
        ext = '.' + formats[0]
    else:
        # No format hint: keep the file name exactly as it appears.
        ext = ''

    return parsed_url.path.split('/')[-1] + ext

View File

@@ -1,16 +1,2 @@
import cisticola.base
class Transformer:
"""Interface class for transformers"""
__version__ = "Transformer 0.0.0"
def __init__(self):
pass
def can_handle(data: cisticola.base.ScraperResult) -> bool:
pass
def transform(data: cisticola.base.ScraperResult) -> cisticola.base.TransformedResult:
pass
from . import base
from .twitter import TwitterTransformer

View File

@@ -0,0 +1,16 @@
from cisticola.base import ScraperResult, TransformedResult
class Transformer:
    """Interface class for transformers

    Subclasses override ``can_handle`` and ``transform``; the versions here
    do nothing and return ``None``.
    """

    __version__ = "Transformer 0.0.0"

    def __init__(self):
        pass

    # Both methods below were declared without `self`, so calling them on an
    # instance raised TypeError and their signatures disagreed with
    # subclasses that define `transform(self, data)`.

    def can_handle(self, data: ScraperResult) -> bool:
        """Report whether this transformer can process ``data``."""
        pass

    def transform(self, data: ScraperResult) -> TransformedResult:
        """Convert ``data`` into a ``TransformedResult``."""
        pass

View File

@@ -1,17 +1,17 @@
import cisticola.transformer
import cisticola.base
import json
from cisticola.transformer.base import Transformer
from cisticola.base import ScraperResult, TransformedResult
class TwitterTransformer(cisticola.transformer.Transformer):
class TwitterTransformer(Transformer):
"""A Twitter specific ScraperResult, with a method ETL/transforming"""
__version__ = "TwitterTransformer 0.0.1"
def transform(self, data: cisticola.base.ScraperResult) -> cisticola.base.TransformedResult:
def transform(self, data: ScraperResult) -> TransformedResult:
raw = json.loads(data.raw_data)
transformed = cisticola.base.TransformedResult(
transformed = TransformedResult(
raw_id=data.id,
scraper=data.scraper,
transformer=self.__version__,

View File

@@ -0,0 +1,131 @@
from sqlalchemy import create_engine
from cisticola.base import Channel
from cisticola.scraper import (
ScraperController,
TelegramSnscrapeScraper)
# One row per Telegram channel to scrape:
# (id, name, platform_id, followers, url, screenname)
_CHANNEL_ROWS = [
    (0, "QAnon Россия", -1001319637748, 94048,
     "https://t.me/qanonrus", "qanonrus"),
    (1, "The Great Awakening | Q", -1001325597521, 5715,
     "https://t.me/greatawakin", "greatawakin"),
    (2, "Великое Пробуждение", -1001285898079, 5861,
     "https://t.me/greatawakeningrus", "greatawakeningrus"),
    (3, "T🕊Редакция Президент Гордон🕊", -1001101170442, 5743,
     "https://t.me/prezidentgordonteam", "prezidentgordonteam"),
    (4, "ПРОЕКТ АВРОРА", -1001279171101, 5930,
     "https://t.me/project_aurora", "project_aurora"),
    (5, "Сон Разума", -1001202338312, 27099,
     "https://t.me/error_288", "error_288"),
    (6, "Пробуждающий Мир - официальный канал", -1001492521207, 19097,
     "https://t.me/promirru", "promirru"),
    (7, "ЦЕЛЬНОЗОР", -1001642737506, 13654,
     "https://t.me/tselnozor", "tselnozor"),
]

# Every channel shares the same category, platform, country and flags.
test_channels = [
    Channel(
        id=row_id,
        name=row_name,
        platform_id=row_platform_id,
        category="Qanon",
        followers=row_followers,
        platform="Telegram",
        url=row_url,
        screenname=row_screenname,
        country="RU",
        influencer=None,
        public=True,
        chat=False,
        notes="")
    for (row_id, row_name, row_platform_id,
         row_followers, row_url, row_screenname) in _CHANNEL_ROWS
]

# Register the Telegram scraper, connect to the SQLite file and scrape.
controller = ScraperController()
telegram = TelegramSnscrapeScraper()
controller.register_scraper(telegram)

engine = create_engine('sqlite:///russian_telegram.db')
controller.connect_to_db(engine)

controller.scrape_channels(test_channels)

13
pytest.ini Normal file
View File

@@ -0,0 +1,13 @@
[pytest]
minversion =
6.0.2
testpaths =
tests/
python_files =
*.py
addopts =
-vvv
--cov='cisticola'
--cov-report html:reports/coverage
--html='reports/tests.html'
--self-contained-html

149
test.py
View File

@@ -1,44 +1,127 @@
import cisticola
import cisticola.scraper.telegram_snscrape
import cisticola.scraper.twitter
import cisticola.scraper.gettr
from sqlalchemy import create_engine
from cisticola.base import Channel
from cisticola.scraper import (
ScraperController,
BitchuteScraper,
GabScraper,
GettrScraper,
OdyseeScraper,
RumbleScraper,
TelegramSnscrapeScraper,
TwitterScraper)
test_channels = [
cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132,
category="test", followers=None, platform="Twitter",
url="https://twitter.com/obtusatum", screenname="obtusatum", country="US",
influencer=None, public=True, chat=False,
notes=""),
cisticola.base.Channel(id=1, name="JQHN SPARTAN", platform_id=-1001181961026,
category="qanon", followers=None, platform="Telegram",
url="https://t.me/jqhnspartan", screenname="jqhnspartan", country="FR",
influencer="JQNH SPARTAN", public=True, chat=False, notes=""),
cisticola.base.Channel(id=2, name="LizardRepublic", platform_id='lizardrepublic',
category="qanon", followers=None, platform="Gettr",
url="https://www.gettr.com/user/lizardrepublic", screenname="lizardrepublic", country="US",
influencer=None, public=True, chat=False, notes=""),
cisticola.base.Channel(id=3, name="Patriot Front", platform_id='OVv9QZL4sEsC',
category="nazi", followers=None, platform="Bitchute",
url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", screenname=None, country="US",
influencer=None, public=True, chat=False, notes=""),]
Channel(
id=0,
name="Logan Williams (test)",
platform_id=891729132,
category="test",
followers=None,
platform="Twitter",
url="https://twitter.com/obtusatum",
screenname="obtusatum",
country="US",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=1,
name="South West Ohio Proud Boys (test)",
platform_id=-1001276612436,
category="test",
followers=None,
platform="Telegram",
url="https://t.me/SouthwestOhioPB",
screenname="SouthwestOhioPB",
country="US",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=2,
name="LizardRepublic (test)",
platform_id='lizardrepublic',
category="test",
followers=None,
platform="Gettr",
url="https://www.gettr.com/user/lizardrepublic",
screenname="lizardrepublic",
country="US",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=4,
name="bestonlinejewelrystoresusa@gmail.com (test)", platform_id='bestonlinejewelrystoresusagmailcom',
category="test",
followers=None,
platform="Bitchute",
url="https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/", screenname=None,
country="US",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=5,
name="Mak1n' Bacon (test)",
platform_id='Mak1nBacon',
category="test",
followers=None,
platform="Odysee",
url="https://odysee.com/@Mak1nBacon",
screenname='Mak1nBacon',
country="US",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=6,
name="Capt. Marc Simon (test)",
platform_id='marc_capt',
category="test",
followers=None,
platform="Gab",
url="https://gab.com/marc_capt",
screenname='marc_capt',
country="CA",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=7,
name="we are uploading videos wow products and problem solving products.please share like and subscribe our channelwe are uploading videos wow products and problem solving products.please share like and subscribe our channel", platform_id='c-916305',
category="test",
followers=None,
platform="Rumble",
url="https://rumble.com/c/c-916305",
screenname='we are uploading',
country="CA",
influencer=None,
public=True,
chat=False,
notes="")]
controller = ScraperController()
controller = cisticola.ScraperController()
scrapers = [
BitchuteScraper(),
GabScraper(),
GettrScraper(),
OdyseeScraper(),
RumbleScraper(),
TelegramSnscrapeScraper(),
TwitterScraper()]
twitter = cisticola.scraper.twitter.TwitterScraper()
controller.register_scraper(twitter)
telegram = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper()
controller.register_scraper(telegram)
gettr = cisticola.scraper.gettr.GettrScraper()
controller.register_scraper(gettr)
controller.register_scrapers(scrapers)
engine = create_engine('sqlite:///test3.db')
controller.connect_to_db(engine)
controller.scrape_channels(test_channels)
controller.scrape_channels(test_channels)

0
tests/__init__.py Normal file
View File

147
tests/conftest.py Normal file
View File

@@ -0,0 +1,147 @@
import pytest
from sqlalchemy import create_engine
from cisticola.scraper import ScraperController
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def _channel_kwargs(channel_id, name, platform_id, platform, url, screenname, country):
    """Build the keyword arguments for one test ``Channel``.

    Only the fields that differ between the test channels are parameters;
    the rest (category, followers, influencer, public, chat, notes) are the
    same for all of them.
    """
    return {
        'id': channel_id,
        'name': name,
        'platform_id': platform_id,
        'category': 'test',
        'followers': None,
        'platform': platform,
        'url': url,
        'screenname': screenname,
        'country': country,
        'influencer': None,
        'public': True,
        'chat': False,
        'notes': ''}


# Channel ids must be unique: all tests share one database, and stored
# results are looked up by channel id.

BITCHUTE_CHANNEL_KWARGS = _channel_kwargs(
    channel_id=0,
    name='bestonlinejewelrystoresusa@gmail.com (test)',
    platform_id='bestonlinejewelrystoresusagmailcom',
    platform='Bitchute',
    url='https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/',
    screenname=None,
    country='US')

GAB_CHANNEL_KWARGS = _channel_kwargs(
    channel_id=1,
    name='Capt. Marc Simon (test)',
    platform_id='marc_capt',
    platform='Gab',
    url='https://gab.com/marc_capt',
    screenname='marc_capt',
    country='CA')

GETTR_CHANNEL_KWARGS = _channel_kwargs(
    channel_id=2,
    name='LizardRepublic (test)',
    platform_id='lizardrepublic',
    platform='Gettr',
    url='https://www.gettr.com/user/lizardrepublic',
    screenname='lizardrepublic',
    country='US')

ODYSEE_CHANNEL_KWARGS = _channel_kwargs(
    channel_id=3,
    name="Mak1n' Bacon (test)",
    platform_id='Mak1nBacon',
    platform='Odysee',
    url='https://odysee.com/@Mak1nBacon',
    screenname='Mak1nBacon',
    country='US')

RUMBLE_CHANNEL_KWARGS = _channel_kwargs(
    channel_id=4,
    name='we are uploading videos wow products',
    platform_id='c-916305',
    platform='Rumble',
    url='https://rumble.com/c/c-916305',
    screenname='we are uploading',
    country='CA')

TELEGRAM_SNSCRAPE_CHANNEL_KWARGS = _channel_kwargs(
    channel_id=5,
    name='South West Ohio Proud Boys (test)',
    platform_id=-1001276612436,
    platform='Telegram',
    url='https://t.me/SouthwestOhioPB',
    screenname='SouthwestOhioPB',
    country='US')

# Was id 5, colliding with the Telegram channel above.
TWITTER_CHANNEL_KWARGS = _channel_kwargs(
    channel_id=6,
    name='Logan Williams (test)',
    platform_id=891729132,
    platform='Twitter',
    url='https://twitter.com/obtusatum',
    screenname='obtusatum',
    country='US')
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
@pytest.fixture(scope='package')
def controller(tmpdir_factory):
    """Provide one ScraperController, bound to a temporary SQLite database
    file, that is shared by all tests in the package.
    """
    db_path = tmpdir_factory.mktemp('test_data').join('test.db')

    shared_controller = ScraperController()
    shared_controller.connect_to_db(create_engine(f'sqlite:///{db_path}'))

    return shared_controller
@pytest.fixture(scope='package')
def channel_kwargs():
    """Map each scraped platform's short name to the keyword arguments used
    to build its test channel.
    """
    return dict(
        bitchute=BITCHUTE_CHANNEL_KWARGS,
        gab=GAB_CHANNEL_KWARGS,
        gettr=GETTR_CHANNEL_KWARGS,
        odysee=ODYSEE_CHANNEL_KWARGS,
        rumble=RUMBLE_CHANNEL_KWARGS,
        telegram_snscrape=TELEGRAM_SNSCRAPE_CHANNEL_KWARGS,
        twitter=TWITTER_CHANNEL_KWARGS)
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -0,0 +1,8 @@
from cisticola.base import Channel
from cisticola.scraper import BitchuteScraper
def test_scrape_bitchute_channel(controller, channel_kwargs):
    """Smoke test: scraping the Bitchute test channel runs without raising."""
    bitchute_channel = Channel(**channel_kwargs['bitchute'])
    controller.register_scraper(BitchuteScraper())
    controller.scrape_channels([bitchute_channel])

8
tests/scraper/gab.py Normal file
View File

@@ -0,0 +1,8 @@
from cisticola.base import Channel
from cisticola.scraper import GabScraper
def test_scrape_gab_channel(controller, channel_kwargs):
    """Smoke test: scraping the Gab test channel runs without raising."""
    gab_channel = Channel(**channel_kwargs['gab'])
    controller.register_scraper(GabScraper())
    controller.scrape_channels([gab_channel])

8
tests/scraper/gettr.py Normal file
View File

@@ -0,0 +1,8 @@
from cisticola.base import Channel
from cisticola.scraper import GettrScraper
def test_scrape_gettr_channel(controller, channel_kwargs):
    """Smoke test: scraping the Gettr test channel runs without raising."""
    gettr_channel = Channel(**channel_kwargs['gettr'])
    controller.register_scraper(GettrScraper())
    controller.scrape_channels([gettr_channel])

8
tests/scraper/odysee.py Normal file
View File

@@ -0,0 +1,8 @@
from cisticola.base import Channel
from cisticola.scraper import OdyseeScraper
def test_scrape_odysee_channel(controller, channel_kwargs):
    """Smoke test: scraping the Odysee test channel runs without raising."""
    odysee_channel = Channel(**channel_kwargs['odysee'])
    controller.register_scraper(OdyseeScraper())
    controller.scrape_channels([odysee_channel])

8
tests/scraper/rumble.py Normal file
View File

@@ -0,0 +1,8 @@
from cisticola.base import Channel
from cisticola.scraper import RumbleScraper
def test_scrape_rumble_channel(controller, channel_kwargs):
    """Smoke test: scraping the Rumble test channel runs without raising."""
    rumble_channel = Channel(**channel_kwargs['rumble'])
    controller.register_scraper(RumbleScraper())
    controller.scrape_channels([rumble_channel])

View File

@@ -0,0 +1,8 @@
from cisticola.base import Channel
from cisticola.scraper import TelegramSnscrapeScraper
def test_scrape_telegram_snscrape_channel(controller, channel_kwargs):
    """Smoke test: scraping the Telegram test channel runs without raising."""
    telegram_channel = Channel(**channel_kwargs['telegram_snscrape'])
    controller.register_scraper(TelegramSnscrapeScraper())
    controller.scrape_channels([telegram_channel])

8
tests/scraper/twitter.py Normal file
View File

@@ -0,0 +1,8 @@
from cisticola.base import Channel
from cisticola.scraper import TwitterScraper
def test_scrape_twitter_channel(controller, channel_kwargs):
    """Smoke test: scraping the Twitter test channel runs without raising."""
    twitter_channel = Channel(**channel_kwargs['twitter'])
    controller.register_scraper(TwitterScraper())
    controller.scrape_channels([twitter_channel])