Merge branch 'main' into channel-db

This commit is contained in:
Logan Williams
2022-03-22 11:49:07 +01:00
committed by GitHub
41 changed files with 970 additions and 313 deletions

View File

@@ -10,7 +10,6 @@ gogettr = "*"
requests = "*"
bs4 = "*"
dateparser = "*"
sphinx = "*"
boto3 = "*"
snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
ffmpeg-python = "*"
@@ -29,6 +28,8 @@ pytest-cov = "*"
pytest-html = "*"
pytest-metadata = "*"
black = "*"
sphinx = "*"
sphinx_rtd_theme = "*"
[requires]
python_version = "3.9"

418
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "d465f2d09a728ee76cb0af521890ecc1e1bce672acbd1caf2e4d01b6567480d5"
"sha256": "e3b96b0ac8c80d4817f9adac4ab171bf4b7e07e80927c7b152a24e8bbdbf7faa"
},
"pipfile-spec": 6,
"requires": {
@@ -16,13 +16,6 @@
]
},
"default": {
"alabaster": {
"hashes": [
"sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359",
"sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02"
],
"version": "==0.7.12"
},
"attrs": {
"hashes": [
"sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",
@@ -31,14 +24,6 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==21.4.0"
},
"babel": {
"hashes": [
"sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9",
"sha256:bc0c176f9f6a994582230df350aa6e05ba2ebe4b3ac317eab29d9be5d2768da0"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.9.1"
},
"beautifulsoup4": {
"hashes": [
"sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf",
@@ -49,19 +34,19 @@
},
"boto3": {
"hashes": [
"sha256:8d6f3c548f0ee03d742f404c96515e7579fc6968135aaa50dd855a046698ff79",
"sha256:d857feb6af9932e1ee3a748060a2cd9fd6043dbbccf66976eda54586597efdc0"
"sha256:76d5b90400c54b25278150768e946edf166acce2c1597c0ecfbebb1dbe9acf2c",
"sha256:7bb2e6506a6ad44d111dd20a5d510374b6958fe989b4ef887109c79d812f926f"
],
"index": "pypi",
"version": "==1.21.18"
"version": "==1.21.19"
},
"botocore": {
"hashes": [
"sha256:7ea8ef1ff7c4882ab59b337662f90ddf5ea860e95e7e209dca593a34ea585b1b",
"sha256:d2da7ccbc5ddd61fe3cd45fcbd3de380d9e3a15bfa8fbfd2d9259a93dcc60c56"
"sha256:5ed2be0e413961134f4c17eab16396d41a5b4b73a637588260c04d20806d52ea",
"sha256:d0d77bce152ca51f3c2cd0f9bf05cb3b623e719406ad58b4c20444e237fe82eb"
],
"markers": "python_version >= '3.6'",
"version": "==1.24.18"
"version": "==1.24.19"
},
"brotli": {
"hashes": [
@@ -169,14 +154,6 @@
"index": "pypi",
"version": "==1.1.0"
},
"docutils": {
"hashes": [
"sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125",
"sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==0.17.1"
},
"ffmpeg-python": {
"hashes": [
"sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127",
@@ -284,22 +261,6 @@
"markers": "python_version >= '3'",
"version": "==3.3"
},
"imagesize": {
"hashes": [
"sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c",
"sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.3.0"
},
"importlib-metadata": {
"hashes": [
"sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6",
"sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539"
],
"markers": "python_version < '3.10'",
"version": "==4.11.3"
},
"iniconfig": {
"hashes": [
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
@@ -314,14 +275,6 @@
"index": "pypi",
"version": "==4.8.4"
},
"jinja2": {
"hashes": [
"sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8",
"sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"
],
"markers": "python_version >= '3.6'",
"version": "==3.0.3"
},
"jmespath": {
"hashes": [
"sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9",
@@ -405,52 +358,6 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==4.8.0"
},
"markupsafe": {
"hashes": [
"sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3",
"sha256:09c86c9643cceb1d87ca08cdc30160d1b7ab49a8a21564868921959bd16441b8",
"sha256:142119fb14a1ef6d758912b25c4e803c3ff66920635c44078666fe7cc3f8f759",
"sha256:1d1fb9b2eec3c9714dd936860850300b51dbaa37404209c8d4cb66547884b7ed",
"sha256:204730fd5fe2fe3b1e9ccadb2bd18ba8712b111dcabce185af0b3b5285a7c989",
"sha256:24c3be29abb6b34052fd26fc7a8e0a49b1ee9d282e3665e8ad09a0a68faee5b3",
"sha256:290b02bab3c9e216da57c1d11d2ba73a9f73a614bbdcc027d299a60cdfabb11a",
"sha256:3028252424c72b2602a323f70fbf50aa80a5d3aa616ea6add4ba21ae9cc9da4c",
"sha256:30c653fde75a6e5eb814d2a0a89378f83d1d3f502ab710904ee585c38888816c",
"sha256:3cace1837bc84e63b3fd2dfce37f08f8c18aeb81ef5cf6bb9b51f625cb4e6cd8",
"sha256:4056f752015dfa9828dce3140dbadd543b555afb3252507348c493def166d454",
"sha256:454ffc1cbb75227d15667c09f164a0099159da0c1f3d2636aa648f12675491ad",
"sha256:598b65d74615c021423bd45c2bc5e9b59539c875a9bdb7e5f2a6b92dfcfc268d",
"sha256:599941da468f2cf22bf90a84f6e2a65524e87be2fce844f96f2dd9a6c9d1e635",
"sha256:5ddea4c352a488b5e1069069f2f501006b1a4362cb906bee9a193ef1245a7a61",
"sha256:62c0285e91414f5c8f621a17b69fc0088394ccdaa961ef469e833dbff64bd5ea",
"sha256:679cbb78914ab212c49c67ba2c7396dc599a8479de51b9a87b174700abd9ea49",
"sha256:6e104c0c2b4cd765b4e83909cde7ec61a1e313f8a75775897db321450e928cce",
"sha256:736895a020e31b428b3382a7887bfea96102c529530299f426bf2e636aacec9e",
"sha256:75bb36f134883fdbe13d8e63b8675f5f12b80bb6627f7714c7d6c5becf22719f",
"sha256:7d2f5d97fcbd004c03df8d8fe2b973fe2b14e7bfeb2cfa012eaa8759ce9a762f",
"sha256:80beaf63ddfbc64a0452b841d8036ca0611e049650e20afcb882f5d3c266d65f",
"sha256:84ad5e29bf8bab3ad70fd707d3c05524862bddc54dc040982b0dbcff36481de7",
"sha256:8da5924cb1f9064589767b0f3fc39d03e3d0fb5aa29e0cb21d43106519bd624a",
"sha256:961eb86e5be7d0973789f30ebcf6caab60b844203f4396ece27310295a6082c7",
"sha256:96de1932237abe0a13ba68b63e94113678c379dca45afa040a17b6e1ad7ed076",
"sha256:a0a0abef2ca47b33fb615b491ce31b055ef2430de52c5b3fb19a4042dbc5cadb",
"sha256:b2a5a856019d2833c56a3dcac1b80fe795c95f401818ea963594b345929dffa7",
"sha256:b8811d48078d1cf2a6863dafb896e68406c5f513048451cd2ded0473133473c7",
"sha256:c532d5ab79be0199fa2658e24a02fce8542df196e60665dd322409a03db6a52c",
"sha256:d3b64c65328cb4cd252c94f83e66e3d7acf8891e60ebf588d7b493a55a1dbf26",
"sha256:d4e702eea4a2903441f2735799d217f4ac1b55f7d8ad96ab7d4e25417cb0827c",
"sha256:d5653619b3eb5cbd35bfba3c12d575db2a74d15e0e1c08bf1db788069d410ce8",
"sha256:d66624f04de4af8bbf1c7f21cc06649c1c69a7f84109179add573ce35e46d448",
"sha256:e67ec74fada3841b8c5f4c4f197bea916025cb9aa3fe5abf7d52b655d042f956",
"sha256:e6f7f3f41faffaea6596da86ecc2389672fa949bd035251eab26dc6697451d05",
"sha256:f02cf7221d5cd915d7fa58ab64f7ee6dd0f6cddbb48683debf5d04ae9b1c2cc1",
"sha256:f0eddfcabd6936558ec020130f932d479930581171368fd728efcfb6ef0dd357",
"sha256:fabbe18087c3d33c5824cb145ffca52eccd053061df1d79d4b66dafa5ad2a5ea",
"sha256:fc3150f85e2dbcf99e65238c842d1cfe69d3e7649b19864c1cc043213d9cd730"
],
"markers": "python_version >= '3.7'",
"version": "==2.1.0"
},
"mutagen": {
"hashes": [
"sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1",
@@ -642,14 +549,6 @@
"git": "https://github.com/smarnach/pyexiftool.git",
"ref": "3db3764895e687d75b42d3ae4e554ca8664a7f6f"
},
"pygments": {
"hashes": [
"sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65",
"sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a"
],
"markers": "python_version >= '3.5'",
"version": "==2.11.2"
},
"pyparsing": {
"hashes": [
"sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea",
@@ -786,6 +685,9 @@
"version": "==2022.3.2"
},
"requests": {
"extras": [
"socks"
],
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
@@ -817,13 +719,6 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.16.0"
},
"snowballstemmer": {
"hashes": [
"sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1",
"sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"
],
"version": "==2.2.0"
},
"snscrape": {
"git": "https://github.com/bellingcat/snscrape.git",
"ref": "de4ebed81f3f6a4bb4c65630daab6ec63784959b"
@@ -836,62 +731,6 @@
"markers": "python_version >= '3.6'",
"version": "==2.3.1"
},
"sphinx": {
"hashes": [
"sha256:5da895959511473857b6d0200f56865ed62c31e8f82dd338063b84ec022701fe",
"sha256:6caad9786055cb1fa22b4a365c1775816b876f91966481765d7d50e9f0dd35cc"
],
"index": "pypi",
"version": "==4.4.0"
},
"sphinxcontrib-applehelp": {
"hashes": [
"sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a",
"sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.2"
},
"sphinxcontrib-devhelp": {
"hashes": [
"sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e",
"sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.2"
},
"sphinxcontrib-htmlhelp": {
"hashes": [
"sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07",
"sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2"
],
"markers": "python_version >= '3.6'",
"version": "==2.0.0"
},
"sphinxcontrib-jsmath": {
"hashes": [
"sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178",
"sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.1"
},
"sphinxcontrib-qthelp": {
"hashes": [
"sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72",
"sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.3"
},
"sphinxcontrib-serializinghtml": {
"hashes": [
"sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd",
"sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952"
],
"markers": "python_version >= '3.5'",
"version": "==1.1.5"
},
"sqlalchemy": {
"hashes": [
"sha256:04164e0063feb7aedd9d073db0fd496edb244be40d46ea1f0d8990815e4b8c34",
@@ -1034,17 +873,16 @@
],
"index": "pypi",
"version": "==2022.3.8.2"
},
"zipp": {
"hashes": [
"sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d",
"sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375"
],
"markers": "python_version >= '3.7'",
"version": "==3.7.0"
}
},
"develop": {
"alabaster": {
"hashes": [
"sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359",
"sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02"
],
"version": "==0.7.12"
},
"attrs": {
"hashes": [
"sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",
@@ -1053,6 +891,29 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==21.4.0"
},
"babel": {
"hashes": [
"sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9",
"sha256:bc0c176f9f6a994582230df350aa6e05ba2ebe4b3ac317eab29d9be5d2768da0"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.9.1"
},
"certifi": {
"hashes": [
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
],
"version": "==2021.10.8"
},
"charset-normalizer": {
"hashes": [
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
"sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
],
"markers": "python_version >= '3'",
"version": "==2.0.12"
},
"coverage": {
"extras": [
"toml"
@@ -1103,6 +964,38 @@
"markers": "python_version >= '3.7'",
"version": "==6.3.2"
},
"docutils": {
"hashes": [
"sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125",
"sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==0.17.1"
},
"idna": {
"hashes": [
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
],
"markers": "python_version >= '3'",
"version": "==3.3"
},
"imagesize": {
"hashes": [
"sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c",
"sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.3.0"
},
"importlib-metadata": {
"hashes": [
"sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6",
"sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539"
],
"markers": "python_version < '3.10'",
"version": "==4.11.3"
},
"iniconfig": {
"hashes": [
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
@@ -1110,6 +1003,60 @@
],
"version": "==1.1.1"
},
"jinja2": {
"hashes": [
"sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8",
"sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"
],
"markers": "python_version >= '3.6'",
"version": "==3.0.3"
},
"markupsafe": {
"hashes": [
"sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003",
"sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88",
"sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5",
"sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7",
"sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a",
"sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603",
"sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1",
"sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135",
"sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247",
"sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6",
"sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601",
"sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77",
"sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02",
"sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e",
"sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63",
"sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f",
"sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980",
"sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b",
"sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812",
"sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff",
"sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96",
"sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1",
"sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925",
"sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a",
"sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6",
"sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e",
"sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f",
"sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4",
"sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f",
"sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3",
"sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c",
"sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a",
"sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417",
"sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a",
"sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a",
"sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37",
"sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452",
"sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933",
"sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a",
"sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"
],
"markers": "python_version >= '3.7'",
"version": "==2.1.1"
},
"packaging": {
"hashes": [
"sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb",
@@ -1134,6 +1081,14 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==1.11.0"
},
"pygments": {
"hashes": [
"sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65",
"sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a"
],
"markers": "python_version >= '3.5'",
"version": "==2.11.2"
},
"pyparsing": {
"hashes": [
"sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea",
@@ -1174,6 +1129,95 @@
"index": "pypi",
"version": "==1.11.0"
},
"pytz": {
"hashes": [
"sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c",
"sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"
],
"version": "==2021.3"
},
"requests": {
"extras": [
"socks"
],
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
],
"index": "pypi",
"version": "==2.27.1"
},
"snowballstemmer": {
"hashes": [
"sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1",
"sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"
],
"version": "==2.2.0"
},
"sphinx": {
"hashes": [
"sha256:5da895959511473857b6d0200f56865ed62c31e8f82dd338063b84ec022701fe",
"sha256:6caad9786055cb1fa22b4a365c1775816b876f91966481765d7d50e9f0dd35cc"
],
"index": "pypi",
"version": "==4.4.0"
},
"sphinx-rtd-theme": {
"hashes": [
"sha256:4d35a56f4508cfee4c4fb604373ede6feae2a306731d533f409ef5c3496fdbd8",
"sha256:eec6d497e4c2195fa0e8b2016b337532b8a699a68bcb22a512870e16925c6a5c"
],
"index": "pypi",
"version": "==1.0.0"
},
"sphinxcontrib-applehelp": {
"hashes": [
"sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a",
"sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.2"
},
"sphinxcontrib-devhelp": {
"hashes": [
"sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e",
"sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.2"
},
"sphinxcontrib-htmlhelp": {
"hashes": [
"sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07",
"sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2"
],
"markers": "python_version >= '3.6'",
"version": "==2.0.0"
},
"sphinxcontrib-jsmath": {
"hashes": [
"sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178",
"sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.1"
},
"sphinxcontrib-qthelp": {
"hashes": [
"sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72",
"sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.3"
},
"sphinxcontrib-serializinghtml": {
"hashes": [
"sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd",
"sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952"
],
"markers": "python_version >= '3.5'",
"version": "==1.1.5"
},
"tomli": {
"hashes": [
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
@@ -1181,6 +1225,22 @@
],
"markers": "python_version >= '3.7'",
"version": "==2.0.1"
},
"urllib3": {
"hashes": [
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.8"
},
"zipp": {
"hashes": [
"sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d",
"sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375"
],
"markers": "python_version >= '3.7'",
"version": "==3.7.0"
}
}
}

View File

@@ -1,33 +1,47 @@
from typing import List
from dataclasses import dataclass
from datetime import datetime
import tempfile
import json
import io
from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean
import pytesseract
import PIL
import io
import exiftool
import json
import os
from .utils import make_request
mapper_registry = registry()
@dataclass
class ScraperResult:
"""A minimally processed result from a scraper"""
"""A minimally processed result from a scraper
"""
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
scraper: str
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
platform: str
#: Foreign key of channel ID that this was scraped from
channel: int
#: String that uniquely identifies the scraped post on the given platform, e.g. ``"1503397267675533313"``
platform_id: str
#: Datetime (relative to UTC) that the scraped post was created at.
date: datetime
#: JSON dump of dict that contains all data scraped for the post.
raw_data: str
#: Datetime (relative to UTC) that the scraped post was archived at.
date_archived: datetime
#: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files.
archived_urls: dict
raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
@@ -40,22 +54,45 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('date_archived', DateTime),
Column('archived_urls', JSON))
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
@dataclass
class Channel:
"""Information about a specific channel to be scraped.
"""
#: Name of channel (different from username because it can be non-unique and contain emojis), e.g. ``"🕊Редакция Президент Гордон🕊"``.
name: str
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
platform_id: str
#: User-specified category for the channel, e.g. ``"explicit_qanon"``.
category: str
#: Name of platform the given channel is on, e.g. ``"Telegram"``.
platform: str
#: URL for the given channel on the platform, e.g. ``"https://t.me/prezidentgordonteam"``
url: str
#: Screen name/username of channel.
screenname: str
#: 2 digit country code for the country of origin for the channel, e.g. ``"RU"``.
country: str = None
#: Name of influencer, if channel belongs to an influencer that operates on multiple platforms.
influencer: str = None
#: Whether or not the channel is publicly-accessible.
public: bool = None
#: Whether or not the channel is a chat (i.e. allows users who are not the channel creator to post/message)
chat: bool = None
#: Any other additional notes about the channel.
notes: str = ""
#: Did the channel come from a researcher or a scraping process?
source: str = None
def hydrate(self):
@@ -82,26 +119,52 @@ mapper_registry.map_imperatively(Channel, channel_table)
@dataclass
class Post:
"""An object with fields for columns in the analysis table"""
#: ID number of the scraped post in the ``raw_data`` table
raw_id: int
#: Platform specific post ID
platform_id: str
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
scraper: str
#: String specifying name and version of transformer used to transform result, e.g. ``"TwitterTransformer 0.0.1"``.
transformer: str
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
platform: str
#: User-specified integer that uniquely identifies a channel, e.g. ``15``.
channel: int
#: Datetime (relative to UTC) that the scraped post was created at.
date: datetime
#: Datetime (relative to UTC) that the scraped post was archived at.
date_archived: datetime
#: URL of the original post
url: str
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
author_id: str
#: Username of author who made post.
author_username: str
#: Text of the original post
content: str
#: The ID of the Channel that the post was forwarded or quoted from
forwarded_from: int = None
#: The ID of the Post that this Post is a reply to or reblog of
reply_to: int = None
def hydrate(self):
pass
post_table = Table('posts', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
@@ -125,39 +188,64 @@ mapper_registry.map_imperatively(Post, post_table)
@dataclass
class Media:
"""Base class for organizing information about a media file.
"""
#: ID number of the media's corresponding scraped post in the ``raw_data`` table.
raw_id: int
#: ID number of the media's corresponding scraped post in the ``analysis`` table.
post: int
#: URL of the original post.
url: str
#: Original URL of the media from the original post.
original_url: str
#: JSON dump of the dict containing metadata information for the media file.
exif: str = None
def get_blob(self):
"""Download media file as bytes blob.
"""
blob = make_request(self.url)
return blob.content
def hydrate(self, blob = None):
"""Download media file as bytes blob and extract data from content.
"""
if blob is None:
blob = self.get_blob()
self.hydrate_exif(blob)
def hydrate_exif(self, blob):
f = open('tmp', 'wb')
f.write(blob)
f.close()
"""Extract Exif metadata from bytes blob.
"""
with exiftool.ExifTool() as et:
exif = et.get_metadata('tmp')
self.exif = json.dumps(exif)
with tempfile.NamedTemporaryFile() as temp_file:
temp_file.write(blob)
os.remove('tmp')
with exiftool.ExifTool() as et:
exif = et.get_metadata(temp_file.name)
self.exif = json.dumps(exif)
@dataclass
class Image(Media):
"""Class for organizing information about an image file.
"""
#: Extracted OCR content from image
ocr: str = None
def hydrate(self, blob=None):
"""Download image file as bytes blob and extract Exif and OCR content
from the image.
"""
if blob is None:
blob = self.get_blob()
@@ -165,25 +253,62 @@ class Image(Media):
self.hydrate_ocr(blob)
def hydrate_ocr(self, blob):
"""Extract OCR (optical character recognition) data from image bytes blob.
"""
image = PIL.Image.open(io.BytesIO(blob))
self.ocr = pytesseract.image_to_string(image)
@dataclass
class Video(Media):
"""Class for organizing information about a video file.
"""
pass
mapper_registry = registry()
raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('scraper', String),
Column('platform', String),
Column('channel', Integer),
Column('platform_id', String),
Column('date', DateTime),
Column('raw_data', String),
Column('date_archived', DateTime),
Column('archived_urls', JSON))
analysis_table = Table('analysis', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('raw_id', Integer, ForeignKey('raw_data.id')),
Column('scraper', String),
Column('transformer', String),
Column('platform', String),
Column('channel', Integer),
Column('date', DateTime),
Column('date_archived', DateTime),
Column('url', String),
Column('author_id', String),
Column('author_username', String),
Column('content', String))
media_table = Table('media', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('type', String),
Column('type', String),
Column('raw_id', Integer, ForeignKey('raw_data.id')),
Column('post', Integer, ForeignKey('posts.id')),
Column('url', String),
Column('original_url', String),
Column('exif', String),
Column('ocr', String)
)
Column('ocr', String))
mapper_registry.map_imperatively(TransformedResult, analysis_table)
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')

View File

@@ -14,29 +14,91 @@ from cisticola.base import Channel, ScraperResult, mapper_registry
from cisticola.utils import make_request
class Scraper:
"""Base class for defining platform-specific scrapers for scraping all posts
from a given channel on that specific platform.
"""
__version__ = "Scraper 0.0.0"
def __init__(self):
self.s3_client = boto3.client('s3',
region_name=os.environ['DO_SPACES_REGION'],
endpoint_url='https://{}.digitaloceanspaces.com'.format(
os.environ['DO_SPACES_REGION']),
aws_access_key_id=os.environ['DO_SPACES_KEY'],
aws_secret_access_key=os.environ['DO_SPACES_SECRET'])
# Initialize client to transfer files to the storage archive
self.s3_client = boto3.client(
service_name='s3',
region_name=os.environ['DO_SPACES_REGION'],
endpoint_url=f'https://{os.environ["DO_SPACES_REGION"]}.digitaloceanspaces.com',
aws_access_key_id=os.environ['DO_SPACES_KEY'],
aws_secret_access_key=os.environ['DO_SPACES_SECRET'])
# Define request headers (necessary to bypass scraping protection
# for several platform scrapers)
self.headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}
pass
def __str__(self):
return self.__version__
def get_username_from_url(self, url: str) -> str:
"""Extract a channel's username from its URL.
Parameters
----------
url: str
URL of the channel on a given platform
e.g. ``"https://twitter.com/EliotHiggins"``
Returns
-------
username: str
Extracted username of the channel.
e.g. ``"EliotHiggins"``
"""
raise NotImplementedError
def url_to_key(self, url: str, content_type: str) -> str:
"""Generate a unique identifier for media from a specified post.
Parameters
----------
url: str
URL of original post.
e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
content_type: str
Content-Type of media.
e.g. ``"image/jpeg"``
Returns
-------
key: str
Unique identifier for the media file from a specified post based on
the original post URL and the media's Content-Type.
"""
key = urlparse(url).path.split('/')[-1]
return key
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
"""Download media file from a specified media file URL.
Parameters
----------
url: str
URL of media file from original post.
e.g. ``"https://pbs.twimg.com/media/FN0j0dYWUAcQxfK?format=png&name=medium"``
key: str or None
Pre-defined unique identifier for the media file.
Returns
-------
blob: bytes
Raw bytes of the downloaded media file.
content_type: str
Content-Type of media.
e.g. ``"image/jpeg"``.
key: str
Unique identifier for the media file.
"""
r = make_request(url, headers = self.headers)
@@ -49,6 +111,27 @@ class Scraper:
return blob, content_type, key
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
"""Download media file from a specified media URL, where the media file
is formatted as an m3u8 playlist, which is then decoded to an mp4 file.
Parameters
----------
url: str
URL of m3u8 playlist file from original post.
e.g. ``"https://media.gettr.com/group47/origin/2022/03/15/01/cbc436c1-1a1a-4b97-671d-c42109f3ec9b/out.m3u8"``
key: str or None
Pre-defined unique identifier for the media file.
Returns
-------
blob: bytes
Raw bytes of the downloaded media file.
content_type: str
Content-Type of media.
e.g. ``"video/mp4"``.
key: str
Unique identifier for the media file.
"""
content_type = 'video/mp4'
ext = '.' + content_type.split('/')[-1]
@@ -71,7 +154,28 @@ class Scraper:
return blob, content_type, key
def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
"""Download media file from a specified media URL, using a fork of
youtube-dl that enables faster downloading.
Parameters
----------
url: str
URL of media file from original post.
e.g. ``"https://rumble.com/embed/vgt7gh/"``
key: str or None
Pre-defined unique identifier for the media file.
Returns
-------
blob: bytes
Raw bytes of the downloaded media file.
content_type: str
Content-Type of media.
e.g. ``"video/mp4"``.
key: str
Unique identifier for the media file.
"""
content_type = 'video/mp4'
with tempfile.TemporaryDirectory() as temp_dir:
@@ -103,6 +207,23 @@ class Scraper:
return blob, content_type, key
def archive_blob(self, blob: bytes, content_type: str, key: str) -> str:
"""Upload raw bytes of a media file to the storage archive.
Parameters
----------
blob: bytes
Raw bytes of the media file to be archived.
content_type: str
Content-Type of media.
e.g. ``"video/mp4"``.
key: str
Unique identifier for the media file.
Returns
-------
archived_url: str
URL specifying the file on the storage archive.
"""
filename = self.__version__.replace(' ', '_') + '/' + key
@@ -114,9 +235,42 @@ class Scraper:
return archived_url
def can_handle(self, channel: Channel) -> bool:
    """Report whether this scraper is able to scrape the given channel.

    This implementation is a placeholder that always raises; concrete
    scrapers provide their own version.

    Parameters
    ----------
    channel: Channel
        Channel being considered for scraping.

    Returns
    -------
    bool
        ``True`` when the scraper can scrape ``channel``, ``False``
        otherwise.

    Raises
    ------
    NotImplementedError
        Always, in this implementation.
    """
    raise NotImplementedError()
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
    """Scrape every post from the given Channel.

    This implementation is a placeholder that always raises; concrete
    scrapers provide their own version.

    Parameters
    ----------
    channel: Channel
        Channel to be scraped.
    since: ScraperResult or None
        The most recent ScraperResult obtained by an earlier scrape, or
        ``None`` when the scraper has not run before.
    archive_media: bool
        When ``True``, media files (images, video, etc.) attached to posts
        are archived. When ``False``, they are not.

    Yields
    ------
    ScraperResult
        Result for a single post/comment from the given Channel.

    Raises
    ------
    NotImplementedError
        Always, in this implementation.
    """
    raise NotImplementedError()
@@ -129,9 +283,13 @@ class ScraperController:
self.session = None
def register_scraper(self, scraper: Scraper):
    """Add one Scraper instance to the end of the controller's scraper list.

    Parameters
    ----------
    scraper: Scraper
        Scraper instance to register.
    """
    registered = self.scrapers
    registered.append(scraper)
def register_scrapers(self, scraper: List[Scraper]):
    """Add several Scraper instances to the end of the controller's scraper list.

    Parameters
    ----------
    scraper: list<Scraper>
        Scraper instances to register, appended in the order given.
        (The parameter name is singular, but a list is expected.)
    """
    registered = self.scrapers
    registered.extend(scraper)
def scrape_all_channels(self, archive_media: bool = True):
@@ -147,6 +305,17 @@ class ScraperController:
@logger.catch(reraise = True)
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
"""Scrape all posts for all specified channels.
Parameters
----------
channels: list<Channel>
List of Channel instances to be scraped
archive_media: bool
If ``True``, any media files (images, video, etc.) from posts are archived.
If ``False``, media files are not archived.
"""
if self.session is None:
logger.error("No DB session")
return
@@ -185,6 +354,9 @@ class ScraperController:
logger.warning(f"No handler found for Channel {channel}")
def connect_to_db(self, engine):
"""Connect the specified SQLAlchemy engine to the controller.
"""
# create tables
mapper_registry.metadata.create_all(bind=engine)
@@ -193,8 +365,8 @@ class ScraperController:
self.session.configure(bind=self.engine)
def reset_db(self):
    """Drop every mapped table from the connected database, then reconnect.

    All tables registered on ``mapper_registry`` are dropped using the
    controller's current engine. The engine is then passed back to
    ``connect_to_db`` so the controller is connected again afterwards.
    """
    mapper_registry.metadata.drop_all(bind=self.engine)
    # Reconnect exactly once; a second identical call would only repeat
    # the same setup work.
    self.connect_to_db(self.engine)

View File

@@ -17,7 +17,7 @@ class BitchuteScraper(Scraper):
library"""
__version__ = "BitchuteScraper 0.0.1"
def get_username_from_url(url):
def get_username_from_url(self, url):
username = url.split('bitchute.com/channel/')[-1].strip('/')
return username
@@ -33,7 +33,7 @@ class BitchuteScraper(Scraper):
detail = 'comments'
username = BitchuteScraper.get_username_from_url(channel.url)
username = self.get_username_from_url(channel.url)
scraper = get_videos_user(session, username, csrftoken, detail)
for post in scraper:
@@ -61,7 +61,7 @@ class BitchuteScraper(Scraper):
archived_urls=archived_urls)
def can_handle(self, channel):
if channel.platform == "Bitchute" and BitchuteScraper.get_username_from_url(channel.url) is not None:
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
return True
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -11,14 +11,14 @@ class GabScraper(Scraper):
"""An implementation of a Scraper for Gab, using GARC library"""
__version__ = "GabScraper 0.0.1"
def get_username_from_url(url):
def get_username_from_url(self, url):
username = url.split('https://gab.com/')[-1]
return username
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
client = Garc(profile = 'main')
username = GabScraper.get_username_from_url(channel.url)
username = self.get_username_from_url(channel.url)
scraper = client.userposts(username)
@@ -52,5 +52,5 @@ class GabScraper(Scraper):
archived_urls=archived_urls)
def can_handle(self, channel):
if channel.platform == "Gab" and GabScraper.get_username_from_url(channel.url) is not None:
if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:
return True

View File

@@ -12,7 +12,7 @@ class GettrScraper(Scraper):
"""An implementation of a Scraper for Gettr, using gogettr library"""
__version__ = "GettrScraper 0.0.1"
def get_username_from_url(url):
def get_username_from_url(self, url):
username = url.split("gettr.com/user/")[1]
if len(username.split("/")) > 1:
return None
@@ -21,7 +21,7 @@ class GettrScraper(Scraper):
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
client = PublicClient()
username = GettrScraper.get_username_from_url(channel.url)
username = self.get_username_from_url(channel.url)
scraper = client.user_activity(username=username, type="posts")
for post in scraper:
@@ -62,7 +62,7 @@ class GettrScraper(Scraper):
archived_urls=archived_urls)
def can_handle(self, channel):
if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None:
if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None:
return True
def url_to_key(self, url: str, content_type: str) -> str:

View File

@@ -18,6 +18,7 @@ CONTENT_TYPES = {
'mp4' : 'video/mp4'}
class InstagramScraper(Scraper):
"""An implementation of a Scraper for Instagram, using instaloader library"""
__version__ = "InstagramScraper 0.0.1"
def get_username_from_url(self, url):

View File

@@ -13,7 +13,7 @@ class OdyseeScraper(Scraper):
"""An implementation of a Scraper for Odysee, using polyphemus library"""
__version__ = "OdyseeScraper 0.0.1"
def get_username_from_url(url):
def get_username_from_url(self, url):
username = url.split('odysee.com/')[-1].strip('@').split(':')[0]
@@ -21,7 +21,7 @@ class OdyseeScraper(Scraper):
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = OdyseeScraper.get_username_from_url(channel.url)
username = self.get_username_from_url(channel.url)
odysee_channel = OdyseeChannel(channel_name = username)
all_videos = odysee_channel.get_all_videos()
@@ -70,7 +70,7 @@ class OdyseeScraper(Scraper):
archived_urls={})
def can_handle(self, channel):
if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None:
if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None:
return True
def url_to_key(self, url: str, content_type: str) -> str:

View File

@@ -14,14 +14,14 @@ class RumbleScraper(Scraper):
"""An implementation of a Scraper for Rumble, using custom functions"""
__version__ = "RumbleScraper 0.0.1"
def get_username_from_url(url):
def get_username_from_url(self, url):
username = url.split('https://rumble.com/c/')[1]
return username
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = RumbleScraper.get_username_from_url(channel.url)
username = self.get_username_from_url(channel.url)
scraper = get_channel_videos(username)
for post in scraper:
@@ -54,7 +54,7 @@ class RumbleScraper(Scraper):
return key
def can_handle(self, channel):
if channel.platform == "Rumble" and RumbleScraper.get_username_from_url(channel.url) is not None:
if channel.platform == "Rumble" and self.get_username_from_url(channel.url) is not None:
return True
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -8,6 +8,7 @@ from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
class TelegramSnscrapeScraper(Scraper):
"""An implementation of a Scraper for Telegram, using snscrape library"""
__version__ = "TelegramSnscrapeScraper 0.0.1"
def can_handle(self, channel):

View File

@@ -14,6 +14,7 @@ from cisticola.scraper.base import Scraper
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
class TelegramTelethonScraper(Scraper):
"""An implementation of a Scraper for Telegram, using Telethon library"""
__version__ = "TelegramTelethonScraper 0.0.1"
def get_username_from_url(self, url):
@@ -30,9 +31,9 @@ class TelegramTelethonScraper(Scraper):
username = self.get_username_from_url(channel.url)
api_id = os.environ['TELEGRAM_API_ID_1']
api_hash = os.environ['TELEGRAM_API_HASH_1']
phone = os.environ['TELEGRAM_PHONE_1']
api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH']
phone = os.environ['TELEGRAM_PHONE']
with TelegramClient(phone, api_id, api_hash) as client:

View File

@@ -8,12 +8,24 @@ SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
SPHINXAPIDOC = sphinx-apidoc
APIDOCFLAGS = --separate --private --module-first
MODULEPATH = ../cisticola
SOURCEFILES = cisticola.*
MODULEFILE = modules.rst
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Custom process and flags for generating Sphinx sources.
# Stale generated sources are removed first, then sphinx-apidoc regenerates
# them, and finally the generated module index file is removed.
# "rm -f" keeps the target from aborting when there is nothing to delete
# (for example on a first run, before any sources have been generated).
.PHONY: apidoc
apidoc:
	rm -f $(SOURCEDIR)/$(SOURCEFILES)
	$(SPHINXAPIDOC) $(APIDOCFLAGS) -o "$(SOURCEDIR)" "$(MODULEPATH)"
	rm -f $(SOURCEDIR)/$(MODULEFILE)
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 7.0 KiB

BIN
docs/images/favicon.ico Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 614 B

View File

@@ -10,6 +10,12 @@ if "%SPHINXBUILD%" == "" (
set SOURCEDIR=source
set BUILDDIR=build
set SPHINXAPIDOC=sphinx-apidoc
set APIDOCFLAGS=--separate --private --module-first
set MODULEPATH=../cisticola
set SOURCEFILES=cisticola.*
set MODULEFILE=modules.rst
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
@@ -28,6 +34,11 @@ if errorlevel 9009 (
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:apidoc
rem Remove stale generated sources, regenerate them with sphinx-apidoc,
rem then remove the generated module index file.
del %SOURCEDIR%\%SOURCEFILES%
%SPHINXAPIDOC% %APIDOCFLAGS% -o %SOURCEDIR% %MODULEPATH%
del %SOURCEDIR%\%MODULEFILE%
rem Jump past the :help section that follows; without this, execution falls
rem through and prints the Sphinx help text after regenerating the sources.
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

View File

@@ -0,0 +1,8 @@
cisticola.base module
=====================
.. automodule:: cisticola.base
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -1,6 +1,12 @@
cisticola package
=================
.. automodule:: cisticola
:members:
:undoc-members:
:show-inheritance:
:private-members:
Subpackages
-----------
@@ -13,18 +19,8 @@ Subpackages
Submodules
----------
cisticola.base module
---------------------
.. toctree::
:maxdepth: 4
.. automodule:: cisticola.base
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: cisticola
:members:
:undoc-members:
:show-inheritance:
cisticola.base
cisticola.utils

View File

@@ -0,0 +1,8 @@
cisticola.scraper.base module
=============================
.. automodule:: cisticola.scraper.base
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.bitchute module
=================================
.. automodule:: cisticola.scraper.bitchute
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.gab module
============================
.. automodule:: cisticola.scraper.gab
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.gettr module
==============================
.. automodule:: cisticola.scraper.gettr
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.instagram module
==================================
.. automodule:: cisticola.scraper.instagram
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.odysee module
===============================
.. automodule:: cisticola.scraper.odysee
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -1,37 +1,27 @@
cisticola.scraper package
=========================
Submodules
----------
cisticola.scraper.bitchute module
---------------------------------
.. automodule:: cisticola.scraper.bitchute
:members:
:undoc-members:
:show-inheritance:
cisticola.scraper.gettr module
------------------------------
.. automodule:: cisticola.scraper.gettr
:members:
:undoc-members:
:show-inheritance:
cisticola.scraper.twitter module
--------------------------------
.. automodule:: cisticola.scraper.twitter
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: cisticola.scraper
:members:
:undoc-members:
:show-inheritance:
:private-members:
Submodules
----------
.. toctree::
:maxdepth: 4
cisticola.scraper.base
cisticola.scraper.bitchute
cisticola.scraper.gab
cisticola.scraper.gettr
cisticola.scraper.instagram
cisticola.scraper.odysee
cisticola.scraper.rumble
cisticola.scraper.telegram_snscrape
cisticola.scraper.telegram_telethon
cisticola.scraper.twitter
cisticola.scraper.vkontakte
cisticola.scraper.youtube

View File

@@ -0,0 +1,8 @@
cisticola.scraper.rumble module
===============================
.. automodule:: cisticola.scraper.rumble
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.telegram\_snscrape module
===========================================
.. automodule:: cisticola.scraper.telegram_snscrape
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.telegram\_telethon module
===========================================
.. automodule:: cisticola.scraper.telegram_telethon
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.twitter module
================================
.. automodule:: cisticola.scraper.twitter
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.vkontakte module
==================================
.. automodule:: cisticola.scraper.vkontakte
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.youtube module
================================
.. automodule:: cisticola.scraper.youtube
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.transformer.base module
=================================
.. automodule:: cisticola.transformer.base
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.transformer.bitchute module
=====================================
.. automodule:: cisticola.transformer.bitchute
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -1,21 +1,18 @@
cisticola.transformer package
=============================
Submodules
----------
cisticola.transformer.twitter module
------------------------------------
.. automodule:: cisticola.transformer.twitter
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: cisticola.transformer
:members:
:undoc-members:
:show-inheritance:
:private-members:
Submodules
----------
.. toctree::
:maxdepth: 4
cisticola.transformer.base
cisticola.transformer.bitchute
cisticola.transformer.twitter

View File

@@ -0,0 +1,8 @@
cisticola.transformer.twitter module
====================================
.. automodule:: cisticola.transformer.twitter
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.utils module
======================
.. automodule:: cisticola.utils
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -43,9 +43,18 @@ exclude_patterns = []
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'
html_theme = 'sphinx_rtd_theme'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = []
# -- Default flags for autodoc------------------------------------------------
autodoc_default_options = {'exclude-members': '_sa_class_manager'}
html_favicon = '../images/favicon.ico'
html_logo = '../images/cisticola_logo.svg'
html_theme_options = {'style_nav_header_background': '#000000'}

View File

@@ -2,16 +2,7 @@ Welcome to Cisticola's documentation!
=====================================
.. toctree::
:maxdepth: 2
:caption: Contents:
:maxdepth: 1
modules
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
quickstart
cisticola

View File

@@ -1,7 +0,0 @@
cisticola
=========
.. toctree::
:maxdepth: 4
cisticola

View File

@@ -0,0 +1,96 @@
Quickstart
==========
Installation
------------
The *cisticola* application uses pipenv_ for dependency management. To install the dependencies of *cisticola*, first install pipenv using the following command:
.. code-block::
pip install pipenv
and then install the dependencies using the following command from the package root directory:
.. code-block::
pipenv install
To install the necessary dependencies for building the documentation and running unit tests, run the following command from the package root directory:
.. code-block::
pipenv install --dev
Environment Variables
---------------------
Three of the scrapers in *cisticola* (:py:class:`~cisticola.scraper.gab.GabScraper`, :py:class:`~cisticola.scraper.instagram.InstagramScraper`, and :py:class:`~cisticola.scraper.telegram_telethon.TelegramTelethonScraper`) require platform credentials to work correctly.
Gab
"""
The Gab credentials can be configured by running the following command from the root directory:
.. code-block::
pipenv run garc configure
which will direct you to provide the username and password for your Gab account.
Instagram
"""""""""
The Instagram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment:
- ``INSTAGRAM_USERNAME``: username of your Instagram account
- ``INSTAGRAM_PASSWORD``: password of your Instagram account
Telegram Telethon
"""""""""""""""""
The Telegram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment:
- ``TELEGRAM_API_ID``: API ID number for your Telegram application
- ``TELEGRAM_API_HASH``: API hash for your Telegram application
- ``TELEGRAM_PHONE``: phone number for the account corresponding to your Telegram application
If you do not already have a Telegram application, you can create one by following the instructions on `this page`_.
Documentation
-------------
The *cisticola* application uses Sphinx_ to generate and display its documentation. To build the documentation in the HTML format, run the following command from the ``docs/`` directory:
.. code-block::
pipenv run make html
For developers, if changes are made to the package structure or additional modules are created, you can update the Sphinx source ``*.rst`` files by running the following command from the ``docs/`` directory:
.. code-block::
pipenv run make apidoc
Testing
-------
The *cisticola* application uses pytest_ for unit testing. To run the test suite, run the following command from the package root directory:
.. code-block::
pipenv run pytest
Examples
--------
An example of a *cisticola* ingest file, ``russian_telegram_ingest.py``, is included in the package root directory. It shows how the list of channels to scrape is defined, and how the :py:class:`~cisticola.scraper.base.ScraperController` and :py:class:`~cisticola.transformer.base.Transformer` classes are used. To run the ingest script, run the following command from the package root directory:
.. code-block::
pipenv run python russian_telegram_ingest.py
.. _pipenv: https://pipenv.pypa.io/en/latest/
.. _Sphinx: https://www.sphinx-doc.org/en/master/
.. _pytest: https://docs.pytest.org/en/7.1.x/
.. _this page: https://core.telegram.org/api/obtaining_api_id

View File

@@ -1,6 +1,6 @@
[pytest]
minversion =
6.0.2
7.0.0
testpaths =
tests/
python_files =
@@ -13,4 +13,5 @@ addopts =
--self-contained-html
filterwarnings =
ignore:the imp module is deprecated:DeprecationWarning
ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
ignore:invalid escape sequence:DeprecationWarning