mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-10 20:38:33 +03:00
Merge branch 'main' into channel-db
This commit is contained in:
3
Pipfile
3
Pipfile
@@ -10,7 +10,6 @@ gogettr = "*"
|
||||
requests = "*"
|
||||
bs4 = "*"
|
||||
dateparser = "*"
|
||||
sphinx = "*"
|
||||
boto3 = "*"
|
||||
snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
|
||||
ffmpeg-python = "*"
|
||||
@@ -29,6 +28,8 @@ pytest-cov = "*"
|
||||
pytest-html = "*"
|
||||
pytest-metadata = "*"
|
||||
black = "*"
|
||||
sphinx = "*"
|
||||
sphinx_rtd_theme = "*"
|
||||
|
||||
[requires]
|
||||
python_version = "3.9"
|
||||
|
||||
418
Pipfile.lock
generated
418
Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "d465f2d09a728ee76cb0af521890ecc1e1bce672acbd1caf2e4d01b6567480d5"
|
||||
"sha256": "e3b96b0ac8c80d4817f9adac4ab171bf4b7e07e80927c7b152a24e8bbdbf7faa"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@@ -16,13 +16,6 @@
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"alabaster": {
|
||||
"hashes": [
|
||||
"sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359",
|
||||
"sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02"
|
||||
],
|
||||
"version": "==0.7.12"
|
||||
},
|
||||
"attrs": {
|
||||
"hashes": [
|
||||
"sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",
|
||||
@@ -31,14 +24,6 @@
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==21.4.0"
|
||||
},
|
||||
"babel": {
|
||||
"hashes": [
|
||||
"sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9",
|
||||
"sha256:bc0c176f9f6a994582230df350aa6e05ba2ebe4b3ac317eab29d9be5d2768da0"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.9.1"
|
||||
},
|
||||
"beautifulsoup4": {
|
||||
"hashes": [
|
||||
"sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf",
|
||||
@@ -49,19 +34,19 @@
|
||||
},
|
||||
"boto3": {
|
||||
"hashes": [
|
||||
"sha256:8d6f3c548f0ee03d742f404c96515e7579fc6968135aaa50dd855a046698ff79",
|
||||
"sha256:d857feb6af9932e1ee3a748060a2cd9fd6043dbbccf66976eda54586597efdc0"
|
||||
"sha256:76d5b90400c54b25278150768e946edf166acce2c1597c0ecfbebb1dbe9acf2c",
|
||||
"sha256:7bb2e6506a6ad44d111dd20a5d510374b6958fe989b4ef887109c79d812f926f"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.21.18"
|
||||
"version": "==1.21.19"
|
||||
},
|
||||
"botocore": {
|
||||
"hashes": [
|
||||
"sha256:7ea8ef1ff7c4882ab59b337662f90ddf5ea860e95e7e209dca593a34ea585b1b",
|
||||
"sha256:d2da7ccbc5ddd61fe3cd45fcbd3de380d9e3a15bfa8fbfd2d9259a93dcc60c56"
|
||||
"sha256:5ed2be0e413961134f4c17eab16396d41a5b4b73a637588260c04d20806d52ea",
|
||||
"sha256:d0d77bce152ca51f3c2cd0f9bf05cb3b623e719406ad58b4c20444e237fe82eb"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.24.18"
|
||||
"version": "==1.24.19"
|
||||
},
|
||||
"brotli": {
|
||||
"hashes": [
|
||||
@@ -169,14 +154,6 @@
|
||||
"index": "pypi",
|
||||
"version": "==1.1.0"
|
||||
},
|
||||
"docutils": {
|
||||
"hashes": [
|
||||
"sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125",
|
||||
"sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==0.17.1"
|
||||
},
|
||||
"ffmpeg-python": {
|
||||
"hashes": [
|
||||
"sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127",
|
||||
@@ -284,22 +261,6 @@
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==3.3"
|
||||
},
|
||||
"imagesize": {
|
||||
"hashes": [
|
||||
"sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c",
|
||||
"sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.3.0"
|
||||
},
|
||||
"importlib-metadata": {
|
||||
"hashes": [
|
||||
"sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6",
|
||||
"sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539"
|
||||
],
|
||||
"markers": "python_version < '3.10'",
|
||||
"version": "==4.11.3"
|
||||
},
|
||||
"iniconfig": {
|
||||
"hashes": [
|
||||
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
|
||||
@@ -314,14 +275,6 @@
|
||||
"index": "pypi",
|
||||
"version": "==4.8.4"
|
||||
},
|
||||
"jinja2": {
|
||||
"hashes": [
|
||||
"sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8",
|
||||
"sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.0.3"
|
||||
},
|
||||
"jmespath": {
|
||||
"hashes": [
|
||||
"sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9",
|
||||
@@ -405,52 +358,6 @@
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==4.8.0"
|
||||
},
|
||||
"markupsafe": {
|
||||
"hashes": [
|
||||
"sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3",
|
||||
"sha256:09c86c9643cceb1d87ca08cdc30160d1b7ab49a8a21564868921959bd16441b8",
|
||||
"sha256:142119fb14a1ef6d758912b25c4e803c3ff66920635c44078666fe7cc3f8f759",
|
||||
"sha256:1d1fb9b2eec3c9714dd936860850300b51dbaa37404209c8d4cb66547884b7ed",
|
||||
"sha256:204730fd5fe2fe3b1e9ccadb2bd18ba8712b111dcabce185af0b3b5285a7c989",
|
||||
"sha256:24c3be29abb6b34052fd26fc7a8e0a49b1ee9d282e3665e8ad09a0a68faee5b3",
|
||||
"sha256:290b02bab3c9e216da57c1d11d2ba73a9f73a614bbdcc027d299a60cdfabb11a",
|
||||
"sha256:3028252424c72b2602a323f70fbf50aa80a5d3aa616ea6add4ba21ae9cc9da4c",
|
||||
"sha256:30c653fde75a6e5eb814d2a0a89378f83d1d3f502ab710904ee585c38888816c",
|
||||
"sha256:3cace1837bc84e63b3fd2dfce37f08f8c18aeb81ef5cf6bb9b51f625cb4e6cd8",
|
||||
"sha256:4056f752015dfa9828dce3140dbadd543b555afb3252507348c493def166d454",
|
||||
"sha256:454ffc1cbb75227d15667c09f164a0099159da0c1f3d2636aa648f12675491ad",
|
||||
"sha256:598b65d74615c021423bd45c2bc5e9b59539c875a9bdb7e5f2a6b92dfcfc268d",
|
||||
"sha256:599941da468f2cf22bf90a84f6e2a65524e87be2fce844f96f2dd9a6c9d1e635",
|
||||
"sha256:5ddea4c352a488b5e1069069f2f501006b1a4362cb906bee9a193ef1245a7a61",
|
||||
"sha256:62c0285e91414f5c8f621a17b69fc0088394ccdaa961ef469e833dbff64bd5ea",
|
||||
"sha256:679cbb78914ab212c49c67ba2c7396dc599a8479de51b9a87b174700abd9ea49",
|
||||
"sha256:6e104c0c2b4cd765b4e83909cde7ec61a1e313f8a75775897db321450e928cce",
|
||||
"sha256:736895a020e31b428b3382a7887bfea96102c529530299f426bf2e636aacec9e",
|
||||
"sha256:75bb36f134883fdbe13d8e63b8675f5f12b80bb6627f7714c7d6c5becf22719f",
|
||||
"sha256:7d2f5d97fcbd004c03df8d8fe2b973fe2b14e7bfeb2cfa012eaa8759ce9a762f",
|
||||
"sha256:80beaf63ddfbc64a0452b841d8036ca0611e049650e20afcb882f5d3c266d65f",
|
||||
"sha256:84ad5e29bf8bab3ad70fd707d3c05524862bddc54dc040982b0dbcff36481de7",
|
||||
"sha256:8da5924cb1f9064589767b0f3fc39d03e3d0fb5aa29e0cb21d43106519bd624a",
|
||||
"sha256:961eb86e5be7d0973789f30ebcf6caab60b844203f4396ece27310295a6082c7",
|
||||
"sha256:96de1932237abe0a13ba68b63e94113678c379dca45afa040a17b6e1ad7ed076",
|
||||
"sha256:a0a0abef2ca47b33fb615b491ce31b055ef2430de52c5b3fb19a4042dbc5cadb",
|
||||
"sha256:b2a5a856019d2833c56a3dcac1b80fe795c95f401818ea963594b345929dffa7",
|
||||
"sha256:b8811d48078d1cf2a6863dafb896e68406c5f513048451cd2ded0473133473c7",
|
||||
"sha256:c532d5ab79be0199fa2658e24a02fce8542df196e60665dd322409a03db6a52c",
|
||||
"sha256:d3b64c65328cb4cd252c94f83e66e3d7acf8891e60ebf588d7b493a55a1dbf26",
|
||||
"sha256:d4e702eea4a2903441f2735799d217f4ac1b55f7d8ad96ab7d4e25417cb0827c",
|
||||
"sha256:d5653619b3eb5cbd35bfba3c12d575db2a74d15e0e1c08bf1db788069d410ce8",
|
||||
"sha256:d66624f04de4af8bbf1c7f21cc06649c1c69a7f84109179add573ce35e46d448",
|
||||
"sha256:e67ec74fada3841b8c5f4c4f197bea916025cb9aa3fe5abf7d52b655d042f956",
|
||||
"sha256:e6f7f3f41faffaea6596da86ecc2389672fa949bd035251eab26dc6697451d05",
|
||||
"sha256:f02cf7221d5cd915d7fa58ab64f7ee6dd0f6cddbb48683debf5d04ae9b1c2cc1",
|
||||
"sha256:f0eddfcabd6936558ec020130f932d479930581171368fd728efcfb6ef0dd357",
|
||||
"sha256:fabbe18087c3d33c5824cb145ffca52eccd053061df1d79d4b66dafa5ad2a5ea",
|
||||
"sha256:fc3150f85e2dbcf99e65238c842d1cfe69d3e7649b19864c1cc043213d9cd730"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.1.0"
|
||||
},
|
||||
"mutagen": {
|
||||
"hashes": [
|
||||
"sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1",
|
||||
@@ -642,14 +549,6 @@
|
||||
"git": "https://github.com/smarnach/pyexiftool.git",
|
||||
"ref": "3db3764895e687d75b42d3ae4e554ca8664a7f6f"
|
||||
},
|
||||
"pygments": {
|
||||
"hashes": [
|
||||
"sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65",
|
||||
"sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==2.11.2"
|
||||
},
|
||||
"pyparsing": {
|
||||
"hashes": [
|
||||
"sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea",
|
||||
@@ -786,6 +685,9 @@
|
||||
"version": "==2022.3.2"
|
||||
},
|
||||
"requests": {
|
||||
"extras": [
|
||||
"socks"
|
||||
],
|
||||
"hashes": [
|
||||
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
|
||||
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
|
||||
@@ -817,13 +719,6 @@
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.16.0"
|
||||
},
|
||||
"snowballstemmer": {
|
||||
"hashes": [
|
||||
"sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1",
|
||||
"sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"
|
||||
],
|
||||
"version": "==2.2.0"
|
||||
},
|
||||
"snscrape": {
|
||||
"git": "https://github.com/bellingcat/snscrape.git",
|
||||
"ref": "de4ebed81f3f6a4bb4c65630daab6ec63784959b"
|
||||
@@ -836,62 +731,6 @@
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.3.1"
|
||||
},
|
||||
"sphinx": {
|
||||
"hashes": [
|
||||
"sha256:5da895959511473857b6d0200f56865ed62c31e8f82dd338063b84ec022701fe",
|
||||
"sha256:6caad9786055cb1fa22b4a365c1775816b876f91966481765d7d50e9f0dd35cc"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==4.4.0"
|
||||
},
|
||||
"sphinxcontrib-applehelp": {
|
||||
"hashes": [
|
||||
"sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a",
|
||||
"sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==1.0.2"
|
||||
},
|
||||
"sphinxcontrib-devhelp": {
|
||||
"hashes": [
|
||||
"sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e",
|
||||
"sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==1.0.2"
|
||||
},
|
||||
"sphinxcontrib-htmlhelp": {
|
||||
"hashes": [
|
||||
"sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07",
|
||||
"sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.0.0"
|
||||
},
|
||||
"sphinxcontrib-jsmath": {
|
||||
"hashes": [
|
||||
"sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178",
|
||||
"sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==1.0.1"
|
||||
},
|
||||
"sphinxcontrib-qthelp": {
|
||||
"hashes": [
|
||||
"sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72",
|
||||
"sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==1.0.3"
|
||||
},
|
||||
"sphinxcontrib-serializinghtml": {
|
||||
"hashes": [
|
||||
"sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd",
|
||||
"sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==1.1.5"
|
||||
},
|
||||
"sqlalchemy": {
|
||||
"hashes": [
|
||||
"sha256:04164e0063feb7aedd9d073db0fd496edb244be40d46ea1f0d8990815e4b8c34",
|
||||
@@ -1034,17 +873,16 @@
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2022.3.8.2"
|
||||
},
|
||||
"zipp": {
|
||||
"hashes": [
|
||||
"sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d",
|
||||
"sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==3.7.0"
|
||||
}
|
||||
},
|
||||
"develop": {
|
||||
"alabaster": {
|
||||
"hashes": [
|
||||
"sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359",
|
||||
"sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02"
|
||||
],
|
||||
"version": "==0.7.12"
|
||||
},
|
||||
"attrs": {
|
||||
"hashes": [
|
||||
"sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",
|
||||
@@ -1053,6 +891,29 @@
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==21.4.0"
|
||||
},
|
||||
"babel": {
|
||||
"hashes": [
|
||||
"sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9",
|
||||
"sha256:bc0c176f9f6a994582230df350aa6e05ba2ebe4b3ac317eab29d9be5d2768da0"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.9.1"
|
||||
},
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
|
||||
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
|
||||
],
|
||||
"version": "==2021.10.8"
|
||||
},
|
||||
"charset-normalizer": {
|
||||
"hashes": [
|
||||
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
|
||||
"sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==2.0.12"
|
||||
},
|
||||
"coverage": {
|
||||
"extras": [
|
||||
"toml"
|
||||
@@ -1103,6 +964,38 @@
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==6.3.2"
|
||||
},
|
||||
"docutils": {
|
||||
"hashes": [
|
||||
"sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125",
|
||||
"sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==0.17.1"
|
||||
},
|
||||
"idna": {
|
||||
"hashes": [
|
||||
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
|
||||
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==3.3"
|
||||
},
|
||||
"imagesize": {
|
||||
"hashes": [
|
||||
"sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c",
|
||||
"sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.3.0"
|
||||
},
|
||||
"importlib-metadata": {
|
||||
"hashes": [
|
||||
"sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6",
|
||||
"sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539"
|
||||
],
|
||||
"markers": "python_version < '3.10'",
|
||||
"version": "==4.11.3"
|
||||
},
|
||||
"iniconfig": {
|
||||
"hashes": [
|
||||
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
|
||||
@@ -1110,6 +1003,60 @@
|
||||
],
|
||||
"version": "==1.1.1"
|
||||
},
|
||||
"jinja2": {
|
||||
"hashes": [
|
||||
"sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8",
|
||||
"sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.0.3"
|
||||
},
|
||||
"markupsafe": {
|
||||
"hashes": [
|
||||
"sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003",
|
||||
"sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88",
|
||||
"sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5",
|
||||
"sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7",
|
||||
"sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a",
|
||||
"sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603",
|
||||
"sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1",
|
||||
"sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135",
|
||||
"sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247",
|
||||
"sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6",
|
||||
"sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601",
|
||||
"sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77",
|
||||
"sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02",
|
||||
"sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e",
|
||||
"sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63",
|
||||
"sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f",
|
||||
"sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980",
|
||||
"sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b",
|
||||
"sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812",
|
||||
"sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff",
|
||||
"sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96",
|
||||
"sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1",
|
||||
"sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925",
|
||||
"sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a",
|
||||
"sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6",
|
||||
"sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e",
|
||||
"sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f",
|
||||
"sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4",
|
||||
"sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f",
|
||||
"sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3",
|
||||
"sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c",
|
||||
"sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a",
|
||||
"sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417",
|
||||
"sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a",
|
||||
"sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a",
|
||||
"sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37",
|
||||
"sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452",
|
||||
"sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933",
|
||||
"sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a",
|
||||
"sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.1.1"
|
||||
},
|
||||
"packaging": {
|
||||
"hashes": [
|
||||
"sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb",
|
||||
@@ -1134,6 +1081,14 @@
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==1.11.0"
|
||||
},
|
||||
"pygments": {
|
||||
"hashes": [
|
||||
"sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65",
|
||||
"sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==2.11.2"
|
||||
},
|
||||
"pyparsing": {
|
||||
"hashes": [
|
||||
"sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea",
|
||||
@@ -1174,6 +1129,95 @@
|
||||
"index": "pypi",
|
||||
"version": "==1.11.0"
|
||||
},
|
||||
"pytz": {
|
||||
"hashes": [
|
||||
"sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c",
|
||||
"sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"
|
||||
],
|
||||
"version": "==2021.3"
|
||||
},
|
||||
"requests": {
|
||||
"extras": [
|
||||
"socks"
|
||||
],
|
||||
"hashes": [
|
||||
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
|
||||
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.27.1"
|
||||
},
|
||||
"snowballstemmer": {
|
||||
"hashes": [
|
||||
"sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1",
|
||||
"sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"
|
||||
],
|
||||
"version": "==2.2.0"
|
||||
},
|
||||
"sphinx": {
|
||||
"hashes": [
|
||||
"sha256:5da895959511473857b6d0200f56865ed62c31e8f82dd338063b84ec022701fe",
|
||||
"sha256:6caad9786055cb1fa22b4a365c1775816b876f91966481765d7d50e9f0dd35cc"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==4.4.0"
|
||||
},
|
||||
"sphinx-rtd-theme": {
|
||||
"hashes": [
|
||||
"sha256:4d35a56f4508cfee4c4fb604373ede6feae2a306731d533f409ef5c3496fdbd8",
|
||||
"sha256:eec6d497e4c2195fa0e8b2016b337532b8a699a68bcb22a512870e16925c6a5c"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.0.0"
|
||||
},
|
||||
"sphinxcontrib-applehelp": {
|
||||
"hashes": [
|
||||
"sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a",
|
||||
"sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==1.0.2"
|
||||
},
|
||||
"sphinxcontrib-devhelp": {
|
||||
"hashes": [
|
||||
"sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e",
|
||||
"sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==1.0.2"
|
||||
},
|
||||
"sphinxcontrib-htmlhelp": {
|
||||
"hashes": [
|
||||
"sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07",
|
||||
"sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.0.0"
|
||||
},
|
||||
"sphinxcontrib-jsmath": {
|
||||
"hashes": [
|
||||
"sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178",
|
||||
"sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==1.0.1"
|
||||
},
|
||||
"sphinxcontrib-qthelp": {
|
||||
"hashes": [
|
||||
"sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72",
|
||||
"sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==1.0.3"
|
||||
},
|
||||
"sphinxcontrib-serializinghtml": {
|
||||
"hashes": [
|
||||
"sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd",
|
||||
"sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==1.1.5"
|
||||
},
|
||||
"tomli": {
|
||||
"hashes": [
|
||||
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
|
||||
@@ -1181,6 +1225,22 @@
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.0.1"
|
||||
},
|
||||
"urllib3": {
|
||||
"hashes": [
|
||||
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
|
||||
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
|
||||
"version": "==1.26.8"
|
||||
},
|
||||
"zipp": {
|
||||
"hashes": [
|
||||
"sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d",
|
||||
"sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==3.7.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,33 +1,47 @@
|
||||
from typing import List
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
import tempfile
|
||||
import json
|
||||
import io
|
||||
|
||||
from sqlalchemy.orm import registry
|
||||
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean
|
||||
import pytesseract
|
||||
import PIL
|
||||
import io
|
||||
import exiftool
|
||||
import json
|
||||
import os
|
||||
|
||||
from .utils import make_request
|
||||
|
||||
mapper_registry = registry()
|
||||
|
||||
@dataclass
|
||||
class ScraperResult:
|
||||
"""A minimally processed result from a scraper"""
|
||||
"""A minimally processed result from a scraper
|
||||
"""
|
||||
|
||||
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
|
||||
scraper: str
|
||||
|
||||
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
|
||||
platform: str
|
||||
|
||||
#: Foreign key of channel ID that this was scraped from
|
||||
channel: int
|
||||
|
||||
#: String that uniquely identifies the scraped post on the given platform, e.g. ``"1503397267675533313"``
|
||||
platform_id: str
|
||||
|
||||
#: Datetime (relative to UTC) that the scraped post was created at.
|
||||
date: datetime
|
||||
|
||||
#: JSON dump of dict that contains all data scraped for the post.
|
||||
raw_data: str
|
||||
|
||||
#: Datetime (relative to UTC) that the scraped post was archived at.
|
||||
date_archived: datetime
|
||||
|
||||
#: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files.
|
||||
archived_urls: dict
|
||||
|
||||
|
||||
|
||||
raw_data_table = Table('raw_data', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
@@ -40,22 +54,45 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
|
||||
Column('date_archived', DateTime),
|
||||
Column('archived_urls', JSON))
|
||||
|
||||
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Channel:
|
||||
"""Information about a specific channel to be scraped.
|
||||
"""
|
||||
|
||||
#: Name of channel (different from username because it can be non-unique and contain emojis), e.g. ``"T🕊Редакция Президент Гордон🕊"``.
|
||||
name: str
|
||||
|
||||
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
|
||||
platform_id: str
|
||||
|
||||
#: User-specified category for the channel, e.g. ``"explicit_qanon"``.
|
||||
category: str
|
||||
|
||||
#: Name of platform the given channel is on, e.g. ``"Telegram"``.
|
||||
platform: str
|
||||
|
||||
#: URL for the given channel on the platform, e.g. ``"https://t.me/prezidentgordonteam"``
|
||||
url: str
|
||||
|
||||
#: Screen name/username of channel.
|
||||
screenname: str
|
||||
|
||||
#: Two-letter country code for the channel's country of origin, e.g. ``"RU"``.
|
||||
country: str = None
|
||||
|
||||
#: Name of influencer, if channel belongs to an influencer that operates on multiple platforms.
|
||||
influencer: str = None
|
||||
|
||||
#: Whether or not the channel is publicly-accessible.
|
||||
public: bool = None
|
||||
|
||||
#: Whether or not the channel is a chat (i.e. allows users who are not the channel creator to post/message)
|
||||
chat: bool = None
|
||||
|
||||
#: Any other additional notes about the channel.
|
||||
notes: str = ""
|
||||
|
||||
#: Did the channel come from a researcher or a scraping process?
|
||||
source: str = None
|
||||
|
||||
def hydrate(self):
|
||||
@@ -82,26 +119,52 @@ mapper_registry.map_imperatively(Channel, channel_table)
|
||||
@dataclass
|
||||
class Post:
|
||||
"""An object with fields for columns in the analysis table"""
|
||||
|
||||
#: ID number of the scraped post in the ``raw_data`` table
|
||||
raw_id: int
|
||||
|
||||
#: Platform specific post ID
|
||||
platform_id: str
|
||||
|
||||
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
|
||||
scraper: str
|
||||
|
||||
#: String specifying name and version of transformer used to transform result, e.g. ``"TwitterTransformer 0.0.1"``.
|
||||
transformer: str
|
||||
|
||||
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
|
||||
platform: str
|
||||
|
||||
#: User-specified integer that uniquely identifies a channel, e.g. ``15``.
|
||||
channel: int
|
||||
|
||||
#: Datetime (relative to UTC) that the scraped post was created at.
|
||||
date: datetime
|
||||
|
||||
#: Datetime (relative to UTC) that the scraped post was archived at.
|
||||
date_archived: datetime
|
||||
|
||||
#: URL of the original post
|
||||
url: str
|
||||
|
||||
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
|
||||
author_id: str
|
||||
|
||||
#: Username of author who made post.
|
||||
author_username: str
|
||||
|
||||
#: Text of the original post
|
||||
content: str
|
||||
|
||||
#: The ID of the Channel that the post was forwarded or quoted from
|
||||
forwarded_from: int = None
|
||||
|
||||
#: The ID of the Post that this Post is a reply to or reblog of
|
||||
reply_to: int = None
|
||||
|
||||
def hydrate(self):
|
||||
pass
|
||||
|
||||
|
||||
|
||||
post_table = Table('posts', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
@@ -125,39 +188,64 @@ mapper_registry.map_imperatively(Post, post_table)
|
||||
|
||||
@dataclass
|
||||
class Media:
|
||||
"""Base class for organizing information about a media file.
|
||||
"""
|
||||
|
||||
#: ID number of the media's corresponding scraped post in the ``raw_data`` table.
|
||||
raw_id: int
|
||||
|
||||
#: ID number of the media's corresponding scraped post in the ``analysis`` table.
|
||||
post: int
|
||||
|
||||
#: URL that the media file is downloaded from by ``get_blob`` (presumably the archived copy of the media -- TODO confirm).
|
||||
url: str
|
||||
|
||||
#: Original URL of the media from the original post.
|
||||
original_url: str
|
||||
|
||||
#: JSON dump of the dict containing metadata information for the media file.
|
||||
exif: str = None
|
||||
|
||||
def get_blob(self):
|
||||
"""Download media file as bytes blob.
|
||||
"""
|
||||
|
||||
blob = make_request(self.url)
|
||||
return blob.content
|
||||
|
||||
def hydrate(self, blob = None):
|
||||
"""Download media file as bytes blob and extract data from content.
|
||||
"""
|
||||
|
||||
if blob is None:
|
||||
blob = self.get_blob()
|
||||
|
||||
self.hydrate_exif(blob)
|
||||
|
||||
def hydrate_exif(self, blob):
|
||||
f = open('tmp', 'wb')
|
||||
f.write(blob)
|
||||
f.close()
|
||||
"""Extract Exif metadata from bytes blob.
|
||||
"""
|
||||
|
||||
with exiftool.ExifTool() as et:
|
||||
exif = et.get_metadata('tmp')
|
||||
self.exif = json.dumps(exif)
|
||||
with tempfile.NamedTemporaryFile() as temp_file:
|
||||
temp_file.write(blob)
|
||||
|
||||
os.remove('tmp')
|
||||
with exiftool.ExifTool() as et:
|
||||
exif = et.get_metadata(temp_file.name)
|
||||
self.exif = json.dumps(exif)
|
||||
|
||||
@dataclass
|
||||
class Image(Media):
|
||||
"""Class for organizing information about an image file.
|
||||
"""
|
||||
|
||||
#: Extracted OCR content from image
|
||||
ocr: str = None
|
||||
|
||||
def hydrate(self, blob=None):
|
||||
"""Download image file as bytes blob and extract Exif and OCR content
|
||||
from the image.
|
||||
"""
|
||||
|
||||
if blob is None:
|
||||
blob = self.get_blob()
|
||||
|
||||
@@ -165,25 +253,62 @@ class Image(Media):
|
||||
self.hydrate_ocr(blob)
|
||||
|
||||
def hydrate_ocr(self, blob):
|
||||
"""Extract OCR (optical character recognition) data from image bytes blob.
|
||||
"""
|
||||
|
||||
image = PIL.Image.open(io.BytesIO(blob))
|
||||
self.ocr = pytesseract.image_to_string(image)
|
||||
|
||||
@dataclass
|
||||
class Video(Media):
|
||||
"""Class for organizing information about a video file.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
mapper_registry = registry()
|
||||
|
||||
raw_data_table = Table('raw_data', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
Column('scraper', String),
|
||||
Column('platform', String),
|
||||
Column('channel', Integer),
|
||||
Column('platform_id', String),
|
||||
Column('date', DateTime),
|
||||
Column('raw_data', String),
|
||||
Column('date_archived', DateTime),
|
||||
Column('archived_urls', JSON))
|
||||
|
||||
|
||||
analysis_table = Table('analysis', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
Column('raw_id', Integer, ForeignKey('raw_data.id')),
|
||||
Column('scraper', String),
|
||||
Column('transformer', String),
|
||||
Column('platform', String),
|
||||
Column('channel', Integer),
|
||||
Column('date', DateTime),
|
||||
Column('date_archived', DateTime),
|
||||
Column('url', String),
|
||||
Column('author_id', String),
|
||||
Column('author_username', String),
|
||||
Column('content', String))
|
||||
|
||||
media_table = Table('media', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
Column('type', String),
|
||||
Column('type', String),
|
||||
Column('raw_id', Integer, ForeignKey('raw_data.id')),
|
||||
Column('post', Integer, ForeignKey('posts.id')),
|
||||
Column('url', String),
|
||||
Column('original_url', String),
|
||||
Column('exif', String),
|
||||
Column('ocr', String)
|
||||
)
|
||||
Column('ocr', String))
|
||||
|
||||
mapper_registry.map_imperatively(TransformedResult, analysis_table)
|
||||
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
|
||||
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
|
||||
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
|
||||
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')
|
||||
@@ -14,29 +14,91 @@ from cisticola.base import Channel, ScraperResult, mapper_registry
|
||||
from cisticola.utils import make_request
|
||||
|
||||
class Scraper:
|
||||
"""Base class for defining platform-specific scrapers for scraping all posts
|
||||
from a given channel on that specific platform.
|
||||
"""
|
||||
|
||||
__version__ = "Scraper 0.0.0"
|
||||
|
||||
def __init__(self):
|
||||
self.s3_client = boto3.client('s3',
|
||||
region_name=os.environ['DO_SPACES_REGION'],
|
||||
endpoint_url='https://{}.digitaloceanspaces.com'.format(
|
||||
os.environ['DO_SPACES_REGION']),
|
||||
aws_access_key_id=os.environ['DO_SPACES_KEY'],
|
||||
aws_secret_access_key=os.environ['DO_SPACES_SECRET'])
|
||||
|
||||
# Initialize client to transfer files to the storage archive
|
||||
self.s3_client = boto3.client(
|
||||
service_name='s3',
|
||||
region_name=os.environ['DO_SPACES_REGION'],
|
||||
endpoint_url=f'https://{os.environ["DO_SPACES_REGION"]}.digitaloceanspaces.com',
|
||||
aws_access_key_id=os.environ['DO_SPACES_KEY'],
|
||||
aws_secret_access_key=os.environ['DO_SPACES_SECRET'])
|
||||
|
||||
# Define request headers (necessary to bypass scraping protection
|
||||
# for several platform scrapers)
|
||||
self.headers = {
|
||||
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}
|
||||
|
||||
pass
|
||||
|
||||
def __str__(self):
|
||||
return self.__version__
|
||||
|
||||
def get_username_from_url(self, url: str) -> str:
|
||||
"""Extract a channel's username from its URL.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str
|
||||
URL of the channel on a given platform
|
||||
e.g. ``"https://twitter.com/EliotHiggins"``
|
||||
|
||||
Returns
|
||||
-------
|
||||
username: str
|
||||
Extracted username of the channel.
|
||||
e.g. ``"EliotHiggins"``
|
||||
"""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
|
||||
"""Generate a unique identifier for media from a specified post.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str
|
||||
URL of original post.
|
||||
e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
|
||||
content_type: str
|
||||
Content-Type of media.
|
||||
e.g. ``"image/jpeg"``
|
||||
|
||||
Returns
|
||||
-------
|
||||
key: str
|
||||
Unique identifier for the media file from a specified post based on
|
||||
the original post URL and the media's Content-Type.
|
||||
"""
|
||||
|
||||
key = urlparse(url).path.split('/')[-1]
|
||||
return key
|
||||
|
||||
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||
"""Download media file from a specified media file URL.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str
|
||||
URL of media file from original post.
|
||||
e.g. ``"https://pbs.twimg.com/media/FN0j0dYWUAcQxfK?format=png&name=medium"``
|
||||
key: str or None
|
||||
Pre-defined unique identifier for the media file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
blob: bytes
|
||||
Raw bytes of the downloaded media file.
|
||||
content_type: str
|
||||
Content-Type of media.
|
||||
e.g. ``"image/jpeg"``.
|
||||
key: str
|
||||
Unique identifier for the media file.
|
||||
"""
|
||||
|
||||
r = make_request(url, headers = self.headers)
|
||||
|
||||
@@ -49,6 +111,27 @@ class Scraper:
|
||||
return blob, content_type, key
|
||||
|
||||
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||
"""Download media file from a specified media URL, where the media file
|
||||
is formatted as an m3u8 playlist, which is then decoded to an mp4 file.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str
|
||||
URL of m3u8 playlist file from original post.
|
||||
e.g. ``"https://media.gettr.com/group47/origin/2022/03/15/01/cbc436c1-1a1a-4b97-671d-c42109f3ec9b/out.m3u8"``
|
||||
key: str or None
|
||||
Pre-defined unique identifier for the media file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
blob: bytes
|
||||
Raw bytes of the downloaded media file.
|
||||
content_type: str
|
||||
Content-Type of media.
|
||||
e.g. ``"video/mp4"``.
|
||||
key: str
|
||||
Unique identifier for the media file.
|
||||
"""
|
||||
|
||||
content_type = 'video/mp4'
|
||||
ext = '.' + content_type.split('/')[-1]
|
||||
@@ -71,7 +154,28 @@ class Scraper:
|
||||
return blob, content_type, key
|
||||
|
||||
def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||
|
||||
"""Download media file from a specified media URL, using a fork of
|
||||
youtube-dl that enables faster downloading.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str
|
||||
URL of media file from original post.
|
||||
e.g. ``"https://rumble.com/embed/vgt7gh/"``
|
||||
key: str or None
|
||||
Pre-defined unique identifier for the media file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
blob: bytes
|
||||
Raw bytes of the downloaded media file.
|
||||
content_type: str
|
||||
Content-Type of media.
|
||||
e.g. ``"video/mp4"``.
|
||||
key: str
|
||||
Unique identifier for the media file.
|
||||
"""
|
||||
|
||||
content_type = 'video/mp4'
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
@@ -103,6 +207,23 @@ class Scraper:
|
||||
return blob, content_type, key
|
||||
|
||||
def archive_blob(self, blob: bytes, content_type: str, key: str) -> str:
|
||||
"""Upload raw bytes of a media file to the storage archive.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
blob: bytes
|
||||
Raw bytes of the media file to be archived.
|
||||
content_type: str
|
||||
Content-Type of media.
|
||||
e.g. ``"video/mp4"``.
|
||||
key: str
|
||||
Unique identifier for the media file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
archived_url: str
|
||||
URL specifying the file on the storage archive.
|
||||
"""
|
||||
|
||||
filename = self.__version__.replace(' ', '_') + '/' + key
|
||||
|
||||
@@ -114,9 +235,42 @@ class Scraper:
|
||||
return archived_url
|
||||
|
||||
def can_handle(self, channel: Channel) -> bool:
|
||||
"""Whether or not the scraper can scrape the specified channel.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
channel: Channel
|
||||
Channel to be scraped.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
``True`` if the scraper is capable of scraping ``channel``,
|
||||
``False`` if not.
|
||||
"""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
"""Scrape all posts from the specified Channel.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
channel: Channel
|
||||
Channel to be scraped.
|
||||
since: ScraperResult or None
|
||||
Most recently scraped ScraperResult from a previous scrape, or
|
||||
``None`` if scraper has not run before.
|
||||
archive_media: bool
|
||||
If ``True``, any media files (images, video, etc.) from posts are archived.
|
||||
If ``False``, media files are not archived.
|
||||
|
||||
Yields
|
||||
------
|
||||
ScraperResult
|
||||
Scraper result from a single post/comment from the specified Channel.
|
||||
"""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@@ -129,9 +283,13 @@ class ScraperController:
|
||||
self.session = None
|
||||
|
||||
def register_scraper(self, scraper: Scraper):
|
||||
"""Register a single Scraper instance to the controller.
|
||||
"""
|
||||
self.scrapers.append(scraper)
|
||||
|
||||
def register_scrapers(self, scraper: List[Scraper]):
|
||||
"""Register a list of Scraper instances to the controller.
|
||||
"""
|
||||
self.scrapers.extend(scraper)
|
||||
|
||||
def scrape_all_channels(self, archive_media: bool = True):
|
||||
@@ -147,6 +305,17 @@ class ScraperController:
|
||||
|
||||
@logger.catch(reraise = True)
|
||||
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
|
||||
"""Scrape all posts for all specified channels.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
channels: list<Channel>
|
||||
List of Channel instances to be scraped
|
||||
archive_media: bool
|
||||
If ``True``, any media files (images, video, etc.) from posts are archived.
|
||||
If ``False``, media files are not archived.
|
||||
"""
|
||||
|
||||
if self.session is None:
|
||||
logger.error("No DB session")
|
||||
return
|
||||
@@ -185,6 +354,9 @@ class ScraperController:
|
||||
logger.warning(f"No handler found for Channel {channel}")
|
||||
|
||||
def connect_to_db(self, engine):
|
||||
"""Connect the specified SQLAlchemy engine to the controller.
|
||||
"""
|
||||
|
||||
# create tables
|
||||
mapper_registry.metadata.create_all(bind=engine)
|
||||
|
||||
@@ -193,8 +365,8 @@ class ScraperController:
|
||||
self.session.configure(bind=self.engine)
|
||||
|
||||
def reset_db(self):
|
||||
"""Drop all data from the connected SQLAlchemy database.
|
||||
"""
|
||||
|
||||
mapper_registry.metadata.drop_all(bind=self.engine)
|
||||
self.connect_to_db(self.engine)
|
||||
|
||||
|
||||
self.connect_to_db(self.engine)
|
||||
@@ -17,7 +17,7 @@ class BitchuteScraper(Scraper):
|
||||
library"""
|
||||
__version__ = "BitchuteScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(url):
|
||||
def get_username_from_url(self, url):
|
||||
username = url.split('bitchute.com/channel/')[-1].strip('/')
|
||||
|
||||
return username
|
||||
@@ -33,7 +33,7 @@ class BitchuteScraper(Scraper):
|
||||
|
||||
detail = 'comments'
|
||||
|
||||
username = BitchuteScraper.get_username_from_url(channel.url)
|
||||
username = self.get_username_from_url(channel.url)
|
||||
scraper = get_videos_user(session, username, csrftoken, detail)
|
||||
|
||||
for post in scraper:
|
||||
@@ -61,7 +61,7 @@ class BitchuteScraper(Scraper):
|
||||
archived_urls=archived_urls)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Bitchute" and BitchuteScraper.get_username_from_url(channel.url) is not None:
|
||||
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
@@ -11,14 +11,14 @@ class GabScraper(Scraper):
|
||||
"""An implementation of a Scraper for Gab, using GARC library"""
|
||||
__version__ = "GabScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(url):
|
||||
def get_username_from_url(self, url):
|
||||
username = url.split('https://gab.com/')[-1]
|
||||
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
client = Garc(profile = 'main')
|
||||
username = GabScraper.get_username_from_url(channel.url)
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
scraper = client.userposts(username)
|
||||
|
||||
@@ -52,5 +52,5 @@ class GabScraper(Scraper):
|
||||
archived_urls=archived_urls)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Gab" and GabScraper.get_username_from_url(channel.url) is not None:
|
||||
if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
@@ -12,7 +12,7 @@ class GettrScraper(Scraper):
|
||||
"""An implementation of a Scraper for Gettr, using gogettr library"""
|
||||
__version__ = "GettrScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(url):
|
||||
def get_username_from_url(self, url):
|
||||
username = url.split("gettr.com/user/")[1]
|
||||
if len(username.split("/")) > 1:
|
||||
return None
|
||||
@@ -21,7 +21,7 @@ class GettrScraper(Scraper):
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
client = PublicClient()
|
||||
username = GettrScraper.get_username_from_url(channel.url)
|
||||
username = self.get_username_from_url(channel.url)
|
||||
scraper = client.user_activity(username=username, type="posts")
|
||||
|
||||
for post in scraper:
|
||||
@@ -62,7 +62,7 @@ class GettrScraper(Scraper):
|
||||
archived_urls=archived_urls)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None:
|
||||
if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
|
||||
|
||||
@@ -18,6 +18,7 @@ CONTENT_TYPES = {
|
||||
'mp4' : 'video/mp4'}
|
||||
|
||||
class InstagramScraper(Scraper):
|
||||
"""An implementation of a Scraper for Instagram, using instaloader library"""
|
||||
__version__ = "InstagramScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(self, url):
|
||||
|
||||
@@ -13,7 +13,7 @@ class OdyseeScraper(Scraper):
|
||||
"""An implementation of a Scraper for Odysee, using polyphemus library"""
|
||||
__version__ = "OdyseeScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(url):
|
||||
def get_username_from_url(self, url):
|
||||
|
||||
username = url.split('odysee.com/')[-1].strip('@').split(':')[0]
|
||||
|
||||
@@ -21,7 +21,7 @@ class OdyseeScraper(Scraper):
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
username = OdyseeScraper.get_username_from_url(channel.url)
|
||||
username = self.get_username_from_url(channel.url)
|
||||
odysee_channel = OdyseeChannel(channel_name = username)
|
||||
|
||||
all_videos = odysee_channel.get_all_videos()
|
||||
@@ -70,7 +70,7 @@ class OdyseeScraper(Scraper):
|
||||
archived_urls={})
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None:
|
||||
if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
|
||||
|
||||
@@ -14,14 +14,14 @@ class RumbleScraper(Scraper):
|
||||
"""An implementation of a Scraper for Rumble, using custom functions"""
|
||||
__version__ = "RumbleScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(url):
|
||||
def get_username_from_url(self, url):
|
||||
username = url.split('https://rumble.com/c/')[1]
|
||||
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
username = RumbleScraper.get_username_from_url(channel.url)
|
||||
username = self.get_username_from_url(channel.url)
|
||||
scraper = get_channel_videos(username)
|
||||
|
||||
for post in scraper:
|
||||
@@ -54,7 +54,7 @@ class RumbleScraper(Scraper):
|
||||
return key
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Rumble" and RumbleScraper.get_username_from_url(channel.url) is not None:
|
||||
if channel.platform == "Rumble" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
@@ -8,6 +8,7 @@ from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class TelegramSnscrapeScraper(Scraper):
|
||||
"""An implementation of a Scraper for Telegram, using snscrape library"""
|
||||
__version__ = "TelegramSnscrapeScraper 0.0.1"
|
||||
|
||||
def can_handle(self, channel):
|
||||
|
||||
@@ -14,6 +14,7 @@ from cisticola.scraper.base import Scraper
|
||||
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
|
||||
|
||||
class TelegramTelethonScraper(Scraper):
|
||||
"""An implementation of a Scraper for Telegram, using Telethon library"""
|
||||
__version__ = "TelegramTelethonScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(self, url):
|
||||
@@ -30,9 +31,9 @@ class TelegramTelethonScraper(Scraper):
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
api_id = os.environ['TELEGRAM_API_ID_1']
|
||||
api_hash = os.environ['TELEGRAM_API_HASH_1']
|
||||
phone = os.environ['TELEGRAM_PHONE_1']
|
||||
api_id = os.environ['TELEGRAM_API_ID']
|
||||
api_hash = os.environ['TELEGRAM_API_HASH']
|
||||
phone = os.environ['TELEGRAM_PHONE']
|
||||
|
||||
with TelegramClient(phone, api_id, api_hash) as client:
|
||||
|
||||
|
||||
@@ -8,12 +8,24 @@ SPHINXBUILD ?= sphinx-build
|
||||
SOURCEDIR = source
|
||||
BUILDDIR = build
|
||||
|
||||
SPHINXAPIDOC = sphinx-apidoc
|
||||
APIDOCFLAGS = --separate --private --module-first
|
||||
MODULEPATH = ../cisticola
|
||||
SOURCEFILES = cisticola.*
|
||||
MODULEFILE = modules.rst
|
||||
|
||||
# Put it first so that "make" without argument is like "make help".
|
||||
help:
|
||||
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||
|
||||
.PHONY: help Makefile
|
||||
|
||||
# Custom process and flags for generating Sphinx sources
|
||||
apidoc:
|
||||
rm $(SOURCEDIR)/$(SOURCEFILES)
|
||||
$(SPHINXAPIDOC) $(APIDOCFLAGS) -o "$(SOURCEDIR)" "$(MODULEPATH)"
|
||||
rm $(SOURCEDIR)/$(MODULEFILE)
|
||||
|
||||
# Catch-all target: route all unknown targets to Sphinx using the new
|
||||
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
||||
%: Makefile
|
||||
|
||||
64
docs/images/cisticola_logo.svg
Normal file
64
docs/images/cisticola_logo.svg
Normal file
File diff suppressed because one or more lines are too long
|
After Width: | Height: | Size: 7.0 KiB |
BIN
docs/images/favicon.ico
Normal file
BIN
docs/images/favicon.ico
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 614 B |
@@ -10,6 +10,12 @@ if "%SPHINXBUILD%" == "" (
|
||||
set SOURCEDIR=source
|
||||
set BUILDDIR=build
|
||||
|
||||
set SPHINXAPIDOC=sphinx-apidoc
|
||||
set APIDOCFLAGS=--separate --private --module-first
|
||||
set MODULEPATH=../cisticola
|
||||
set SOURCEFILES=cisticola.*
|
||||
set MODULEFILE=modules.rst
|
||||
|
||||
if "%1" == "" goto help
|
||||
|
||||
%SPHINXBUILD% >NUL 2>NUL
|
||||
@@ -28,6 +34,11 @@ if errorlevel 9009 (
|
||||
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||
goto end
|
||||
|
||||
:apidoc
|
||||
del %SOURCEDIR%\%SOURCEFILES%
|
||||
%SPHINXAPIDOC% %APIDOCFLAGS% -o %SOURCEDIR% %MODULEPATH%
|
||||
del %SOURCEDIR%\%MODULEFILE%
|
||||
|
||||
:help
|
||||
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||
|
||||
|
||||
8
docs/source/cisticola.base.rst
Normal file
8
docs/source/cisticola.base.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.base module
|
||||
=====================
|
||||
|
||||
.. automodule:: cisticola.base
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
@@ -1,6 +1,12 @@
|
||||
cisticola package
|
||||
=================
|
||||
|
||||
.. automodule:: cisticola
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
|
||||
Subpackages
|
||||
-----------
|
||||
|
||||
@@ -13,18 +19,8 @@ Subpackages
|
||||
Submodules
|
||||
----------
|
||||
|
||||
cisticola.base module
|
||||
---------------------
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
.. automodule:: cisticola.base
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: cisticola
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
cisticola.base
|
||||
cisticola.utils
|
||||
|
||||
8
docs/source/cisticola.scraper.base.rst
Normal file
8
docs/source/cisticola.scraper.base.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.scraper.base module
|
||||
=============================
|
||||
|
||||
.. automodule:: cisticola.scraper.base
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
8
docs/source/cisticola.scraper.bitchute.rst
Normal file
8
docs/source/cisticola.scraper.bitchute.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.scraper.bitchute module
|
||||
=================================
|
||||
|
||||
.. automodule:: cisticola.scraper.bitchute
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
8
docs/source/cisticola.scraper.gab.rst
Normal file
8
docs/source/cisticola.scraper.gab.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.scraper.gab module
|
||||
============================
|
||||
|
||||
.. automodule:: cisticola.scraper.gab
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
8
docs/source/cisticola.scraper.gettr.rst
Normal file
8
docs/source/cisticola.scraper.gettr.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.scraper.gettr module
|
||||
==============================
|
||||
|
||||
.. automodule:: cisticola.scraper.gettr
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
8
docs/source/cisticola.scraper.instagram.rst
Normal file
8
docs/source/cisticola.scraper.instagram.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.scraper.instagram module
|
||||
==================================
|
||||
|
||||
.. automodule:: cisticola.scraper.instagram
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
8
docs/source/cisticola.scraper.odysee.rst
Normal file
8
docs/source/cisticola.scraper.odysee.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.scraper.odysee module
|
||||
===============================
|
||||
|
||||
.. automodule:: cisticola.scraper.odysee
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
@@ -1,37 +1,27 @@
|
||||
cisticola.scraper package
|
||||
=========================
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
cisticola.scraper.bitchute module
|
||||
---------------------------------
|
||||
|
||||
.. automodule:: cisticola.scraper.bitchute
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
cisticola.scraper.gettr module
|
||||
------------------------------
|
||||
|
||||
.. automodule:: cisticola.scraper.gettr
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
cisticola.scraper.twitter module
|
||||
--------------------------------
|
||||
|
||||
.. automodule:: cisticola.scraper.twitter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: cisticola.scraper
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
cisticola.scraper.base
|
||||
cisticola.scraper.bitchute
|
||||
cisticola.scraper.gab
|
||||
cisticola.scraper.gettr
|
||||
cisticola.scraper.instagram
|
||||
cisticola.scraper.odysee
|
||||
cisticola.scraper.rumble
|
||||
cisticola.scraper.telegram_snscrape
|
||||
cisticola.scraper.telegram_telethon
|
||||
cisticola.scraper.twitter
|
||||
cisticola.scraper.vkontakte
|
||||
cisticola.scraper.youtube
|
||||
|
||||
8
docs/source/cisticola.scraper.rumble.rst
Normal file
8
docs/source/cisticola.scraper.rumble.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.scraper.rumble module
|
||||
===============================
|
||||
|
||||
.. automodule:: cisticola.scraper.rumble
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
8
docs/source/cisticola.scraper.telegram_snscrape.rst
Normal file
8
docs/source/cisticola.scraper.telegram_snscrape.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.scraper.telegram\_snscrape module
|
||||
===========================================
|
||||
|
||||
.. automodule:: cisticola.scraper.telegram_snscrape
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
8
docs/source/cisticola.scraper.telegram_telethon.rst
Normal file
8
docs/source/cisticola.scraper.telegram_telethon.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.scraper.telegram\_telethon module
|
||||
===========================================
|
||||
|
||||
.. automodule:: cisticola.scraper.telegram_telethon
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
8
docs/source/cisticola.scraper.twitter.rst
Normal file
8
docs/source/cisticola.scraper.twitter.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.scraper.twitter module
|
||||
================================
|
||||
|
||||
.. automodule:: cisticola.scraper.twitter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
8
docs/source/cisticola.scraper.vkontakte.rst
Normal file
8
docs/source/cisticola.scraper.vkontakte.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.scraper.vkontakte module
|
||||
==================================
|
||||
|
||||
.. automodule:: cisticola.scraper.vkontakte
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
8
docs/source/cisticola.scraper.youtube.rst
Normal file
8
docs/source/cisticola.scraper.youtube.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.scraper.youtube module
|
||||
================================
|
||||
|
||||
.. automodule:: cisticola.scraper.youtube
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
8
docs/source/cisticola.transformer.base.rst
Normal file
8
docs/source/cisticola.transformer.base.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.transformer.base module
|
||||
=================================
|
||||
|
||||
.. automodule:: cisticola.transformer.base
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
8
docs/source/cisticola.transformer.bitchute.rst
Normal file
8
docs/source/cisticola.transformer.bitchute.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.transformer.bitchute module
|
||||
=====================================
|
||||
|
||||
.. automodule:: cisticola.transformer.bitchute
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
@@ -1,21 +1,18 @@
|
||||
cisticola.transformer package
|
||||
=============================
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
cisticola.transformer.twitter module
|
||||
------------------------------------
|
||||
|
||||
.. automodule:: cisticola.transformer.twitter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
.. automodule:: cisticola.transformer
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
cisticola.transformer.base
|
||||
cisticola.transformer.bitchute
|
||||
cisticola.transformer.twitter
|
||||
|
||||
8
docs/source/cisticola.transformer.twitter.rst
Normal file
8
docs/source/cisticola.transformer.twitter.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.transformer.twitter module
|
||||
====================================
|
||||
|
||||
.. automodule:: cisticola.transformer.twitter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
8
docs/source/cisticola.utils.rst
Normal file
8
docs/source/cisticola.utils.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.utils module
|
||||
======================
|
||||
|
||||
.. automodule:: cisticola.utils
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
@@ -43,9 +43,18 @@ exclude_patterns = []
|
||||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||
# a list of builtin themes.
|
||||
#
|
||||
html_theme = 'alabaster'
|
||||
html_theme = 'sphinx_rtd_theme'
|
||||
|
||||
# Add any paths that contain custom static files (such as style sheets) here,
|
||||
# relative to this directory. They are copied after the builtin static files,
|
||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||
html_static_path = []
|
||||
|
||||
# -- Default flags for autodoc------------------------------------------------
|
||||
|
||||
autodoc_default_options = {'exclude-members': '_sa_class_manager'}
|
||||
|
||||
html_favicon = '../images/favicon.ico'
|
||||
html_logo = '../images/cisticola_logo.svg'
|
||||
|
||||
html_theme_options = {'style_nav_header_background': '#000000'}
|
||||
@@ -2,16 +2,7 @@ Welcome to Cisticola's documentation!
|
||||
=====================================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: Contents:
|
||||
:maxdepth: 1
|
||||
|
||||
modules
|
||||
|
||||
|
||||
|
||||
Indices and tables
|
||||
==================
|
||||
|
||||
* :ref:`genindex`
|
||||
* :ref:`modindex`
|
||||
* :ref:`search`
|
||||
quickstart
|
||||
cisticola
|
||||
@@ -1,7 +0,0 @@
|
||||
cisticola
|
||||
=========
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
cisticola
|
||||
96
docs/source/quickstart.rst
Normal file
96
docs/source/quickstart.rst
Normal file
@@ -0,0 +1,96 @@
|
||||
Quickstart
|
||||
==========
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
The *cisticola* application uses pipenv_ for dependency management. To install the dependencies of *cisticola*, first install pipenv using the following command:
|
||||
|
||||
.. code-block::
|
||||
|
||||
pip install pipenv
|
||||
|
||||
and then install the dependencies using the following command from the package root directory:
|
||||
|
||||
.. code-block::
|
||||
|
||||
pipenv install
|
||||
|
||||
To install the necessary dependencies for building the documentation and running unit tests, run the following command from the package root directory:
|
||||
|
||||
.. code-block::
|
||||
|
||||
pipenv install --dev
|
||||
|
||||
Environment Variables
|
||||
---------------------
|
||||
|
||||
Three of the scrapers in *cisticola* (:py:class:`~cisticola.scraper.gab.GabScraper`, :py:class:`~cisticola.scraper.instagram.InstagramScraper`, and :py:class:`~cisticola.scraper.telegram_telethon.TelegramTelethonScraper`) require platform credentials to work correctly.
|
||||
|
||||
Gab
|
||||
"""
|
||||
|
||||
The Gab credentials can be configured by running the following command from the root directory:
|
||||
|
||||
.. code-block::
|
||||
|
||||
pipenv run garc configure
|
||||
|
||||
which will direct you to provide the username and password for your Gab account.
|
||||
|
||||
Instagram
|
||||
"""""""""
|
||||
|
||||
The Instagram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment:
|
||||
|
||||
- ``INSTAGRAM_USERNAME``: username of your Instagram account
|
||||
- ``INSTAGRAM_PASSWORD``: password of your Instagram account
|
||||
|
||||
Telegram Telethon
|
||||
"""""""""""""""""
|
||||
|
||||
The Telegram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment:
|
||||
|
||||
- ``TELEGRAM_API_ID``: API ID number for your Telegram application
|
||||
- ``TELEGRAM_API_HASH``: API hash for your Telegram application
|
||||
- ``TELEGRAM_PHONE``: phone number for the account corresponding to your Telegram application
|
||||
|
||||
If you do not already have a Telegram application, you can create one by following the instructions on `this page`_.
|
||||
|
||||
Documentation
|
||||
-------------
|
||||
|
||||
The *cisticola* application uses Sphinx_ to generate and display its documentation. To build the documentation in HTML format, run the following command from the ``docs/`` directory:
|
||||
|
||||
.. code-block::
|
||||
|
||||
pipenv run make html
|
||||
|
||||
For developers, if changes are made to the package structure or additional modules are created, you can update the Sphinx source ``*.rst`` files by running the following command from the ``docs/`` directory:
|
||||
|
||||
.. code-block::
|
||||
|
||||
pipenv run make apidoc
|
||||
|
||||
Testing
|
||||
-------
|
||||
|
||||
The *cisticola* application uses pytest_ for unit testing. To run the test suite, run the following command from the package root directory:
|
||||
|
||||
.. code-block::
|
||||
|
||||
pipenv run pytest
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
An example of a *cisticola* ingest file ``russian_telegram_ingest.py`` is included in the package root directory, showing how the list of channels to scrape is defined, and how the :py:class:`~cisticola.scraper.base.ScraperController` and :py:class:`~cisticola.transformer.base.Transformer` classes are used. To run the ingest script, run the following command from the package root directory:
|
||||
|
||||
.. code-block::
|
||||
|
||||
pipenv run python russian_telegram_ingest.py
|
||||
|
||||
.. _pipenv: https://pipenv.pypa.io/en/latest/
|
||||
.. _Sphinx: https://www.sphinx-doc.org/en/master/
|
||||
.. _pytest: https://docs.pytest.org/en/7.1.x/
|
||||
.. _this page: https://core.telegram.org/api/obtaining_api_id
|
||||
@@ -1,6 +1,6 @@
|
||||
[pytest]
|
||||
minversion =
|
||||
6.0.2
|
||||
7.0.0
|
||||
testpaths =
|
||||
tests/
|
||||
python_files =
|
||||
@@ -13,4 +13,5 @@ addopts =
|
||||
--self-contained-html
|
||||
filterwarnings =
|
||||
ignore:the imp module is deprecated:DeprecationWarning
|
||||
ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
|
||||
ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
|
||||
ignore:invalid escape sequence:DeprecationWarning
|
||||
Reference in New Issue
Block a user