mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-12 21:38:33 +03:00
Merge branch 'main' into channel-db
This commit is contained in:
3
Pipfile
3
Pipfile
@@ -10,7 +10,6 @@ gogettr = "*"
|
|||||||
requests = "*"
|
requests = "*"
|
||||||
bs4 = "*"
|
bs4 = "*"
|
||||||
dateparser = "*"
|
dateparser = "*"
|
||||||
sphinx = "*"
|
|
||||||
boto3 = "*"
|
boto3 = "*"
|
||||||
snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
|
snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
|
||||||
ffmpeg-python = "*"
|
ffmpeg-python = "*"
|
||||||
@@ -29,6 +28,8 @@ pytest-cov = "*"
|
|||||||
pytest-html = "*"
|
pytest-html = "*"
|
||||||
pytest-metadata = "*"
|
pytest-metadata = "*"
|
||||||
black = "*"
|
black = "*"
|
||||||
|
sphinx = "*"
|
||||||
|
sphinx_rtd_theme = "*"
|
||||||
|
|
||||||
[requires]
|
[requires]
|
||||||
python_version = "3.9"
|
python_version = "3.9"
|
||||||
|
|||||||
418
Pipfile.lock
generated
418
Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
|||||||
{
|
{
|
||||||
"_meta": {
|
"_meta": {
|
||||||
"hash": {
|
"hash": {
|
||||||
"sha256": "d465f2d09a728ee76cb0af521890ecc1e1bce672acbd1caf2e4d01b6567480d5"
|
"sha256": "e3b96b0ac8c80d4817f9adac4ab171bf4b7e07e80927c7b152a24e8bbdbf7faa"
|
||||||
},
|
},
|
||||||
"pipfile-spec": 6,
|
"pipfile-spec": 6,
|
||||||
"requires": {
|
"requires": {
|
||||||
@@ -16,13 +16,6 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"default": {
|
"default": {
|
||||||
"alabaster": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359",
|
|
||||||
"sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02"
|
|
||||||
],
|
|
||||||
"version": "==0.7.12"
|
|
||||||
},
|
|
||||||
"attrs": {
|
"attrs": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",
|
"sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",
|
||||||
@@ -31,14 +24,6 @@
|
|||||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||||
"version": "==21.4.0"
|
"version": "==21.4.0"
|
||||||
},
|
},
|
||||||
"babel": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9",
|
|
||||||
"sha256:bc0c176f9f6a994582230df350aa6e05ba2ebe4b3ac317eab29d9be5d2768da0"
|
|
||||||
],
|
|
||||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
|
||||||
"version": "==2.9.1"
|
|
||||||
},
|
|
||||||
"beautifulsoup4": {
|
"beautifulsoup4": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf",
|
"sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf",
|
||||||
@@ -49,19 +34,19 @@
|
|||||||
},
|
},
|
||||||
"boto3": {
|
"boto3": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:8d6f3c548f0ee03d742f404c96515e7579fc6968135aaa50dd855a046698ff79",
|
"sha256:76d5b90400c54b25278150768e946edf166acce2c1597c0ecfbebb1dbe9acf2c",
|
||||||
"sha256:d857feb6af9932e1ee3a748060a2cd9fd6043dbbccf66976eda54586597efdc0"
|
"sha256:7bb2e6506a6ad44d111dd20a5d510374b6958fe989b4ef887109c79d812f926f"
|
||||||
],
|
],
|
||||||
"index": "pypi",
|
"index": "pypi",
|
||||||
"version": "==1.21.18"
|
"version": "==1.21.19"
|
||||||
},
|
},
|
||||||
"botocore": {
|
"botocore": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:7ea8ef1ff7c4882ab59b337662f90ddf5ea860e95e7e209dca593a34ea585b1b",
|
"sha256:5ed2be0e413961134f4c17eab16396d41a5b4b73a637588260c04d20806d52ea",
|
||||||
"sha256:d2da7ccbc5ddd61fe3cd45fcbd3de380d9e3a15bfa8fbfd2d9259a93dcc60c56"
|
"sha256:d0d77bce152ca51f3c2cd0f9bf05cb3b623e719406ad58b4c20444e237fe82eb"
|
||||||
],
|
],
|
||||||
"markers": "python_version >= '3.6'",
|
"markers": "python_version >= '3.6'",
|
||||||
"version": "==1.24.18"
|
"version": "==1.24.19"
|
||||||
},
|
},
|
||||||
"brotli": {
|
"brotli": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
@@ -169,14 +154,6 @@
|
|||||||
"index": "pypi",
|
"index": "pypi",
|
||||||
"version": "==1.1.0"
|
"version": "==1.1.0"
|
||||||
},
|
},
|
||||||
"docutils": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125",
|
|
||||||
"sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61"
|
|
||||||
],
|
|
||||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
|
||||||
"version": "==0.17.1"
|
|
||||||
},
|
|
||||||
"ffmpeg-python": {
|
"ffmpeg-python": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127",
|
"sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127",
|
||||||
@@ -284,22 +261,6 @@
|
|||||||
"markers": "python_version >= '3'",
|
"markers": "python_version >= '3'",
|
||||||
"version": "==3.3"
|
"version": "==3.3"
|
||||||
},
|
},
|
||||||
"imagesize": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c",
|
|
||||||
"sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d"
|
|
||||||
],
|
|
||||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
|
||||||
"version": "==1.3.0"
|
|
||||||
},
|
|
||||||
"importlib-metadata": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6",
|
|
||||||
"sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539"
|
|
||||||
],
|
|
||||||
"markers": "python_version < '3.10'",
|
|
||||||
"version": "==4.11.3"
|
|
||||||
},
|
|
||||||
"iniconfig": {
|
"iniconfig": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
|
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
|
||||||
@@ -314,14 +275,6 @@
|
|||||||
"index": "pypi",
|
"index": "pypi",
|
||||||
"version": "==4.8.4"
|
"version": "==4.8.4"
|
||||||
},
|
},
|
||||||
"jinja2": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8",
|
|
||||||
"sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"
|
|
||||||
],
|
|
||||||
"markers": "python_version >= '3.6'",
|
|
||||||
"version": "==3.0.3"
|
|
||||||
},
|
|
||||||
"jmespath": {
|
"jmespath": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9",
|
"sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9",
|
||||||
@@ -405,52 +358,6 @@
|
|||||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||||
"version": "==4.8.0"
|
"version": "==4.8.0"
|
||||||
},
|
},
|
||||||
"markupsafe": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3",
|
|
||||||
"sha256:09c86c9643cceb1d87ca08cdc30160d1b7ab49a8a21564868921959bd16441b8",
|
|
||||||
"sha256:142119fb14a1ef6d758912b25c4e803c3ff66920635c44078666fe7cc3f8f759",
|
|
||||||
"sha256:1d1fb9b2eec3c9714dd936860850300b51dbaa37404209c8d4cb66547884b7ed",
|
|
||||||
"sha256:204730fd5fe2fe3b1e9ccadb2bd18ba8712b111dcabce185af0b3b5285a7c989",
|
|
||||||
"sha256:24c3be29abb6b34052fd26fc7a8e0a49b1ee9d282e3665e8ad09a0a68faee5b3",
|
|
||||||
"sha256:290b02bab3c9e216da57c1d11d2ba73a9f73a614bbdcc027d299a60cdfabb11a",
|
|
||||||
"sha256:3028252424c72b2602a323f70fbf50aa80a5d3aa616ea6add4ba21ae9cc9da4c",
|
|
||||||
"sha256:30c653fde75a6e5eb814d2a0a89378f83d1d3f502ab710904ee585c38888816c",
|
|
||||||
"sha256:3cace1837bc84e63b3fd2dfce37f08f8c18aeb81ef5cf6bb9b51f625cb4e6cd8",
|
|
||||||
"sha256:4056f752015dfa9828dce3140dbadd543b555afb3252507348c493def166d454",
|
|
||||||
"sha256:454ffc1cbb75227d15667c09f164a0099159da0c1f3d2636aa648f12675491ad",
|
|
||||||
"sha256:598b65d74615c021423bd45c2bc5e9b59539c875a9bdb7e5f2a6b92dfcfc268d",
|
|
||||||
"sha256:599941da468f2cf22bf90a84f6e2a65524e87be2fce844f96f2dd9a6c9d1e635",
|
|
||||||
"sha256:5ddea4c352a488b5e1069069f2f501006b1a4362cb906bee9a193ef1245a7a61",
|
|
||||||
"sha256:62c0285e91414f5c8f621a17b69fc0088394ccdaa961ef469e833dbff64bd5ea",
|
|
||||||
"sha256:679cbb78914ab212c49c67ba2c7396dc599a8479de51b9a87b174700abd9ea49",
|
|
||||||
"sha256:6e104c0c2b4cd765b4e83909cde7ec61a1e313f8a75775897db321450e928cce",
|
|
||||||
"sha256:736895a020e31b428b3382a7887bfea96102c529530299f426bf2e636aacec9e",
|
|
||||||
"sha256:75bb36f134883fdbe13d8e63b8675f5f12b80bb6627f7714c7d6c5becf22719f",
|
|
||||||
"sha256:7d2f5d97fcbd004c03df8d8fe2b973fe2b14e7bfeb2cfa012eaa8759ce9a762f",
|
|
||||||
"sha256:80beaf63ddfbc64a0452b841d8036ca0611e049650e20afcb882f5d3c266d65f",
|
|
||||||
"sha256:84ad5e29bf8bab3ad70fd707d3c05524862bddc54dc040982b0dbcff36481de7",
|
|
||||||
"sha256:8da5924cb1f9064589767b0f3fc39d03e3d0fb5aa29e0cb21d43106519bd624a",
|
|
||||||
"sha256:961eb86e5be7d0973789f30ebcf6caab60b844203f4396ece27310295a6082c7",
|
|
||||||
"sha256:96de1932237abe0a13ba68b63e94113678c379dca45afa040a17b6e1ad7ed076",
|
|
||||||
"sha256:a0a0abef2ca47b33fb615b491ce31b055ef2430de52c5b3fb19a4042dbc5cadb",
|
|
||||||
"sha256:b2a5a856019d2833c56a3dcac1b80fe795c95f401818ea963594b345929dffa7",
|
|
||||||
"sha256:b8811d48078d1cf2a6863dafb896e68406c5f513048451cd2ded0473133473c7",
|
|
||||||
"sha256:c532d5ab79be0199fa2658e24a02fce8542df196e60665dd322409a03db6a52c",
|
|
||||||
"sha256:d3b64c65328cb4cd252c94f83e66e3d7acf8891e60ebf588d7b493a55a1dbf26",
|
|
||||||
"sha256:d4e702eea4a2903441f2735799d217f4ac1b55f7d8ad96ab7d4e25417cb0827c",
|
|
||||||
"sha256:d5653619b3eb5cbd35bfba3c12d575db2a74d15e0e1c08bf1db788069d410ce8",
|
|
||||||
"sha256:d66624f04de4af8bbf1c7f21cc06649c1c69a7f84109179add573ce35e46d448",
|
|
||||||
"sha256:e67ec74fada3841b8c5f4c4f197bea916025cb9aa3fe5abf7d52b655d042f956",
|
|
||||||
"sha256:e6f7f3f41faffaea6596da86ecc2389672fa949bd035251eab26dc6697451d05",
|
|
||||||
"sha256:f02cf7221d5cd915d7fa58ab64f7ee6dd0f6cddbb48683debf5d04ae9b1c2cc1",
|
|
||||||
"sha256:f0eddfcabd6936558ec020130f932d479930581171368fd728efcfb6ef0dd357",
|
|
||||||
"sha256:fabbe18087c3d33c5824cb145ffca52eccd053061df1d79d4b66dafa5ad2a5ea",
|
|
||||||
"sha256:fc3150f85e2dbcf99e65238c842d1cfe69d3e7649b19864c1cc043213d9cd730"
|
|
||||||
],
|
|
||||||
"markers": "python_version >= '3.7'",
|
|
||||||
"version": "==2.1.0"
|
|
||||||
},
|
|
||||||
"mutagen": {
|
"mutagen": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1",
|
"sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1",
|
||||||
@@ -642,14 +549,6 @@
|
|||||||
"git": "https://github.com/smarnach/pyexiftool.git",
|
"git": "https://github.com/smarnach/pyexiftool.git",
|
||||||
"ref": "3db3764895e687d75b42d3ae4e554ca8664a7f6f"
|
"ref": "3db3764895e687d75b42d3ae4e554ca8664a7f6f"
|
||||||
},
|
},
|
||||||
"pygments": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65",
|
|
||||||
"sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a"
|
|
||||||
],
|
|
||||||
"markers": "python_version >= '3.5'",
|
|
||||||
"version": "==2.11.2"
|
|
||||||
},
|
|
||||||
"pyparsing": {
|
"pyparsing": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea",
|
"sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea",
|
||||||
@@ -786,6 +685,9 @@
|
|||||||
"version": "==2022.3.2"
|
"version": "==2022.3.2"
|
||||||
},
|
},
|
||||||
"requests": {
|
"requests": {
|
||||||
|
"extras": [
|
||||||
|
"socks"
|
||||||
|
],
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
|
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
|
||||||
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
|
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
|
||||||
@@ -817,13 +719,6 @@
|
|||||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||||
"version": "==1.16.0"
|
"version": "==1.16.0"
|
||||||
},
|
},
|
||||||
"snowballstemmer": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1",
|
|
||||||
"sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"
|
|
||||||
],
|
|
||||||
"version": "==2.2.0"
|
|
||||||
},
|
|
||||||
"snscrape": {
|
"snscrape": {
|
||||||
"git": "https://github.com/bellingcat/snscrape.git",
|
"git": "https://github.com/bellingcat/snscrape.git",
|
||||||
"ref": "de4ebed81f3f6a4bb4c65630daab6ec63784959b"
|
"ref": "de4ebed81f3f6a4bb4c65630daab6ec63784959b"
|
||||||
@@ -836,62 +731,6 @@
|
|||||||
"markers": "python_version >= '3.6'",
|
"markers": "python_version >= '3.6'",
|
||||||
"version": "==2.3.1"
|
"version": "==2.3.1"
|
||||||
},
|
},
|
||||||
"sphinx": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:5da895959511473857b6d0200f56865ed62c31e8f82dd338063b84ec022701fe",
|
|
||||||
"sha256:6caad9786055cb1fa22b4a365c1775816b876f91966481765d7d50e9f0dd35cc"
|
|
||||||
],
|
|
||||||
"index": "pypi",
|
|
||||||
"version": "==4.4.0"
|
|
||||||
},
|
|
||||||
"sphinxcontrib-applehelp": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a",
|
|
||||||
"sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58"
|
|
||||||
],
|
|
||||||
"markers": "python_version >= '3.5'",
|
|
||||||
"version": "==1.0.2"
|
|
||||||
},
|
|
||||||
"sphinxcontrib-devhelp": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e",
|
|
||||||
"sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4"
|
|
||||||
],
|
|
||||||
"markers": "python_version >= '3.5'",
|
|
||||||
"version": "==1.0.2"
|
|
||||||
},
|
|
||||||
"sphinxcontrib-htmlhelp": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07",
|
|
||||||
"sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2"
|
|
||||||
],
|
|
||||||
"markers": "python_version >= '3.6'",
|
|
||||||
"version": "==2.0.0"
|
|
||||||
},
|
|
||||||
"sphinxcontrib-jsmath": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178",
|
|
||||||
"sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"
|
|
||||||
],
|
|
||||||
"markers": "python_version >= '3.5'",
|
|
||||||
"version": "==1.0.1"
|
|
||||||
},
|
|
||||||
"sphinxcontrib-qthelp": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72",
|
|
||||||
"sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6"
|
|
||||||
],
|
|
||||||
"markers": "python_version >= '3.5'",
|
|
||||||
"version": "==1.0.3"
|
|
||||||
},
|
|
||||||
"sphinxcontrib-serializinghtml": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd",
|
|
||||||
"sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952"
|
|
||||||
],
|
|
||||||
"markers": "python_version >= '3.5'",
|
|
||||||
"version": "==1.1.5"
|
|
||||||
},
|
|
||||||
"sqlalchemy": {
|
"sqlalchemy": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:04164e0063feb7aedd9d073db0fd496edb244be40d46ea1f0d8990815e4b8c34",
|
"sha256:04164e0063feb7aedd9d073db0fd496edb244be40d46ea1f0d8990815e4b8c34",
|
||||||
@@ -1034,17 +873,16 @@
|
|||||||
],
|
],
|
||||||
"index": "pypi",
|
"index": "pypi",
|
||||||
"version": "==2022.3.8.2"
|
"version": "==2022.3.8.2"
|
||||||
},
|
|
||||||
"zipp": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d",
|
|
||||||
"sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375"
|
|
||||||
],
|
|
||||||
"markers": "python_version >= '3.7'",
|
|
||||||
"version": "==3.7.0"
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"develop": {
|
"develop": {
|
||||||
|
"alabaster": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359",
|
||||||
|
"sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02"
|
||||||
|
],
|
||||||
|
"version": "==0.7.12"
|
||||||
|
},
|
||||||
"attrs": {
|
"attrs": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",
|
"sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",
|
||||||
@@ -1053,6 +891,29 @@
|
|||||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||||
"version": "==21.4.0"
|
"version": "==21.4.0"
|
||||||
},
|
},
|
||||||
|
"babel": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9",
|
||||||
|
"sha256:bc0c176f9f6a994582230df350aa6e05ba2ebe4b3ac317eab29d9be5d2768da0"
|
||||||
|
],
|
||||||
|
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||||
|
"version": "==2.9.1"
|
||||||
|
},
|
||||||
|
"certifi": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
|
||||||
|
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
|
||||||
|
],
|
||||||
|
"version": "==2021.10.8"
|
||||||
|
},
|
||||||
|
"charset-normalizer": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
|
||||||
|
"sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
|
||||||
|
],
|
||||||
|
"markers": "python_version >= '3'",
|
||||||
|
"version": "==2.0.12"
|
||||||
|
},
|
||||||
"coverage": {
|
"coverage": {
|
||||||
"extras": [
|
"extras": [
|
||||||
"toml"
|
"toml"
|
||||||
@@ -1103,6 +964,38 @@
|
|||||||
"markers": "python_version >= '3.7'",
|
"markers": "python_version >= '3.7'",
|
||||||
"version": "==6.3.2"
|
"version": "==6.3.2"
|
||||||
},
|
},
|
||||||
|
"docutils": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125",
|
||||||
|
"sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61"
|
||||||
|
],
|
||||||
|
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||||
|
"version": "==0.17.1"
|
||||||
|
},
|
||||||
|
"idna": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
|
||||||
|
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
|
||||||
|
],
|
||||||
|
"markers": "python_version >= '3'",
|
||||||
|
"version": "==3.3"
|
||||||
|
},
|
||||||
|
"imagesize": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c",
|
||||||
|
"sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d"
|
||||||
|
],
|
||||||
|
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||||
|
"version": "==1.3.0"
|
||||||
|
},
|
||||||
|
"importlib-metadata": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6",
|
||||||
|
"sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539"
|
||||||
|
],
|
||||||
|
"markers": "python_version < '3.10'",
|
||||||
|
"version": "==4.11.3"
|
||||||
|
},
|
||||||
"iniconfig": {
|
"iniconfig": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
|
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
|
||||||
@@ -1110,6 +1003,60 @@
|
|||||||
],
|
],
|
||||||
"version": "==1.1.1"
|
"version": "==1.1.1"
|
||||||
},
|
},
|
||||||
|
"jinja2": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8",
|
||||||
|
"sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"
|
||||||
|
],
|
||||||
|
"markers": "python_version >= '3.6'",
|
||||||
|
"version": "==3.0.3"
|
||||||
|
},
|
||||||
|
"markupsafe": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003",
|
||||||
|
"sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88",
|
||||||
|
"sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5",
|
||||||
|
"sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7",
|
||||||
|
"sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a",
|
||||||
|
"sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603",
|
||||||
|
"sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1",
|
||||||
|
"sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135",
|
||||||
|
"sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247",
|
||||||
|
"sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6",
|
||||||
|
"sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601",
|
||||||
|
"sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77",
|
||||||
|
"sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02",
|
||||||
|
"sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e",
|
||||||
|
"sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63",
|
||||||
|
"sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f",
|
||||||
|
"sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980",
|
||||||
|
"sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b",
|
||||||
|
"sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812",
|
||||||
|
"sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff",
|
||||||
|
"sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96",
|
||||||
|
"sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1",
|
||||||
|
"sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925",
|
||||||
|
"sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a",
|
||||||
|
"sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6",
|
||||||
|
"sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e",
|
||||||
|
"sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f",
|
||||||
|
"sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4",
|
||||||
|
"sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f",
|
||||||
|
"sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3",
|
||||||
|
"sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c",
|
||||||
|
"sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a",
|
||||||
|
"sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417",
|
||||||
|
"sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a",
|
||||||
|
"sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a",
|
||||||
|
"sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37",
|
||||||
|
"sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452",
|
||||||
|
"sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933",
|
||||||
|
"sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a",
|
||||||
|
"sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"
|
||||||
|
],
|
||||||
|
"markers": "python_version >= '3.7'",
|
||||||
|
"version": "==2.1.1"
|
||||||
|
},
|
||||||
"packaging": {
|
"packaging": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb",
|
"sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb",
|
||||||
@@ -1134,6 +1081,14 @@
|
|||||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||||
"version": "==1.11.0"
|
"version": "==1.11.0"
|
||||||
},
|
},
|
||||||
|
"pygments": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65",
|
||||||
|
"sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a"
|
||||||
|
],
|
||||||
|
"markers": "python_version >= '3.5'",
|
||||||
|
"version": "==2.11.2"
|
||||||
|
},
|
||||||
"pyparsing": {
|
"pyparsing": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea",
|
"sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea",
|
||||||
@@ -1174,6 +1129,95 @@
|
|||||||
"index": "pypi",
|
"index": "pypi",
|
||||||
"version": "==1.11.0"
|
"version": "==1.11.0"
|
||||||
},
|
},
|
||||||
|
"pytz": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c",
|
||||||
|
"sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"
|
||||||
|
],
|
||||||
|
"version": "==2021.3"
|
||||||
|
},
|
||||||
|
"requests": {
|
||||||
|
"extras": [
|
||||||
|
"socks"
|
||||||
|
],
|
||||||
|
"hashes": [
|
||||||
|
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
|
||||||
|
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
|
||||||
|
],
|
||||||
|
"index": "pypi",
|
||||||
|
"version": "==2.27.1"
|
||||||
|
},
|
||||||
|
"snowballstemmer": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1",
|
||||||
|
"sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"
|
||||||
|
],
|
||||||
|
"version": "==2.2.0"
|
||||||
|
},
|
||||||
|
"sphinx": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:5da895959511473857b6d0200f56865ed62c31e8f82dd338063b84ec022701fe",
|
||||||
|
"sha256:6caad9786055cb1fa22b4a365c1775816b876f91966481765d7d50e9f0dd35cc"
|
||||||
|
],
|
||||||
|
"index": "pypi",
|
||||||
|
"version": "==4.4.0"
|
||||||
|
},
|
||||||
|
"sphinx-rtd-theme": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:4d35a56f4508cfee4c4fb604373ede6feae2a306731d533f409ef5c3496fdbd8",
|
||||||
|
"sha256:eec6d497e4c2195fa0e8b2016b337532b8a699a68bcb22a512870e16925c6a5c"
|
||||||
|
],
|
||||||
|
"index": "pypi",
|
||||||
|
"version": "==1.0.0"
|
||||||
|
},
|
||||||
|
"sphinxcontrib-applehelp": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a",
|
||||||
|
"sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58"
|
||||||
|
],
|
||||||
|
"markers": "python_version >= '3.5'",
|
||||||
|
"version": "==1.0.2"
|
||||||
|
},
|
||||||
|
"sphinxcontrib-devhelp": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e",
|
||||||
|
"sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4"
|
||||||
|
],
|
||||||
|
"markers": "python_version >= '3.5'",
|
||||||
|
"version": "==1.0.2"
|
||||||
|
},
|
||||||
|
"sphinxcontrib-htmlhelp": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07",
|
||||||
|
"sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2"
|
||||||
|
],
|
||||||
|
"markers": "python_version >= '3.6'",
|
||||||
|
"version": "==2.0.0"
|
||||||
|
},
|
||||||
|
"sphinxcontrib-jsmath": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178",
|
||||||
|
"sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"
|
||||||
|
],
|
||||||
|
"markers": "python_version >= '3.5'",
|
||||||
|
"version": "==1.0.1"
|
||||||
|
},
|
||||||
|
"sphinxcontrib-qthelp": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72",
|
||||||
|
"sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6"
|
||||||
|
],
|
||||||
|
"markers": "python_version >= '3.5'",
|
||||||
|
"version": "==1.0.3"
|
||||||
|
},
|
||||||
|
"sphinxcontrib-serializinghtml": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd",
|
||||||
|
"sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952"
|
||||||
|
],
|
||||||
|
"markers": "python_version >= '3.5'",
|
||||||
|
"version": "==1.1.5"
|
||||||
|
},
|
||||||
"tomli": {
|
"tomli": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
|
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
|
||||||
@@ -1181,6 +1225,22 @@
|
|||||||
],
|
],
|
||||||
"markers": "python_version >= '3.7'",
|
"markers": "python_version >= '3.7'",
|
||||||
"version": "==2.0.1"
|
"version": "==2.0.1"
|
||||||
|
},
|
||||||
|
"urllib3": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
|
||||||
|
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
|
||||||
|
],
|
||||||
|
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
|
||||||
|
"version": "==1.26.8"
|
||||||
|
},
|
||||||
|
"zipp": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d",
|
||||||
|
"sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375"
|
||||||
|
],
|
||||||
|
"markers": "python_version >= '3.7'",
|
||||||
|
"version": "==3.7.0"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,33 +1,47 @@
|
|||||||
from typing import List
|
from typing import List
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
import tempfile
|
||||||
|
import json
|
||||||
|
import io
|
||||||
|
|
||||||
from sqlalchemy.orm import registry
|
from sqlalchemy.orm import registry
|
||||||
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean
|
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean
|
||||||
import pytesseract
|
import pytesseract
|
||||||
import PIL
|
import PIL
|
||||||
import io
|
|
||||||
import exiftool
|
import exiftool
|
||||||
import json
|
|
||||||
import os
|
|
||||||
|
|
||||||
from .utils import make_request
|
from .utils import make_request
|
||||||
|
|
||||||
mapper_registry = registry()
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class ScraperResult:
|
class ScraperResult:
|
||||||
"""A minimally processed result from a scraper"""
|
"""A minimally processed result from a scraper
|
||||||
|
"""
|
||||||
|
|
||||||
|
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
|
||||||
scraper: str
|
scraper: str
|
||||||
|
|
||||||
|
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
|
||||||
platform: str
|
platform: str
|
||||||
|
|
||||||
|
#: Foreign key of channel ID that this was scraped from
|
||||||
channel: int
|
channel: int
|
||||||
|
|
||||||
|
#: String that uniquely identifies the scraped post on the given platform, e.g. ``"1503397267675533313"``
|
||||||
platform_id: str
|
platform_id: str
|
||||||
|
|
||||||
|
#: Datetime (relative to UTC) that the scraped post was created at.
|
||||||
date: datetime
|
date: datetime
|
||||||
|
|
||||||
|
#: JSON dump of dict that contains all data scraped for the post.
|
||||||
raw_data: str
|
raw_data: str
|
||||||
|
|
||||||
|
#: Datetime (relative to UTC) that the scraped post was archived at.
|
||||||
date_archived: datetime
|
date_archived: datetime
|
||||||
|
|
||||||
|
#: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files.
|
||||||
archived_urls: dict
|
archived_urls: dict
|
||||||
|
|
||||||
|
|
||||||
raw_data_table = Table('raw_data', mapper_registry.metadata,
|
raw_data_table = Table('raw_data', mapper_registry.metadata,
|
||||||
Column('id', Integer, primary_key=True,
|
Column('id', Integer, primary_key=True,
|
||||||
autoincrement=True),
|
autoincrement=True),
|
||||||
@@ -40,22 +54,45 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
|
|||||||
Column('date_archived', DateTime),
|
Column('date_archived', DateTime),
|
||||||
Column('archived_urls', JSON))
|
Column('archived_urls', JSON))
|
||||||
|
|
||||||
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Channel:
|
class Channel:
|
||||||
|
"""Information about a specific channel to be scraped.
|
||||||
|
"""
|
||||||
|
|
||||||
|
#: Name of channel (different from username because it can be non-unique and contain emojis), e.g. ``"🕊Редакция Президент Гордон🕊"``.
|
||||||
name: str
|
name: str
|
||||||
|
|
||||||
|
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
|
||||||
platform_id: str
|
platform_id: str
|
||||||
|
|
||||||
|
#: User-specified category for the channel, e.g. ``"explicit_qanon"``.
|
||||||
category: str
|
category: str
|
||||||
|
|
||||||
|
#: Name of platform the given channel is on, e.g. ``"Telegram"``.
|
||||||
platform: str
|
platform: str
|
||||||
|
|
||||||
|
#: URL for the given channel on the platform, e.g. ``"https://t.me/prezidentgordonteam"``
|
||||||
url: str
|
url: str
|
||||||
|
|
||||||
|
#: Screen name/username of channel.
|
||||||
screenname: str
|
screenname: str
|
||||||
|
|
||||||
|
#: 2 digit country code for the country of origin for the channel, e.g. ``"RU"``.
|
||||||
country: str = None
|
country: str = None
|
||||||
|
|
||||||
|
#: Name of influencer, if channel belongs to an influencer that operates on multiple platforms.
|
||||||
influencer: str = None
|
influencer: str = None
|
||||||
|
|
||||||
|
#: Whether or not the channel is publicly-accessible.
|
||||||
public: bool = None
|
public: bool = None
|
||||||
|
|
||||||
|
#: Whether or not the channel is a chat (i.e. allows users who are not the channel creator to post/message)
|
||||||
chat: bool = None
|
chat: bool = None
|
||||||
|
|
||||||
|
#: Any other additional notes about the channel.
|
||||||
notes: str = ""
|
notes: str = ""
|
||||||
|
|
||||||
|
#: Did the channel come from a researcher or a scraping process?
|
||||||
source: str = None
|
source: str = None
|
||||||
|
|
||||||
def hydrate(self):
|
def hydrate(self):
|
||||||
@@ -82,26 +119,52 @@ mapper_registry.map_imperatively(Channel, channel_table)
|
|||||||
@dataclass
|
@dataclass
|
||||||
class Post:
|
class Post:
|
||||||
"""An object with fields for columns in the analysis table"""
|
"""An object with fields for columns in the analysis table"""
|
||||||
|
|
||||||
|
#: ID number of the scraped post in the ``raw_data`` table
|
||||||
raw_id: int
|
raw_id: int
|
||||||
|
|
||||||
|
#: Platform specific post ID
|
||||||
platform_id: str
|
platform_id: str
|
||||||
|
|
||||||
|
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
|
||||||
scraper: str
|
scraper: str
|
||||||
|
|
||||||
|
#: String specifying name and version of transformer used to transform result, e.g. ``"TwitterTransformer 0.0.1"``.
|
||||||
transformer: str
|
transformer: str
|
||||||
|
|
||||||
|
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
|
||||||
platform: str
|
platform: str
|
||||||
|
|
||||||
|
#: User-specified integer that uniquely identifies a channel, e.g. ``15``.
|
||||||
channel: int
|
channel: int
|
||||||
|
|
||||||
|
#: Datetime (relative to UTC) that the scraped post was created at.
|
||||||
date: datetime
|
date: datetime
|
||||||
|
|
||||||
|
#: Datetime (relative to UTC) that the scraped post was archived at.
|
||||||
date_archived: datetime
|
date_archived: datetime
|
||||||
|
|
||||||
|
#: URL of the original post
|
||||||
url: str
|
url: str
|
||||||
|
|
||||||
|
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
|
||||||
author_id: str
|
author_id: str
|
||||||
|
|
||||||
|
#: Username of author who made post.
|
||||||
author_username: str
|
author_username: str
|
||||||
|
|
||||||
|
#: Text of the original post
|
||||||
content: str
|
content: str
|
||||||
|
|
||||||
|
#: The ID of the Channel that the post was forwarded or quoted from
|
||||||
forwarded_from: int = None
|
forwarded_from: int = None
|
||||||
|
|
||||||
|
#: The ID of the Post that this Post is a reply to or reblog of
|
||||||
reply_to: int = None
|
reply_to: int = None
|
||||||
|
|
||||||
def hydrate(self):
|
def hydrate(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
post_table = Table('posts', mapper_registry.metadata,
|
post_table = Table('posts', mapper_registry.metadata,
|
||||||
Column('id', Integer, primary_key=True,
|
Column('id', Integer, primary_key=True,
|
||||||
autoincrement=True),
|
autoincrement=True),
|
||||||
@@ -125,39 +188,64 @@ mapper_registry.map_imperatively(Post, post_table)
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Media:
|
class Media:
|
||||||
|
"""Base class for organizing information about a media file.
|
||||||
|
"""
|
||||||
|
|
||||||
|
#: ID number of the media's corresponding scraped post in the ``raw_data`` table.
|
||||||
raw_id: int
|
raw_id: int
|
||||||
|
|
||||||
|
#: ID number of the media's corresponding post in the ``posts`` table.
|
||||||
post: int
|
post: int
|
||||||
|
|
||||||
|
#: URL of the original post.
|
||||||
url: str
|
url: str
|
||||||
|
|
||||||
|
#: Original URL of the media from the original post.
|
||||||
original_url: str
|
original_url: str
|
||||||
|
|
||||||
|
#: JSON dump of the dict containing metadata information for the media file.
|
||||||
exif: str = None
|
exif: str = None
|
||||||
|
|
||||||
def get_blob(self):
|
def get_blob(self):
|
||||||
|
"""Download media file as bytes blob.
|
||||||
|
"""
|
||||||
|
|
||||||
blob = make_request(self.url)
|
blob = make_request(self.url)
|
||||||
return blob.content
|
return blob.content
|
||||||
|
|
||||||
def hydrate(self, blob = None):
|
def hydrate(self, blob = None):
|
||||||
|
"""Download media file as bytes blob and extract data from content.
|
||||||
|
"""
|
||||||
|
|
||||||
if blob is None:
|
if blob is None:
|
||||||
blob = self.get_blob()
|
blob = self.get_blob()
|
||||||
|
|
||||||
self.hydrate_exif(blob)
|
self.hydrate_exif(blob)
|
||||||
|
|
||||||
def hydrate_exif(self, blob):
|
def hydrate_exif(self, blob):
|
||||||
f = open('tmp', 'wb')
|
"""Extract Exif metadata from bytes blob.
|
||||||
f.write(blob)
|
"""
|
||||||
f.close()
|
|
||||||
|
|
||||||
with exiftool.ExifTool() as et:
|
with tempfile.NamedTemporaryFile() as temp_file:
|
||||||
exif = et.get_metadata('tmp')
|
temp_file.write(blob)
|
||||||
self.exif = json.dumps(exif)
|
|
||||||
|
|
||||||
os.remove('tmp')
|
with exiftool.ExifTool() as et:
|
||||||
|
exif = et.get_metadata(temp_file.name)
|
||||||
|
self.exif = json.dumps(exif)
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Image(Media):
|
class Image(Media):
|
||||||
|
"""Class for organizing information about an image file.
|
||||||
|
"""
|
||||||
|
|
||||||
|
#: Extracted OCR content from image
|
||||||
ocr: str = None
|
ocr: str = None
|
||||||
|
|
||||||
def hydrate(self, blob=None):
|
def hydrate(self, blob=None):
|
||||||
|
"""Download image file as bytes blob and extract Exif and OCR content
|
||||||
|
from the image.
|
||||||
|
"""
|
||||||
|
|
||||||
if blob is None:
|
if blob is None:
|
||||||
blob = self.get_blob()
|
blob = self.get_blob()
|
||||||
|
|
||||||
@@ -165,25 +253,62 @@ class Image(Media):
|
|||||||
self.hydrate_ocr(blob)
|
self.hydrate_ocr(blob)
|
||||||
|
|
||||||
def hydrate_ocr(self, blob):
|
def hydrate_ocr(self, blob):
|
||||||
|
"""Extract OCR (optical character recognition) data from image bytes blob.
|
||||||
|
"""
|
||||||
|
|
||||||
image = PIL.Image.open(io.BytesIO(blob))
|
image = PIL.Image.open(io.BytesIO(blob))
|
||||||
self.ocr = pytesseract.image_to_string(image)
|
self.ocr = pytesseract.image_to_string(image)
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Video(Media):
|
class Video(Media):
|
||||||
|
"""Class for organizing information about a video file.
|
||||||
|
"""
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
mapper_registry = registry()
|
||||||
|
|
||||||
|
raw_data_table = Table('raw_data', mapper_registry.metadata,
|
||||||
|
Column('id', Integer, primary_key=True,
|
||||||
|
autoincrement=True),
|
||||||
|
Column('scraper', String),
|
||||||
|
Column('platform', String),
|
||||||
|
Column('channel', Integer),
|
||||||
|
Column('platform_id', String),
|
||||||
|
Column('date', DateTime),
|
||||||
|
Column('raw_data', String),
|
||||||
|
Column('date_archived', DateTime),
|
||||||
|
Column('archived_urls', JSON))
|
||||||
|
|
||||||
|
|
||||||
|
analysis_table = Table('analysis', mapper_registry.metadata,
|
||||||
|
Column('id', Integer, primary_key=True,
|
||||||
|
autoincrement=True),
|
||||||
|
Column('raw_id', Integer, ForeignKey('raw_data.id')),
|
||||||
|
Column('scraper', String),
|
||||||
|
Column('transformer', String),
|
||||||
|
Column('platform', String),
|
||||||
|
Column('channel', Integer),
|
||||||
|
Column('date', DateTime),
|
||||||
|
Column('date_archived', DateTime),
|
||||||
|
Column('url', String),
|
||||||
|
Column('author_id', String),
|
||||||
|
Column('author_username', String),
|
||||||
|
Column('content', String))
|
||||||
|
|
||||||
media_table = Table('media', mapper_registry.metadata,
|
media_table = Table('media', mapper_registry.metadata,
|
||||||
Column('id', Integer, primary_key=True,
|
Column('id', Integer, primary_key=True,
|
||||||
autoincrement=True),
|
autoincrement=True),
|
||||||
Column('type', String),
|
Column('type', String),
|
||||||
Column('raw_id', Integer, ForeignKey('raw_data.id')),
|
Column('raw_id', Integer, ForeignKey('raw_data.id')),
|
||||||
Column('post', Integer, ForeignKey('posts.id')),
|
Column('post', Integer, ForeignKey('posts.id')),
|
||||||
Column('url', String),
|
Column('url', String),
|
||||||
Column('original_url', String),
|
Column('original_url', String),
|
||||||
Column('exif', String),
|
Column('exif', String),
|
||||||
Column('ocr', String)
|
Column('ocr', String))
|
||||||
)
|
|
||||||
|
|
||||||
|
mapper_registry.map_imperatively(TransformedResult, analysis_table)
|
||||||
|
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
|
||||||
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
|
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
|
||||||
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
|
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
|
||||||
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')
|
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')
|
||||||
@@ -14,29 +14,91 @@ from cisticola.base import Channel, ScraperResult, mapper_registry
|
|||||||
from cisticola.utils import make_request
|
from cisticola.utils import make_request
|
||||||
|
|
||||||
class Scraper:
|
class Scraper:
|
||||||
|
"""Base class for defining platform-specific scrapers for scraping all posts
|
||||||
|
from a given channel on that specific platform.
|
||||||
|
"""
|
||||||
|
|
||||||
__version__ = "Scraper 0.0.0"
|
__version__ = "Scraper 0.0.0"
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.s3_client = boto3.client('s3',
|
|
||||||
region_name=os.environ['DO_SPACES_REGION'],
|
|
||||||
endpoint_url='https://{}.digitaloceanspaces.com'.format(
|
|
||||||
os.environ['DO_SPACES_REGION']),
|
|
||||||
aws_access_key_id=os.environ['DO_SPACES_KEY'],
|
|
||||||
aws_secret_access_key=os.environ['DO_SPACES_SECRET'])
|
|
||||||
|
|
||||||
|
# Initialize client to transfer files to the storage archive
|
||||||
|
self.s3_client = boto3.client(
|
||||||
|
service_name='s3',
|
||||||
|
region_name=os.environ['DO_SPACES_REGION'],
|
||||||
|
endpoint_url=f'https://{os.environ["DO_SPACES_REGION"]}.digitaloceanspaces.com',
|
||||||
|
aws_access_key_id=os.environ['DO_SPACES_KEY'],
|
||||||
|
aws_secret_access_key=os.environ['DO_SPACES_SECRET'])
|
||||||
|
|
||||||
|
# Define request headers (necessary to bypass scraping protection
|
||||||
|
# for several platform scrapers)
|
||||||
self.headers = {
|
self.headers = {
|
||||||
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}
|
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}
|
||||||
|
|
||||||
pass
|
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.__version__
|
return self.__version__
|
||||||
|
|
||||||
|
def get_username_from_url(self, url: str) -> str:
|
||||||
|
"""Extract a channel's username from its URL.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
url: str
|
||||||
|
URL of the channel on a given platform
|
||||||
|
e.g. ``"https://twitter.com/EliotHiggins"``
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
username: str
|
||||||
|
Extracted username of the channel.
|
||||||
|
e.g. ``"EliotHiggins"``
|
||||||
|
"""
|
||||||
|
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
def url_to_key(self, url: str, content_type: str) -> str:
|
def url_to_key(self, url: str, content_type: str) -> str:
|
||||||
|
"""Generate a unique identifier for media from a specified post.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
url: str
|
||||||
|
URL of original post.
|
||||||
|
e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
|
||||||
|
content_type: str
|
||||||
|
Content-Type of media.
|
||||||
|
e.g. ``"image/jpeg"``
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
key: str
|
||||||
|
Unique identifier for the media file from a specified post based on
|
||||||
|
the original post URL and the media's Content-Type.
|
||||||
|
"""
|
||||||
|
|
||||||
key = urlparse(url).path.split('/')[-1]
|
key = urlparse(url).path.split('/')[-1]
|
||||||
return key
|
return key
|
||||||
|
|
||||||
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||||
|
"""Download media file from a specified media file URL.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
url: str
|
||||||
|
URL of media file from original post.
|
||||||
|
e.g. ``"https://pbs.twimg.com/media/FN0j0dYWUAcQxfK?format=png&name=medium"``
|
||||||
|
key: str or None
|
||||||
|
Pre-defined unique identifier for the media file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
blob: bytes
|
||||||
|
Raw bytes of the downloaded media file.
|
||||||
|
content_type: str
|
||||||
|
Content-Type of media.
|
||||||
|
e.g. ``"image/jpeg"``.
|
||||||
|
key: str
|
||||||
|
Unique identifier for the media file.
|
||||||
|
"""
|
||||||
|
|
||||||
r = make_request(url, headers = self.headers)
|
r = make_request(url, headers = self.headers)
|
||||||
|
|
||||||
@@ -49,6 +111,27 @@ class Scraper:
|
|||||||
return blob, content_type, key
|
return blob, content_type, key
|
||||||
|
|
||||||
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||||
|
"""Download media file from a specified media URL, where the media file
|
||||||
|
is formatted as an m3u8 playlist, which is then decoded to an mp4 file.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
url: str
|
||||||
|
URL of m3u8 playlist file from original post.
|
||||||
|
e.g. ``"https://media.gettr.com/group47/origin/2022/03/15/01/cbc436c1-1a1a-4b97-671d-c42109f3ec9b/out.m3u8"``
|
||||||
|
key: str or None
|
||||||
|
Pre-defined unique identifier for the media file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
blob: bytes
|
||||||
|
Raw bytes of the downloaded media file.
|
||||||
|
content_type: str
|
||||||
|
Content-Type of media.
|
||||||
|
e.g. ``"video/mp4"``.
|
||||||
|
key: str
|
||||||
|
Unique identifier for the media file.
|
||||||
|
"""
|
||||||
|
|
||||||
content_type = 'video/mp4'
|
content_type = 'video/mp4'
|
||||||
ext = '.' + content_type.split('/')[-1]
|
ext = '.' + content_type.split('/')[-1]
|
||||||
@@ -71,7 +154,28 @@ class Scraper:
|
|||||||
return blob, content_type, key
|
return blob, content_type, key
|
||||||
|
|
||||||
def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||||
|
"""Download media file from a specified media URL, using a fork of
|
||||||
|
youtube-dl that enables faster downloading.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
url: str
|
||||||
|
URL of media file from original post.
|
||||||
|
e.g. ``"https://rumble.com/embed/vgt7gh/"``
|
||||||
|
key: str or None
|
||||||
|
Pre-defined unique identifier for the media file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
blob: bytes
|
||||||
|
Raw bytes of the downloaded media file.
|
||||||
|
content_type: str
|
||||||
|
Content-Type of media.
|
||||||
|
e.g. ``"video/mp4"``.
|
||||||
|
key: str
|
||||||
|
Unique identifier for the media file.
|
||||||
|
"""
|
||||||
|
|
||||||
content_type = 'video/mp4'
|
content_type = 'video/mp4'
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as temp_dir:
|
with tempfile.TemporaryDirectory() as temp_dir:
|
||||||
@@ -103,6 +207,23 @@ class Scraper:
|
|||||||
return blob, content_type, key
|
return blob, content_type, key
|
||||||
|
|
||||||
def archive_blob(self, blob: bytes, content_type: str, key: str) -> str:
|
def archive_blob(self, blob: bytes, content_type: str, key: str) -> str:
|
||||||
|
"""Upload raw bytes of a media file to the storage archive.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
blob: bytes
|
||||||
|
Raw bytes of the media file to be archived.
|
||||||
|
content_type: str
|
||||||
|
Content-Type of media.
|
||||||
|
e.g. ``"video/mp4"``.
|
||||||
|
key: str
|
||||||
|
Unique identifier for the media file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
archived_url: str
|
||||||
|
URL specifying the file on the storage archive.
|
||||||
|
"""
|
||||||
|
|
||||||
filename = self.__version__.replace(' ', '_') + '/' + key
|
filename = self.__version__.replace(' ', '_') + '/' + key
|
||||||
|
|
||||||
@@ -114,9 +235,42 @@ class Scraper:
|
|||||||
return archived_url
|
return archived_url
|
||||||
|
|
||||||
def can_handle(self, channel: Channel) -> bool:
|
def can_handle(self, channel: Channel) -> bool:
|
||||||
|
"""Whether or not the scraper can scrape the specified channel.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
channel: Channel
|
||||||
|
Channel to be scraped.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
bool
|
||||||
|
``True`` if the scraper is capable of scraping ``channel``,
|
||||||
|
``False`` if not.
|
||||||
|
"""
|
||||||
|
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||||
|
"""Scrape all posts from the specified Channel.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
channel: Channel
|
||||||
|
Channel to be scraped.
|
||||||
|
since: ScraperResult or None
|
||||||
|
Most recently scraped ScraperResult from a previous scrape, or
|
||||||
|
``None`` if scraper has not run before.
|
||||||
|
archive_media: bool
|
||||||
|
If ``True``, any media files (images, video, etc.) from posts are archived.
|
||||||
|
If ``False``, media files are not archived.
|
||||||
|
|
||||||
|
Yields
|
||||||
|
------
|
||||||
|
ScraperResult
|
||||||
|
Scraper result from a single post/comment from the specified Channel.
|
||||||
|
"""
|
||||||
|
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
@@ -129,9 +283,13 @@ class ScraperController:
|
|||||||
self.session = None
|
self.session = None
|
||||||
|
|
||||||
def register_scraper(self, scraper: Scraper):
|
def register_scraper(self, scraper: Scraper):
|
||||||
|
"""Register a single Scraper instance to the controller.
|
||||||
|
"""
|
||||||
self.scrapers.append(scraper)
|
self.scrapers.append(scraper)
|
||||||
|
|
||||||
def register_scrapers(self, scraper: List[Scraper]):
|
def register_scrapers(self, scraper: List[Scraper]):
|
||||||
|
"""Register a list of Scraper instances to the controller.
|
||||||
|
"""
|
||||||
self.scrapers.extend(scraper)
|
self.scrapers.extend(scraper)
|
||||||
|
|
||||||
def scrape_all_channels(self, archive_media: bool = True):
|
def scrape_all_channels(self, archive_media: bool = True):
|
||||||
@@ -147,6 +305,17 @@ class ScraperController:
|
|||||||
|
|
||||||
@logger.catch(reraise = True)
|
@logger.catch(reraise = True)
|
||||||
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
|
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
|
||||||
|
"""Scrape all posts for all specified channels.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
channels: list<Channel>
|
||||||
|
List of Channel instances to be scraped
|
||||||
|
archive_media: bool
|
||||||
|
If ``True``, any media files (images, video, etc.) from posts are archived.
|
||||||
|
If ``False``, media files are not archived.
|
||||||
|
"""
|
||||||
|
|
||||||
if self.session is None:
|
if self.session is None:
|
||||||
logger.error("No DB session")
|
logger.error("No DB session")
|
||||||
return
|
return
|
||||||
@@ -185,6 +354,9 @@ class ScraperController:
|
|||||||
logger.warning(f"No handler found for Channel {channel}")
|
logger.warning(f"No handler found for Channel {channel}")
|
||||||
|
|
||||||
def connect_to_db(self, engine):
|
def connect_to_db(self, engine):
|
||||||
|
"""Connect the specified SQLAlchemy engine to the controller.
|
||||||
|
"""
|
||||||
|
|
||||||
# create tables
|
# create tables
|
||||||
mapper_registry.metadata.create_all(bind=engine)
|
mapper_registry.metadata.create_all(bind=engine)
|
||||||
|
|
||||||
@@ -193,8 +365,8 @@ class ScraperController:
|
|||||||
self.session.configure(bind=self.engine)
|
self.session.configure(bind=self.engine)
|
||||||
|
|
||||||
def reset_db(self):
|
def reset_db(self):
|
||||||
|
"""Drop all data from the connected SQLAlchemy database.
|
||||||
|
"""
|
||||||
|
|
||||||
mapper_registry.metadata.drop_all(bind=self.engine)
|
mapper_registry.metadata.drop_all(bind=self.engine)
|
||||||
self.connect_to_db(self.engine)
|
self.connect_to_db(self.engine)
|
||||||
|
|
||||||
|
|
||||||
@@ -17,7 +17,7 @@ class BitchuteScraper(Scraper):
|
|||||||
library"""
|
library"""
|
||||||
__version__ = "BitchuteScraper 0.0.1"
|
__version__ = "BitchuteScraper 0.0.1"
|
||||||
|
|
||||||
def get_username_from_url(url):
|
def get_username_from_url(self, url):
|
||||||
username = url.split('bitchute.com/channel/')[-1].strip('/')
|
username = url.split('bitchute.com/channel/')[-1].strip('/')
|
||||||
|
|
||||||
return username
|
return username
|
||||||
@@ -33,7 +33,7 @@ class BitchuteScraper(Scraper):
|
|||||||
|
|
||||||
detail = 'comments'
|
detail = 'comments'
|
||||||
|
|
||||||
username = BitchuteScraper.get_username_from_url(channel.url)
|
username = self.get_username_from_url(channel.url)
|
||||||
scraper = get_videos_user(session, username, csrftoken, detail)
|
scraper = get_videos_user(session, username, csrftoken, detail)
|
||||||
|
|
||||||
for post in scraper:
|
for post in scraper:
|
||||||
@@ -61,7 +61,7 @@ class BitchuteScraper(Scraper):
|
|||||||
archived_urls=archived_urls)
|
archived_urls=archived_urls)
|
||||||
|
|
||||||
def can_handle(self, channel):
|
def can_handle(self, channel):
|
||||||
if channel.platform == "Bitchute" and BitchuteScraper.get_username_from_url(channel.url) is not None:
|
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||||
|
|||||||
@@ -11,14 +11,14 @@ class GabScraper(Scraper):
|
|||||||
"""An implementation of a Scraper for Gab, using GARC library"""
|
"""An implementation of a Scraper for Gab, using GARC library"""
|
||||||
__version__ = "GabScraper 0.0.1"
|
__version__ = "GabScraper 0.0.1"
|
||||||
|
|
||||||
def get_username_from_url(url):
|
def get_username_from_url(self, url):
|
||||||
username = url.split('https://gab.com/')[-1]
|
username = url.split('https://gab.com/')[-1]
|
||||||
|
|
||||||
return username
|
return username
|
||||||
|
|
||||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||||
client = Garc(profile = 'main')
|
client = Garc(profile = 'main')
|
||||||
username = GabScraper.get_username_from_url(channel.url)
|
username = self.get_username_from_url(channel.url)
|
||||||
|
|
||||||
scraper = client.userposts(username)
|
scraper = client.userposts(username)
|
||||||
|
|
||||||
@@ -52,5 +52,5 @@ class GabScraper(Scraper):
|
|||||||
archived_urls=archived_urls)
|
archived_urls=archived_urls)
|
||||||
|
|
||||||
def can_handle(self, channel):
|
def can_handle(self, channel):
|
||||||
if channel.platform == "Gab" and GabScraper.get_username_from_url(channel.url) is not None:
|
if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:
|
||||||
return True
|
return True
|
||||||
@@ -12,7 +12,7 @@ class GettrScraper(Scraper):
|
|||||||
"""An implementation of a Scraper for Gettr, using gogettr library"""
|
"""An implementation of a Scraper for Gettr, using gogettr library"""
|
||||||
__version__ = "GettrScraper 0.0.1"
|
__version__ = "GettrScraper 0.0.1"
|
||||||
|
|
||||||
def get_username_from_url(url):
|
def get_username_from_url(self, url):
|
||||||
username = url.split("gettr.com/user/")[1]
|
username = url.split("gettr.com/user/")[1]
|
||||||
if len(username.split("/")) > 1:
|
if len(username.split("/")) > 1:
|
||||||
return None
|
return None
|
||||||
@@ -21,7 +21,7 @@ class GettrScraper(Scraper):
|
|||||||
|
|
||||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||||
client = PublicClient()
|
client = PublicClient()
|
||||||
username = GettrScraper.get_username_from_url(channel.url)
|
username = self.get_username_from_url(channel.url)
|
||||||
scraper = client.user_activity(username=username, type="posts")
|
scraper = client.user_activity(username=username, type="posts")
|
||||||
|
|
||||||
for post in scraper:
|
for post in scraper:
|
||||||
@@ -62,7 +62,7 @@ class GettrScraper(Scraper):
|
|||||||
archived_urls=archived_urls)
|
archived_urls=archived_urls)
|
||||||
|
|
||||||
def can_handle(self, channel):
|
def can_handle(self, channel):
|
||||||
if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None:
|
if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def url_to_key(self, url: str, content_type: str) -> str:
|
def url_to_key(self, url: str, content_type: str) -> str:
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ CONTENT_TYPES = {
|
|||||||
'mp4' : 'video/mp4'}
|
'mp4' : 'video/mp4'}
|
||||||
|
|
||||||
class InstagramScraper(Scraper):
|
class InstagramScraper(Scraper):
|
||||||
|
"""An implementation of a Scraper for Instagram, using instaloader library"""
|
||||||
__version__ = "InstagramScraper 0.0.1"
|
__version__ = "InstagramScraper 0.0.1"
|
||||||
|
|
||||||
def get_username_from_url(self, url):
|
def get_username_from_url(self, url):
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ class OdyseeScraper(Scraper):
|
|||||||
"""An implementation of a Scraper for Odysee, using polyphemus library"""
|
"""An implementation of a Scraper for Odysee, using polyphemus library"""
|
||||||
__version__ = "OdyseeScraper 0.0.1"
|
__version__ = "OdyseeScraper 0.0.1"
|
||||||
|
|
||||||
def get_username_from_url(url):
|
def get_username_from_url(self, url):
|
||||||
|
|
||||||
username = url.split('odysee.com/')[-1].strip('@').split(':')[0]
|
username = url.split('odysee.com/')[-1].strip('@').split(':')[0]
|
||||||
|
|
||||||
@@ -21,7 +21,7 @@ class OdyseeScraper(Scraper):
|
|||||||
|
|
||||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||||
|
|
||||||
username = OdyseeScraper.get_username_from_url(channel.url)
|
username = self.get_username_from_url(channel.url)
|
||||||
odysee_channel = OdyseeChannel(channel_name = username)
|
odysee_channel = OdyseeChannel(channel_name = username)
|
||||||
|
|
||||||
all_videos = odysee_channel.get_all_videos()
|
all_videos = odysee_channel.get_all_videos()
|
||||||
@@ -70,7 +70,7 @@ class OdyseeScraper(Scraper):
|
|||||||
archived_urls={})
|
archived_urls={})
|
||||||
|
|
||||||
def can_handle(self, channel):
|
def can_handle(self, channel):
|
||||||
if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None:
|
if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def url_to_key(self, url: str, content_type: str) -> str:
|
def url_to_key(self, url: str, content_type: str) -> str:
|
||||||
|
|||||||
@@ -14,14 +14,14 @@ class RumbleScraper(Scraper):
|
|||||||
"""An implementation of a Scraper for Rumble, using custom functions"""
|
"""An implementation of a Scraper for Rumble, using custom functions"""
|
||||||
__version__ = "RumbleScraper 0.0.1"
|
__version__ = "RumbleScraper 0.0.1"
|
||||||
|
|
||||||
def get_username_from_url(url):
|
def get_username_from_url(self, url):
|
||||||
username = url.split('https://rumble.com/c/')[1]
|
username = url.split('https://rumble.com/c/')[1]
|
||||||
|
|
||||||
return username
|
return username
|
||||||
|
|
||||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||||
|
|
||||||
username = RumbleScraper.get_username_from_url(channel.url)
|
username = self.get_username_from_url(channel.url)
|
||||||
scraper = get_channel_videos(username)
|
scraper = get_channel_videos(username)
|
||||||
|
|
||||||
for post in scraper:
|
for post in scraper:
|
||||||
@@ -54,7 +54,7 @@ class RumbleScraper(Scraper):
|
|||||||
return key
|
return key
|
||||||
|
|
||||||
def can_handle(self, channel):
|
def can_handle(self, channel):
|
||||||
if channel.platform == "Rumble" and RumbleScraper.get_username_from_url(channel.url) is not None:
|
if channel.platform == "Rumble" and self.get_username_from_url(channel.url) is not None:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ from cisticola.base import Channel, ScraperResult
|
|||||||
from cisticola.scraper.base import Scraper
|
from cisticola.scraper.base import Scraper
|
||||||
|
|
||||||
class TelegramSnscrapeScraper(Scraper):
|
class TelegramSnscrapeScraper(Scraper):
|
||||||
|
"""An implementation of a Scraper for Telegram, using snscrape library"""
|
||||||
__version__ = "TelegramSnscrapeScraper 0.0.1"
|
__version__ = "TelegramSnscrapeScraper 0.0.1"
|
||||||
|
|
||||||
def can_handle(self, channel):
|
def can_handle(self, channel):
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ from cisticola.scraper.base import Scraper
|
|||||||
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
|
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
|
||||||
|
|
||||||
class TelegramTelethonScraper(Scraper):
|
class TelegramTelethonScraper(Scraper):
|
||||||
|
"""An implementation of a Scraper for Telegram, using Telethon library"""
|
||||||
__version__ = "TelegramTelethonScraper 0.0.1"
|
__version__ = "TelegramTelethonScraper 0.0.1"
|
||||||
|
|
||||||
def get_username_from_url(self, url):
|
def get_username_from_url(self, url):
|
||||||
@@ -30,9 +31,9 @@ class TelegramTelethonScraper(Scraper):
|
|||||||
|
|
||||||
username = self.get_username_from_url(channel.url)
|
username = self.get_username_from_url(channel.url)
|
||||||
|
|
||||||
api_id = os.environ['TELEGRAM_API_ID_1']
|
api_id = os.environ['TELEGRAM_API_ID']
|
||||||
api_hash = os.environ['TELEGRAM_API_HASH_1']
|
api_hash = os.environ['TELEGRAM_API_HASH']
|
||||||
phone = os.environ['TELEGRAM_PHONE_1']
|
phone = os.environ['TELEGRAM_PHONE']
|
||||||
|
|
||||||
with TelegramClient(phone, api_id, api_hash) as client:
|
with TelegramClient(phone, api_id, api_hash) as client:
|
||||||
|
|
||||||
|
|||||||
@@ -8,12 +8,24 @@ SPHINXBUILD ?= sphinx-build
|
|||||||
SOURCEDIR = source
|
SOURCEDIR = source
|
||||||
BUILDDIR = build
|
BUILDDIR = build
|
||||||
|
|
||||||
|
SPHINXAPIDOC = sphinx-apidoc
|
||||||
|
APIDOCFLAGS = --separate --private --module-first
|
||||||
|
MODULEPATH = ../cisticola
|
||||||
|
SOURCEFILES = cisticola.*
|
||||||
|
MODULEFILE = modules.rst
|
||||||
|
|
||||||
# Put it first so that "make" without argument is like "make help".
|
# Put it first so that "make" without argument is like "make help".
|
||||||
help:
|
help:
|
||||||
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||||
|
|
||||||
.PHONY: help Makefile
|
.PHONY: help Makefile
|
||||||
|
|
||||||
|
# Custom process and flags for generating Sphinx sources
|
||||||
|
apidoc:
|
||||||
|
rm $(SOURCEDIR)/$(SOURCEFILES)
|
||||||
|
$(SPHINXAPIDOC) $(APIDOCFLAGS) -o "$(SOURCEDIR)" "$(MODULEPATH)"
|
||||||
|
rm $(SOURCEDIR)/$(MODULEFILE)
|
||||||
|
|
||||||
# Catch-all target: route all unknown targets to Sphinx using the new
|
# Catch-all target: route all unknown targets to Sphinx using the new
|
||||||
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
||||||
%: Makefile
|
%: Makefile
|
||||||
|
|||||||
64
docs/images/cisticola_logo.svg
Normal file
64
docs/images/cisticola_logo.svg
Normal file
File diff suppressed because one or more lines are too long
|
After Width: | Height: | Size: 7.0 KiB |
BIN
docs/images/favicon.ico
Normal file
BIN
docs/images/favicon.ico
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 614 B |
@@ -10,6 +10,12 @@ if "%SPHINXBUILD%" == "" (
|
|||||||
set SOURCEDIR=source
|
set SOURCEDIR=source
|
||||||
set BUILDDIR=build
|
set BUILDDIR=build
|
||||||
|
|
||||||
|
set SPHINXAPIDOC=sphinx-apidoc
|
||||||
|
set APIDOCFLAGS=--separate --private --module-first
|
||||||
|
set MODULEPATH=../cisticola
|
||||||
|
set SOURCEFILES=cisticola.*
|
||||||
|
set MODULEFILE=modules.rst
|
||||||
|
|
||||||
if "%1" == "" goto help
|
if "%1" == "" goto help
|
||||||
|
|
||||||
%SPHINXBUILD% >NUL 2>NUL
|
%SPHINXBUILD% >NUL 2>NUL
|
||||||
@@ -28,6 +34,11 @@ if errorlevel 9009 (
|
|||||||
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||||
goto end
|
goto end
|
||||||
|
|
||||||
|
:apidoc
|
||||||
|
del %SOURCEDIR%\%SOURCEFILES%
|
||||||
|
%SPHINXAPIDOC% %APIDOCFLAGS% -o %SOURCEDIR% %MODULEPATH%
|
||||||
|
del %SOURCEDIR%\%MODULEFILE%
|
||||||
|
|
||||||
:help
|
:help
|
||||||
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||||
|
|
||||||
|
|||||||
8
docs/source/cisticola.base.rst
Normal file
8
docs/source/cisticola.base.rst
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
cisticola.base module
|
||||||
|
=====================
|
||||||
|
|
||||||
|
.. automodule:: cisticola.base
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
@@ -1,6 +1,12 @@
|
|||||||
cisticola package
|
cisticola package
|
||||||
=================
|
=================
|
||||||
|
|
||||||
|
.. automodule:: cisticola
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
|
|
||||||
Subpackages
|
Subpackages
|
||||||
-----------
|
-----------
|
||||||
|
|
||||||
@@ -13,18 +19,8 @@ Subpackages
|
|||||||
Submodules
|
Submodules
|
||||||
----------
|
----------
|
||||||
|
|
||||||
cisticola.base module
|
.. toctree::
|
||||||
---------------------
|
:maxdepth: 4
|
||||||
|
|
||||||
.. automodule:: cisticola.base
|
cisticola.base
|
||||||
:members:
|
cisticola.utils
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
Module contents
|
|
||||||
---------------
|
|
||||||
|
|
||||||
.. automodule:: cisticola
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|||||||
8
docs/source/cisticola.scraper.base.rst
Normal file
8
docs/source/cisticola.scraper.base.rst
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
cisticola.scraper.base module
|
||||||
|
=============================
|
||||||
|
|
||||||
|
.. automodule:: cisticola.scraper.base
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
8
docs/source/cisticola.scraper.bitchute.rst
Normal file
8
docs/source/cisticola.scraper.bitchute.rst
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
cisticola.scraper.bitchute module
|
||||||
|
=================================
|
||||||
|
|
||||||
|
.. automodule:: cisticola.scraper.bitchute
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
8
docs/source/cisticola.scraper.gab.rst
Normal file
8
docs/source/cisticola.scraper.gab.rst
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
cisticola.scraper.gab module
|
||||||
|
============================
|
||||||
|
|
||||||
|
.. automodule:: cisticola.scraper.gab
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
8
docs/source/cisticola.scraper.gettr.rst
Normal file
8
docs/source/cisticola.scraper.gettr.rst
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
cisticola.scraper.gettr module
|
||||||
|
==============================
|
||||||
|
|
||||||
|
.. automodule:: cisticola.scraper.gettr
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
8
docs/source/cisticola.scraper.instagram.rst
Normal file
8
docs/source/cisticola.scraper.instagram.rst
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
cisticola.scraper.instagram module
|
||||||
|
==================================
|
||||||
|
|
||||||
|
.. automodule:: cisticola.scraper.instagram
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
8
docs/source/cisticola.scraper.odysee.rst
Normal file
8
docs/source/cisticola.scraper.odysee.rst
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
cisticola.scraper.odysee module
|
||||||
|
===============================
|
||||||
|
|
||||||
|
.. automodule:: cisticola.scraper.odysee
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
@@ -1,37 +1,27 @@
|
|||||||
cisticola.scraper package
|
cisticola.scraper package
|
||||||
=========================
|
=========================
|
||||||
|
|
||||||
Submodules
|
|
||||||
----------
|
|
||||||
|
|
||||||
cisticola.scraper.bitchute module
|
|
||||||
---------------------------------
|
|
||||||
|
|
||||||
.. automodule:: cisticola.scraper.bitchute
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
cisticola.scraper.gettr module
|
|
||||||
------------------------------
|
|
||||||
|
|
||||||
.. automodule:: cisticola.scraper.gettr
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
cisticola.scraper.twitter module
|
|
||||||
--------------------------------
|
|
||||||
|
|
||||||
.. automodule:: cisticola.scraper.twitter
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
Module contents
|
|
||||||
---------------
|
|
||||||
|
|
||||||
.. automodule:: cisticola.scraper
|
.. automodule:: cisticola.scraper
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
|
|
||||||
|
Submodules
|
||||||
|
----------
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 4
|
||||||
|
|
||||||
|
cisticola.scraper.base
|
||||||
|
cisticola.scraper.bitchute
|
||||||
|
cisticola.scraper.gab
|
||||||
|
cisticola.scraper.gettr
|
||||||
|
cisticola.scraper.instagram
|
||||||
|
cisticola.scraper.odysee
|
||||||
|
cisticola.scraper.rumble
|
||||||
|
cisticola.scraper.telegram_snscrape
|
||||||
|
cisticola.scraper.telegram_telethon
|
||||||
|
cisticola.scraper.twitter
|
||||||
|
cisticola.scraper.vkontakte
|
||||||
|
cisticola.scraper.youtube
|
||||||
|
|||||||
8
docs/source/cisticola.scraper.rumble.rst
Normal file
8
docs/source/cisticola.scraper.rumble.rst
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
cisticola.scraper.rumble module
|
||||||
|
===============================
|
||||||
|
|
||||||
|
.. automodule:: cisticola.scraper.rumble
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
8
docs/source/cisticola.scraper.telegram_snscrape.rst
Normal file
8
docs/source/cisticola.scraper.telegram_snscrape.rst
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
cisticola.scraper.telegram\_snscrape module
|
||||||
|
===========================================
|
||||||
|
|
||||||
|
.. automodule:: cisticola.scraper.telegram_snscrape
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
8
docs/source/cisticola.scraper.telegram_telethon.rst
Normal file
8
docs/source/cisticola.scraper.telegram_telethon.rst
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
cisticola.scraper.telegram\_telethon module
|
||||||
|
===========================================
|
||||||
|
|
||||||
|
.. automodule:: cisticola.scraper.telegram_telethon
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
8
docs/source/cisticola.scraper.twitter.rst
Normal file
8
docs/source/cisticola.scraper.twitter.rst
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
cisticola.scraper.twitter module
|
||||||
|
================================
|
||||||
|
|
||||||
|
.. automodule:: cisticola.scraper.twitter
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
8
docs/source/cisticola.scraper.vkontakte.rst
Normal file
8
docs/source/cisticola.scraper.vkontakte.rst
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
cisticola.scraper.vkontakte module
|
||||||
|
==================================
|
||||||
|
|
||||||
|
.. automodule:: cisticola.scraper.vkontakte
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
8
docs/source/cisticola.scraper.youtube.rst
Normal file
8
docs/source/cisticola.scraper.youtube.rst
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
cisticola.scraper.youtube module
|
||||||
|
================================
|
||||||
|
|
||||||
|
.. automodule:: cisticola.scraper.youtube
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
8
docs/source/cisticola.transformer.base.rst
Normal file
8
docs/source/cisticola.transformer.base.rst
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
cisticola.transformer.base module
|
||||||
|
=================================
|
||||||
|
|
||||||
|
.. automodule:: cisticola.transformer.base
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
8
docs/source/cisticola.transformer.bitchute.rst
Normal file
8
docs/source/cisticola.transformer.bitchute.rst
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
cisticola.transformer.bitchute module
|
||||||
|
=====================================
|
||||||
|
|
||||||
|
.. automodule:: cisticola.transformer.bitchute
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
@@ -1,21 +1,18 @@
|
|||||||
cisticola.transformer package
|
cisticola.transformer package
|
||||||
=============================
|
=============================
|
||||||
|
|
||||||
Submodules
|
|
||||||
----------
|
|
||||||
|
|
||||||
cisticola.transformer.twitter module
|
|
||||||
------------------------------------
|
|
||||||
|
|
||||||
.. automodule:: cisticola.transformer.twitter
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
Module contents
|
|
||||||
---------------
|
|
||||||
|
|
||||||
.. automodule:: cisticola.transformer
|
.. automodule:: cisticola.transformer
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
|
|
||||||
|
Submodules
|
||||||
|
----------
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 4
|
||||||
|
|
||||||
|
cisticola.transformer.base
|
||||||
|
cisticola.transformer.bitchute
|
||||||
|
cisticola.transformer.twitter
|
||||||
|
|||||||
8
docs/source/cisticola.transformer.twitter.rst
Normal file
8
docs/source/cisticola.transformer.twitter.rst
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
cisticola.transformer.twitter module
|
||||||
|
====================================
|
||||||
|
|
||||||
|
.. automodule:: cisticola.transformer.twitter
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
8
docs/source/cisticola.utils.rst
Normal file
8
docs/source/cisticola.utils.rst
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
cisticola.utils module
|
||||||
|
======================
|
||||||
|
|
||||||
|
.. automodule:: cisticola.utils
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
:private-members:
|
||||||
@@ -43,9 +43,18 @@ exclude_patterns = []
|
|||||||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||||
# a list of builtin themes.
|
# a list of builtin themes.
|
||||||
#
|
#
|
||||||
html_theme = 'alabaster'
|
html_theme = 'sphinx_rtd_theme'
|
||||||
|
|
||||||
# Add any paths that contain custom static files (such as style sheets) here,
|
# Add any paths that contain custom static files (such as style sheets) here,
|
||||||
# relative to this directory. They are copied after the builtin static files,
|
# relative to this directory. They are copied after the builtin static files,
|
||||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||||
html_static_path = []
|
html_static_path = []
|
||||||
|
|
||||||
|
# -- Default flags for autodoc------------------------------------------------
|
||||||
|
|
||||||
|
autodoc_default_options = {'exclude-members': '_sa_class_manager'}
|
||||||
|
|
||||||
|
html_favicon = '../images/favicon.ico'
|
||||||
|
html_logo = '../images/cisticola_logo.svg'
|
||||||
|
|
||||||
|
html_theme_options = {'style_nav_header_background': '#000000'}
|
||||||
@@ -2,16 +2,7 @@ Welcome to Cisticola's documentation!
|
|||||||
=====================================
|
=====================================
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 2
|
:maxdepth: 1
|
||||||
:caption: Contents:
|
|
||||||
|
|
||||||
modules
|
quickstart
|
||||||
|
cisticola
|
||||||
|
|
||||||
|
|
||||||
Indices and tables
|
|
||||||
==================
|
|
||||||
|
|
||||||
* :ref:`genindex`
|
|
||||||
* :ref:`modindex`
|
|
||||||
* :ref:`search`
|
|
||||||
@@ -1,7 +0,0 @@
|
|||||||
cisticola
|
|
||||||
=========
|
|
||||||
|
|
||||||
.. toctree::
|
|
||||||
:maxdepth: 4
|
|
||||||
|
|
||||||
cisticola
|
|
||||||
96
docs/source/quickstart.rst
Normal file
96
docs/source/quickstart.rst
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
Quickstart
|
||||||
|
==========
|
||||||
|
|
||||||
|
Installation
|
||||||
|
------------
|
||||||
|
|
||||||
|
The *cisticola* application uses pipenv_ for dependency management. To install the dependencies of *cisticola*, first install pipenv using the following command:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
pip install pipenv
|
||||||
|
|
||||||
|
and then install the dependencies using the following command from the package root directory:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
pipenv install
|
||||||
|
|
||||||
|
To install the necessary dependencies for building the documentation and running unit tests, run the following command from the package root directory:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
pipenv install --dev
|
||||||
|
|
||||||
|
Environment Variables
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
Three of the scrapers in *cisticola* (:py:class:`~cisticola.scraper.gab.GabScraper`, :py:class:`~cisticola.scraper.instagram.InstagramScraper`, and :py:class:`~cisticola.scraper.telegram_telethon.TelegramTelethonScraper`) require platform credentials to work correctly.
|
||||||
|
|
||||||
|
Gab
|
||||||
|
"""
|
||||||
|
|
||||||
|
The Gab credentials can be configured by running the following command from the package root directory:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
pipenv run garc configure
|
||||||
|
|
||||||
|
which will direct you to provide the username and password for your Gab account.
|
||||||
|
|
||||||
|
Instagram
|
||||||
|
"""""""""
|
||||||
|
|
||||||
|
The Instagram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment:
|
||||||
|
|
||||||
|
- ``INSTAGRAM_USERNAME``: username of your Instagram account
|
||||||
|
- ``INSTAGRAM_PASSWORD``: password of your Instagram account
|
||||||
|
|
||||||
|
Telegram Telethon
|
||||||
|
"""""""""""""""""
|
||||||
|
|
||||||
|
The Telegram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment:
|
||||||
|
|
||||||
|
- ``TELEGRAM_API_ID``: API ID number for your Telegram application
|
||||||
|
- ``TELEGRAM_API_HASH``: API hash for your Telegram application
|
||||||
|
- ``TELEGRAM_PHONE``: phone number for the account corresponding to your Telegram application
|
||||||
|
|
||||||
|
If you do not already have a Telegram application, you can create one by following the instructions on `this page`_.
|
||||||
|
|
||||||
|
Documentation
|
||||||
|
-------------
|
||||||
|
|
||||||
|
The *cisticola* application uses Sphinx_ to generate and display its documentation. To build the documentation in the HTML format, run the following command from the ``docs/`` directory:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
pipenv run make html
|
||||||
|
|
||||||
|
For developers, if changes are made to the package structure or additional modules are created, you can update the Sphinx source ``*.rst`` files by running the following command from the ``docs/`` directory:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
pipenv run make apidoc
|
||||||
|
|
||||||
|
Testing
|
||||||
|
-------
|
||||||
|
|
||||||
|
The *cisticola* application uses pytest_ for unit testing. To run the test suite, run the following command from the package root directory:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
pipenv run pytest
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
|
||||||
|
An example of a *cisticola* ingest file ``russian_telegram_ingest.py`` is included in the package root directory, showing how the list of channels to scrape is defined, and how the :py:class:`~cisticola.scraper.base.ScraperController` and :py:class:`~cisticola.transformer.base.Transformer` classes are used. To run the ingest script, run the following command from the package root directory:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
pipenv run python russian_telegram_ingest.py
|
||||||
|
|
||||||
|
.. _pipenv: https://pipenv.pypa.io/en/latest/
|
||||||
|
.. _Sphinx: https://www.sphinx-doc.org/en/master/
|
||||||
|
.. _pytest: https://docs.pytest.org/en/7.1.x/
|
||||||
|
.. _this page: https://core.telegram.org/api/obtaining_api_id
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
[pytest]
|
[pytest]
|
||||||
minversion =
|
minversion =
|
||||||
6.0.2
|
7.0.0
|
||||||
testpaths =
|
testpaths =
|
||||||
tests/
|
tests/
|
||||||
python_files =
|
python_files =
|
||||||
@@ -13,4 +13,5 @@ addopts =
|
|||||||
--self-contained-html
|
--self-contained-html
|
||||||
filterwarnings =
|
filterwarnings =
|
||||||
ignore:the imp module is deprecated:DeprecationWarning
|
ignore:the imp module is deprecated:DeprecationWarning
|
||||||
ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
|
ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
|
||||||
|
ignore:invalid escape sequence:DeprecationWarning
|
||||||
Reference in New Issue
Block a user