added docstrings, improved Sphinx docs

This commit is contained in:
Tristan Lee
2022-03-14 18:04:27 -05:00
parent 6cf3b8842d
commit e4cf9daf73
25 changed files with 700 additions and 311 deletions

View File

@@ -10,7 +10,6 @@ gogettr = "*"
requests = "*" requests = "*"
bs4 = "*" bs4 = "*"
dateparser = "*" dateparser = "*"
sphinx = "*"
boto3 = "*" boto3 = "*"
snscrape = {git = "https://github.com/bellingcat/snscrape.git"} snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
ffmpeg-python = "*" ffmpeg-python = "*"
@@ -24,6 +23,8 @@ pytest = "*"
pytest-cov = "*" pytest-cov = "*"
pytest-html = "*" pytest-html = "*"
pytest-metadata = "*" pytest-metadata = "*"
sphinx = "*"
sphinx_rtd_theme = "*"
[requires] [requires]
python_version = "3.9" python_version = "3.9"

433
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "3d293e1f3802d64ae7a8fbfc4c1d742cc33cd4c520a6263f93e566f89faa7013" "sha256": "495ba305ca55a0ac5754037ba133518b47324965dd3ab0b8db8b69206524d68e"
}, },
"pipfile-spec": 6, "pipfile-spec": 6,
"requires": { "requires": {
@@ -16,13 +16,6 @@
] ]
}, },
"default": { "default": {
"alabaster": {
"hashes": [
"sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359",
"sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02"
],
"version": "==0.7.12"
},
"attrs": { "attrs": {
"hashes": [ "hashes": [
"sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4", "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",
@@ -31,14 +24,6 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==21.4.0" "version": "==21.4.0"
}, },
"babel": {
"hashes": [
"sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9",
"sha256:bc0c176f9f6a994582230df350aa6e05ba2ebe4b3ac317eab29d9be5d2768da0"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.9.1"
},
"beautifulsoup4": { "beautifulsoup4": {
"hashes": [ "hashes": [
"sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf", "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf",
@@ -49,19 +34,19 @@
}, },
"boto3": { "boto3": {
"hashes": [ "hashes": [
"sha256:30394729b38d5ce2f845440428a55161c6d45478044e553a12ca1acf56d7278a", "sha256:8d6f3c548f0ee03d742f404c96515e7579fc6968135aaa50dd855a046698ff79",
"sha256:895489900eb882777124c3b64a13df49785cf77f7bd1504e783464fb3b4c8163" "sha256:d857feb6af9932e1ee3a748060a2cd9fd6043dbbccf66976eda54586597efdc0"
], ],
"index": "pypi", "index": "pypi",
"version": "==1.21.15" "version": "==1.21.18"
}, },
"botocore": { "botocore": {
"hashes": [ "hashes": [
"sha256:405082f92a9e524e1aee96cbc90134668026d7da3c12f86990c91a12620ca28b", "sha256:7ea8ef1ff7c4882ab59b337662f90ddf5ea860e95e7e209dca593a34ea585b1b",
"sha256:fa4816e94e72111a9341204061e760bcbde74ca5d900d3f2206c2c2e8e4b56e4" "sha256:d2da7ccbc5ddd61fe3cd45fcbd3de380d9e3a15bfa8fbfd2d9259a93dcc60c56"
], ],
"markers": "python_version >= '3.6'", "markers": "python_version >= '3.6'",
"version": "==1.24.15" "version": "==1.24.18"
}, },
"bs4": { "bs4": {
"hashes": [ "hashes": [
@@ -101,14 +86,6 @@
"index": "pypi", "index": "pypi",
"version": "==1.1.0" "version": "==1.1.0"
}, },
"docutils": {
"hashes": [
"sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125",
"sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==0.17.1"
},
"ffmpeg-python": { "ffmpeg-python": {
"hashes": [ "hashes": [
"sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127", "sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127",
@@ -216,22 +193,6 @@
"markers": "python_version >= '3'", "markers": "python_version >= '3'",
"version": "==3.3" "version": "==3.3"
}, },
"imagesize": {
"hashes": [
"sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c",
"sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.3.0"
},
"importlib-metadata": {
"hashes": [
"sha256:b36ffa925fe3139b2f6ff11d6925ffd4fa7bc47870165e3ac260ac7b4f91e6ac",
"sha256:d16e8c1deb60de41b8e8ed21c1a7b947b0bc62fab7e1d470bcdf331cea2e6735"
],
"markers": "python_version < '3.10'",
"version": "==4.11.2"
},
"iniconfig": { "iniconfig": {
"hashes": [ "hashes": [
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
@@ -239,14 +200,6 @@
], ],
"version": "==1.1.1" "version": "==1.1.1"
}, },
"jinja2": {
"hashes": [
"sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8",
"sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"
],
"markers": "python_version >= '3.6'",
"version": "==3.0.3"
},
"jmespath": { "jmespath": {
"hashes": [ "hashes": [
"sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9", "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9",
@@ -330,52 +283,6 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==4.8.0" "version": "==4.8.0"
}, },
"markupsafe": {
"hashes": [
"sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3",
"sha256:09c86c9643cceb1d87ca08cdc30160d1b7ab49a8a21564868921959bd16441b8",
"sha256:142119fb14a1ef6d758912b25c4e803c3ff66920635c44078666fe7cc3f8f759",
"sha256:1d1fb9b2eec3c9714dd936860850300b51dbaa37404209c8d4cb66547884b7ed",
"sha256:204730fd5fe2fe3b1e9ccadb2bd18ba8712b111dcabce185af0b3b5285a7c989",
"sha256:24c3be29abb6b34052fd26fc7a8e0a49b1ee9d282e3665e8ad09a0a68faee5b3",
"sha256:290b02bab3c9e216da57c1d11d2ba73a9f73a614bbdcc027d299a60cdfabb11a",
"sha256:3028252424c72b2602a323f70fbf50aa80a5d3aa616ea6add4ba21ae9cc9da4c",
"sha256:30c653fde75a6e5eb814d2a0a89378f83d1d3f502ab710904ee585c38888816c",
"sha256:3cace1837bc84e63b3fd2dfce37f08f8c18aeb81ef5cf6bb9b51f625cb4e6cd8",
"sha256:4056f752015dfa9828dce3140dbadd543b555afb3252507348c493def166d454",
"sha256:454ffc1cbb75227d15667c09f164a0099159da0c1f3d2636aa648f12675491ad",
"sha256:598b65d74615c021423bd45c2bc5e9b59539c875a9bdb7e5f2a6b92dfcfc268d",
"sha256:599941da468f2cf22bf90a84f6e2a65524e87be2fce844f96f2dd9a6c9d1e635",
"sha256:5ddea4c352a488b5e1069069f2f501006b1a4362cb906bee9a193ef1245a7a61",
"sha256:62c0285e91414f5c8f621a17b69fc0088394ccdaa961ef469e833dbff64bd5ea",
"sha256:679cbb78914ab212c49c67ba2c7396dc599a8479de51b9a87b174700abd9ea49",
"sha256:6e104c0c2b4cd765b4e83909cde7ec61a1e313f8a75775897db321450e928cce",
"sha256:736895a020e31b428b3382a7887bfea96102c529530299f426bf2e636aacec9e",
"sha256:75bb36f134883fdbe13d8e63b8675f5f12b80bb6627f7714c7d6c5becf22719f",
"sha256:7d2f5d97fcbd004c03df8d8fe2b973fe2b14e7bfeb2cfa012eaa8759ce9a762f",
"sha256:80beaf63ddfbc64a0452b841d8036ca0611e049650e20afcb882f5d3c266d65f",
"sha256:84ad5e29bf8bab3ad70fd707d3c05524862bddc54dc040982b0dbcff36481de7",
"sha256:8da5924cb1f9064589767b0f3fc39d03e3d0fb5aa29e0cb21d43106519bd624a",
"sha256:961eb86e5be7d0973789f30ebcf6caab60b844203f4396ece27310295a6082c7",
"sha256:96de1932237abe0a13ba68b63e94113678c379dca45afa040a17b6e1ad7ed076",
"sha256:a0a0abef2ca47b33fb615b491ce31b055ef2430de52c5b3fb19a4042dbc5cadb",
"sha256:b2a5a856019d2833c56a3dcac1b80fe795c95f401818ea963594b345929dffa7",
"sha256:b8811d48078d1cf2a6863dafb896e68406c5f513048451cd2ded0473133473c7",
"sha256:c532d5ab79be0199fa2658e24a02fce8542df196e60665dd322409a03db6a52c",
"sha256:d3b64c65328cb4cd252c94f83e66e3d7acf8891e60ebf588d7b493a55a1dbf26",
"sha256:d4e702eea4a2903441f2735799d217f4ac1b55f7d8ad96ab7d4e25417cb0827c",
"sha256:d5653619b3eb5cbd35bfba3c12d575db2a74d15e0e1c08bf1db788069d410ce8",
"sha256:d66624f04de4af8bbf1c7f21cc06649c1c69a7f84109179add573ce35e46d448",
"sha256:e67ec74fada3841b8c5f4c4f197bea916025cb9aa3fe5abf7d52b655d042f956",
"sha256:e6f7f3f41faffaea6596da86ecc2389672fa949bd035251eab26dc6697451d05",
"sha256:f02cf7221d5cd915d7fa58ab64f7ee6dd0f6cddbb48683debf5d04ae9b1c2cc1",
"sha256:f0eddfcabd6936558ec020130f932d479930581171368fd728efcfb6ef0dd357",
"sha256:fabbe18087c3d33c5824cb145ffca52eccd053061df1d79d4b66dafa5ad2a5ea",
"sha256:fc3150f85e2dbcf99e65238c842d1cfe69d3e7649b19864c1cc043213d9cd730"
],
"markers": "python_version >= '3.7'",
"version": "==2.1.0"
},
"numpy": { "numpy": {
"hashes": [ "hashes": [
"sha256:07a8c89a04997625236c5ecb7afe35a02af3896c8aa01890a849913a2309c676", "sha256:07a8c89a04997625236c5ecb7afe35a02af3896c8aa01890a849913a2309c676",
@@ -395,6 +302,7 @@
"sha256:dbc7601a3b7472d559dc7b933b18b4b66f9aa7452c120e87dfb33d02008c8a18", "sha256:dbc7601a3b7472d559dc7b933b18b4b66f9aa7452c120e87dfb33d02008c8a18",
"sha256:e7927a589df200c5e23c57970bafbd0cd322459aa7b1ff73b7c2e84d6e3eae62", "sha256:e7927a589df200c5e23c57970bafbd0cd322459aa7b1ff73b7c2e84d6e3eae62",
"sha256:f8c1f39caad2c896bc0018f699882b345b2a63708008be29b1f355ebf6f933fe", "sha256:f8c1f39caad2c896bc0018f699882b345b2a63708008be29b1f355ebf6f933fe",
"sha256:f950f8845b480cffe522913d35567e29dd381b0dc7e4ce6a4a9f9156417d2430",
"sha256:fade0d4f4d292b6f39951b6836d7a3c7ef5b2347f3c420cd9820a1d90d794802", "sha256:fade0d4f4d292b6f39951b6836d7a3c7ef5b2347f3c420cd9820a1d90d794802",
"sha256:fdf3c08bce27132395d3c3ba1503cac12e17282358cb4bddc25cc46b0aca07aa" "sha256:fdf3c08bce27132395d3c3ba1503cac12e17282358cb4bddc25cc46b0aca07aa"
], ],
@@ -480,14 +388,6 @@
], ],
"version": "==0.4.8" "version": "==0.4.8"
}, },
"pygments": {
"hashes": [
"sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65",
"sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a"
],
"markers": "python_version >= '3.5'",
"version": "==2.11.2"
},
"pyparsing": { "pyparsing": {
"hashes": [ "hashes": [
"sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea", "sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea",
@@ -506,11 +406,11 @@
}, },
"pytest": { "pytest": {
"hashes": [ "hashes": [
"sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db", "sha256:b555252a95bbb2a37a97b5ac2eb050c436f7989993565f5e0c9128fcaacadd0e",
"sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171" "sha256:f1089d218cfcc63a212c42896f1b7fbf096874d045e1988186861a1a87d27b47"
], ],
"markers": "python_version >= '3.6'", "markers": "python_version >= '3.7'",
"version": "==7.0.1" "version": "==7.1.0"
}, },
"python-dateutil": { "python-dateutil": {
"hashes": [ "hashes": [
@@ -628,7 +528,7 @@
"sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
"sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb" "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
], ],
"markers": "python_version >= '3.6' and python_version < '4.0'", "markers": "python_version >= '3.6' and python_version < '4'",
"version": "==4.8" "version": "==4.8"
}, },
"s3transfer": { "s3transfer": {
@@ -647,13 +547,6 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.16.0" "version": "==1.16.0"
}, },
"snowballstemmer": {
"hashes": [
"sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1",
"sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"
],
"version": "==2.2.0"
},
"snscrape": { "snscrape": {
"git": "https://github.com/bellingcat/snscrape.git", "git": "https://github.com/bellingcat/snscrape.git",
"ref": "de4ebed81f3f6a4bb4c65630daab6ec63784959b" "ref": "de4ebed81f3f6a4bb4c65630daab6ec63784959b"
@@ -666,62 +559,6 @@
"markers": "python_version >= '3.6'", "markers": "python_version >= '3.6'",
"version": "==2.3.1" "version": "==2.3.1"
}, },
"sphinx": {
"hashes": [
"sha256:5da895959511473857b6d0200f56865ed62c31e8f82dd338063b84ec022701fe",
"sha256:6caad9786055cb1fa22b4a365c1775816b876f91966481765d7d50e9f0dd35cc"
],
"index": "pypi",
"version": "==4.4.0"
},
"sphinxcontrib-applehelp": {
"hashes": [
"sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a",
"sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.2"
},
"sphinxcontrib-devhelp": {
"hashes": [
"sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e",
"sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.2"
},
"sphinxcontrib-htmlhelp": {
"hashes": [
"sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07",
"sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2"
],
"markers": "python_version >= '3.6'",
"version": "==2.0.0"
},
"sphinxcontrib-jsmath": {
"hashes": [
"sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178",
"sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.1"
},
"sphinxcontrib-qthelp": {
"hashes": [
"sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72",
"sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.3"
},
"sphinxcontrib-serializinghtml": {
"hashes": [
"sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd",
"sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952"
],
"markers": "python_version >= '3.5'",
"version": "==1.1.5"
},
"sqlalchemy": { "sqlalchemy": {
"hashes": [ "hashes": [
"sha256:04164e0063feb7aedd9d073db0fd496edb244be40d46ea1f0d8990815e4b8c34", "sha256:04164e0063feb7aedd9d073db0fd496edb244be40d46ea1f0d8990815e4b8c34",
@@ -800,7 +637,7 @@
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
], ],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.8" "version": "==1.26.8"
}, },
"youtube-dl": { "youtube-dl": {
@@ -810,17 +647,16 @@
], ],
"index": "pypi", "index": "pypi",
"version": "==2021.12.17" "version": "==2021.12.17"
},
"zipp": {
"hashes": [
"sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d",
"sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375"
],
"markers": "python_version >= '3.7'",
"version": "==3.7.0"
} }
}, },
"develop": { "develop": {
"alabaster": {
"hashes": [
"sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359",
"sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02"
],
"version": "==0.7.12"
},
"attrs": { "attrs": {
"hashes": [ "hashes": [
"sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4", "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",
@@ -829,6 +665,29 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==21.4.0" "version": "==21.4.0"
}, },
"babel": {
"hashes": [
"sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9",
"sha256:bc0c176f9f6a994582230df350aa6e05ba2ebe4b3ac317eab29d9be5d2768da0"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.9.1"
},
"certifi": {
"hashes": [
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
],
"version": "==2021.10.8"
},
"charset-normalizer": {
"hashes": [
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
"sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
],
"markers": "python_version >= '3'",
"version": "==2.0.12"
},
"coverage": { "coverage": {
"extras": [ "extras": [
"toml" "toml"
@@ -879,6 +738,38 @@
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==6.3.2" "version": "==6.3.2"
}, },
"docutils": {
"hashes": [
"sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125",
"sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==0.17.1"
},
"idna": {
"hashes": [
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
],
"markers": "python_version >= '3'",
"version": "==3.3"
},
"imagesize": {
"hashes": [
"sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c",
"sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.3.0"
},
"importlib-metadata": {
"hashes": [
"sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6",
"sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539"
],
"markers": "python_version < '3.10'",
"version": "==4.11.3"
},
"iniconfig": { "iniconfig": {
"hashes": [ "hashes": [
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
@@ -886,6 +777,60 @@
], ],
"version": "==1.1.1" "version": "==1.1.1"
}, },
"jinja2": {
"hashes": [
"sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8",
"sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"
],
"markers": "python_version >= '3.6'",
"version": "==3.0.3"
},
"markupsafe": {
"hashes": [
"sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3",
"sha256:09c86c9643cceb1d87ca08cdc30160d1b7ab49a8a21564868921959bd16441b8",
"sha256:142119fb14a1ef6d758912b25c4e803c3ff66920635c44078666fe7cc3f8f759",
"sha256:1d1fb9b2eec3c9714dd936860850300b51dbaa37404209c8d4cb66547884b7ed",
"sha256:204730fd5fe2fe3b1e9ccadb2bd18ba8712b111dcabce185af0b3b5285a7c989",
"sha256:24c3be29abb6b34052fd26fc7a8e0a49b1ee9d282e3665e8ad09a0a68faee5b3",
"sha256:290b02bab3c9e216da57c1d11d2ba73a9f73a614bbdcc027d299a60cdfabb11a",
"sha256:3028252424c72b2602a323f70fbf50aa80a5d3aa616ea6add4ba21ae9cc9da4c",
"sha256:30c653fde75a6e5eb814d2a0a89378f83d1d3f502ab710904ee585c38888816c",
"sha256:3cace1837bc84e63b3fd2dfce37f08f8c18aeb81ef5cf6bb9b51f625cb4e6cd8",
"sha256:4056f752015dfa9828dce3140dbadd543b555afb3252507348c493def166d454",
"sha256:454ffc1cbb75227d15667c09f164a0099159da0c1f3d2636aa648f12675491ad",
"sha256:598b65d74615c021423bd45c2bc5e9b59539c875a9bdb7e5f2a6b92dfcfc268d",
"sha256:599941da468f2cf22bf90a84f6e2a65524e87be2fce844f96f2dd9a6c9d1e635",
"sha256:5ddea4c352a488b5e1069069f2f501006b1a4362cb906bee9a193ef1245a7a61",
"sha256:62c0285e91414f5c8f621a17b69fc0088394ccdaa961ef469e833dbff64bd5ea",
"sha256:679cbb78914ab212c49c67ba2c7396dc599a8479de51b9a87b174700abd9ea49",
"sha256:6e104c0c2b4cd765b4e83909cde7ec61a1e313f8a75775897db321450e928cce",
"sha256:736895a020e31b428b3382a7887bfea96102c529530299f426bf2e636aacec9e",
"sha256:75bb36f134883fdbe13d8e63b8675f5f12b80bb6627f7714c7d6c5becf22719f",
"sha256:7d2f5d97fcbd004c03df8d8fe2b973fe2b14e7bfeb2cfa012eaa8759ce9a762f",
"sha256:80beaf63ddfbc64a0452b841d8036ca0611e049650e20afcb882f5d3c266d65f",
"sha256:84ad5e29bf8bab3ad70fd707d3c05524862bddc54dc040982b0dbcff36481de7",
"sha256:8da5924cb1f9064589767b0f3fc39d03e3d0fb5aa29e0cb21d43106519bd624a",
"sha256:961eb86e5be7d0973789f30ebcf6caab60b844203f4396ece27310295a6082c7",
"sha256:96de1932237abe0a13ba68b63e94113678c379dca45afa040a17b6e1ad7ed076",
"sha256:a0a0abef2ca47b33fb615b491ce31b055ef2430de52c5b3fb19a4042dbc5cadb",
"sha256:b2a5a856019d2833c56a3dcac1b80fe795c95f401818ea963594b345929dffa7",
"sha256:b8811d48078d1cf2a6863dafb896e68406c5f513048451cd2ded0473133473c7",
"sha256:c532d5ab79be0199fa2658e24a02fce8542df196e60665dd322409a03db6a52c",
"sha256:d3b64c65328cb4cd252c94f83e66e3d7acf8891e60ebf588d7b493a55a1dbf26",
"sha256:d4e702eea4a2903441f2735799d217f4ac1b55f7d8ad96ab7d4e25417cb0827c",
"sha256:d5653619b3eb5cbd35bfba3c12d575db2a74d15e0e1c08bf1db788069d410ce8",
"sha256:d66624f04de4af8bbf1c7f21cc06649c1c69a7f84109179add573ce35e46d448",
"sha256:e67ec74fada3841b8c5f4c4f197bea916025cb9aa3fe5abf7d52b655d042f956",
"sha256:e6f7f3f41faffaea6596da86ecc2389672fa949bd035251eab26dc6697451d05",
"sha256:f02cf7221d5cd915d7fa58ab64f7ee6dd0f6cddbb48683debf5d04ae9b1c2cc1",
"sha256:f0eddfcabd6936558ec020130f932d479930581171368fd728efcfb6ef0dd357",
"sha256:fabbe18087c3d33c5824cb145ffca52eccd053061df1d79d4b66dafa5ad2a5ea",
"sha256:fc3150f85e2dbcf99e65238c842d1cfe69d3e7649b19864c1cc043213d9cd730"
],
"markers": "python_version >= '3.7'",
"version": "==2.1.0"
},
"packaging": { "packaging": {
"hashes": [ "hashes": [
"sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb",
@@ -910,6 +855,14 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==1.11.0" "version": "==1.11.0"
}, },
"pygments": {
"hashes": [
"sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65",
"sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a"
],
"markers": "python_version >= '3.5'",
"version": "==2.11.2"
},
"pyparsing": { "pyparsing": {
"hashes": [ "hashes": [
"sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea", "sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea",
@@ -920,11 +873,11 @@
}, },
"pytest": { "pytest": {
"hashes": [ "hashes": [
"sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db", "sha256:b555252a95bbb2a37a97b5ac2eb050c436f7989993565f5e0c9128fcaacadd0e",
"sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171" "sha256:f1089d218cfcc63a212c42896f1b7fbf096874d045e1988186861a1a87d27b47"
], ],
"markers": "python_version >= '3.6'", "markers": "python_version >= '3.7'",
"version": "==7.0.1" "version": "==7.1.0"
}, },
"pytest-cov": { "pytest-cov": {
"hashes": [ "hashes": [
@@ -950,6 +903,92 @@
"index": "pypi", "index": "pypi",
"version": "==1.11.0" "version": "==1.11.0"
}, },
"pytz": {
"hashes": [
"sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c",
"sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"
],
"version": "==2021.3"
},
"requests": {
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
],
"index": "pypi",
"version": "==2.27.1"
},
"snowballstemmer": {
"hashes": [
"sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1",
"sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"
],
"version": "==2.2.0"
},
"sphinx": {
"hashes": [
"sha256:5da895959511473857b6d0200f56865ed62c31e8f82dd338063b84ec022701fe",
"sha256:6caad9786055cb1fa22b4a365c1775816b876f91966481765d7d50e9f0dd35cc"
],
"index": "pypi",
"version": "==4.4.0"
},
"sphinx-rtd-theme": {
"hashes": [
"sha256:4d35a56f4508cfee4c4fb604373ede6feae2a306731d533f409ef5c3496fdbd8",
"sha256:eec6d497e4c2195fa0e8b2016b337532b8a699a68bcb22a512870e16925c6a5c"
],
"index": "pypi",
"version": "==1.0.0"
},
"sphinxcontrib-applehelp": {
"hashes": [
"sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a",
"sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.2"
},
"sphinxcontrib-devhelp": {
"hashes": [
"sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e",
"sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.2"
},
"sphinxcontrib-htmlhelp": {
"hashes": [
"sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07",
"sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2"
],
"markers": "python_version >= '3.6'",
"version": "==2.0.0"
},
"sphinxcontrib-jsmath": {
"hashes": [
"sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178",
"sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.1"
},
"sphinxcontrib-qthelp": {
"hashes": [
"sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72",
"sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.3"
},
"sphinxcontrib-serializinghtml": {
"hashes": [
"sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd",
"sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952"
],
"markers": "python_version >= '3.5'",
"version": "==1.1.5"
},
"tomli": { "tomli": {
"hashes": [ "hashes": [
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
@@ -957,6 +996,22 @@
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==2.0.1" "version": "==2.0.1"
},
"urllib3": {
"hashes": [
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.8"
},
"zipp": {
"hashes": [
"sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d",
"sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375"
],
"markers": "python_version >= '3.7'",
"version": "==3.7.0"
} }
} }
} }

View File

@@ -5,21 +5,118 @@ from datetime import datetime
from sqlalchemy.orm import registry from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey
mapper_registry = registry()
@dataclass @dataclass
class ScraperResult: class ScraperResult:
"""A minimally processed result from a scraper""" """A minimally processed result from a scraper
"""
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
scraper: str scraper: str
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
platform: str platform: str
channel: int #TODO there is probably a way of making this a Channel object foreign key
#TODO there is probably a way of making this a Channel object foreign key
#: User-specified integer that uniquely identifies a channel, e.g. ``15``.
channel: int
#: String that uniquely identifies the scraped post on the given platform, e.g. ``"1503397267675533313"``
platform_id: str platform_id: str
#: Datetime (relative to UTC) that the scraped post was created at.
date: datetime date: datetime
#: JSON dump of dict that contains all data scraped for the post.
raw_data: str raw_data: str
#: Datetime (relative to UTC) that the scraped post was archived at.
date_archived: datetime date_archived: datetime
#: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files.
archived_urls: dict archived_urls: dict
@dataclass
class Channel:
"""Information about a specific channel to be scraped.
"""
#: User-specified integer that uniquely identifies a channel, e.g. ``15``.
id: int
#: Name of channel (different from username because it can be non-unique and contain emojis), e.g. ``"T🕊Редакция Президент Гордон🕊"``.
name: str
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
platform_id: str
#: User-specified category for the channel, e.g. ``"qanon-adjacent"``.
category: str
#: Number of followers the channel has on the given platform, e.g. ``1465``.
followers: int
#: Name of platform the given channel is on, e.g. ``"Telegram"``.
platform: str
#: URL for the given channel on the platform, e.g. ``"https://t.me/prezidentgordonteam"``
url: str
#: Screen name/username of channel.
screenname: str
#: 2 digit country code for the country of origin for the channel, e.g. ``"RU"``.
country: str
#: Name of influencer, if channel belongs to an influencer that operates on multiple platforms.
influencer: str
#: Whether or not the channel is publicly-accessible.
public: bool
#: Whether or not the channel is a chat (i.e. allows users who are not the channel creator to post/message)
chat: bool
#: Any other additional notes about the channel.
notes: str
@dataclass
class TransformedResult:
"""An object with fields for columns in the analysis table"""
#: ID number of the scraped post in the ``raw_data`` table
raw_id: int
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
scraper: str
#: String specifying name and version of transformer used to transform result, e.g. ``"TwitterTransformer 0.0.1"``.
transformer: str
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
platform: str
#: User-specified integer that uniquely identifies a channel, e.g. ``15``.
channel: str
#: Datetime (relative to UTC) that the scraped post was created at.
date: datetime
#: Datetime (relative to UTC) that the scraped post was archived at.
date_archived: datetime
#: URL of the original post
url: str
#: Text of the original post
content: str
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
author_id: str
#: Username of author who made post.
author_username: str
mapper_registry = registry()
raw_data_table = Table('raw_data', mapper_registry.metadata, raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('id', Integer, primary_key=True, Column('id', Integer, primary_key=True,
@@ -35,40 +132,6 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
mapper_registry.map_imperatively(ScraperResult, raw_data_table) mapper_registry.map_imperatively(ScraperResult, raw_data_table)
@dataclass
class Channel:
id: int
name: str
platform_id: str
category: str
followers: int
platform: str
url: str
screenname: str
country: str
influencer: str
public: bool
chat: bool
notes: str
@dataclass
class TransformedResult:
"""An object with fields for columns in the analysis table"""
raw_id: int
scraper: str
transformer: str
platform: str
channel: str
date: datetime
date_archived: datetime
url: str
content: str
author_id: str
author_username: str
analysis_table = Table('analysis', mapper_registry.metadata, analysis_table = Table('analysis', mapper_registry.metadata,
Column('id', Integer, primary_key=True, Column('id', Integer, primary_key=True,
autoincrement=True), autoincrement=True),

View File

@@ -13,29 +13,73 @@ from cisticola.base import Channel, ScraperResult, mapper_registry
from cisticola.scraper import make_request from cisticola.scraper import make_request
class Scraper: class Scraper:
"""Base class for defining platform-specific scrapers for scraping all posts
from a given channel on that specific platform.
"""
__version__ = "Scraper 0.0.0" __version__ = "Scraper 0.0.0"
def __init__(self): def __init__(self):
self.s3_client = boto3.client('s3',
region_name=os.environ['DO_SPACES_REGION'],
endpoint_url='https://{}.digitaloceanspaces.com'.format(
os.environ['DO_SPACES_REGION']),
aws_access_key_id=os.environ['DO_SPACES_KEY'],
aws_secret_access_key=os.environ['DO_SPACES_SECRET'])
# Initialize client to transfer files to the storage archive
self.s3_client = boto3.client(
service_name='s3',
region_name=os.environ['DO_SPACES_REGION'],
endpoint_url=f'https://{os.environ["DO_SPACES_REGION"]}.digitaloceanspaces.com',
aws_access_key_id=os.environ['DO_SPACES_KEY'],
aws_secret_access_key=os.environ['DO_SPACES_SECRET'])
# Define request headers (necessary to bypass scraping protection
# for several platform scrapers)
self.headers = { self.headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'} 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}
pass
def __str__(self): def __str__(self):
return self.__version__ return self.__version__
def url_to_key(self, url: str, content_type: str) -> str: def url_to_key(self, url: str, content_type: str) -> str:
"""Generate a unique identifier for media from a specified post.
Parameters
----------
url: str
URL of original post.
e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
content_type: str
Content-Type of media.
e.g. ``"image/jpeg"``
Returns
-------
key: str
Unique identifier for the media file from a specified post based on
the original post URL and the media's Content-Type.
"""
key = urlparse(url).path.split('/')[-1] key = urlparse(url).path.split('/')[-1]
return key return key
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
"""Download media file from a specified post URL.
Parameters
----------
url: str
URL of original post.
e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
key: str or None
Pre-defined unique identifier for the media file.
Returns
-------
blob: bytes
Raw bytes of the downloaded media file.
content_type: str
Content-Type of media.
e.g. ``"image/jpeg"``.
key: str
Unique identifier for the media file.
"""
r = make_request(url, headers = self.headers) r = make_request(url, headers = self.headers)
@@ -48,6 +92,27 @@ class Scraper:
return blob, content_type, key return blob, content_type, key
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
"""Download media file from a specified post URL, where the media file
is formatted as an m3u8 playlist, which is then decoded to an mp4 file.
Parameters
----------
url: str
URL of original post.
e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
key: str or None
Pre-defined unique identifier for the media file.
Returns
-------
blob: bytes
Raw bytes of the downloaded media file.
content_type: str
Content-Type of media.
e.g. ``"video/mp4"``.
key: str
Unique identifier for the media file.
"""
content_type = 'video/mp4' content_type = 'video/mp4'
ext = '.' + content_type.split('/')[-1] ext = '.' + content_type.split('/')[-1]
@@ -70,6 +135,23 @@ class Scraper:
return blob, content_type, key return blob, content_type, key
def archive_blob(self, blob: bytes, content_type: str, key: str) -> str: def archive_blob(self, blob: bytes, content_type: str, key: str) -> str:
"""Upload raw bytes of a media file to the storage archive.
Parameters
----------
blob: bytes
Raw bytes of the media file to be archived.
content_type: str
Content-Type of media.
e.g. ``"video/mp4"``.
key: str
Unique identifier for the media file.
Returns
-------
archived_url: str
URL specifying the file on the storage archive.
"""
filename = self.__version__.replace(' ', '_') + '/' + key filename = self.__version__.replace(' ', '_') + '/' + key
@@ -81,9 +163,37 @@ class Scraper:
return archived_url return archived_url
def can_handle(self, channel: Channel) -> bool: def can_handle(self, channel: Channel) -> bool:
"""Whether or not the scraper can scrape the specified channel.
Parameters
----------
channel: Channel
Channel to be scraped.
Returns
-------
bool
``True`` if the scraper is capable of scraping ``channel``,
``False`` if not.
"""
raise NotImplementedError raise NotImplementedError
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
"""Scrape all posts from the specified Channel.
Parameters
----------
channel: Channel
Channel to be scraped.
since: ScraperResult or None
Most recently scraped ScraperResult from a previous scrape, or
``None`` if scraper has not run before.
archive_media: bool
If ``True``, any media files (images, video, etc.) from posts are archived.
If ``False``, media files are not archived.
"""
raise NotImplementedError raise NotImplementedError
@@ -97,13 +207,28 @@ class ScraperController:
self.mapper_registry = None self.mapper_registry = None
def register_scraper(self, scraper: Scraper): def register_scraper(self, scraper: Scraper):
"""Register a single Scraper instance to the controller.
"""
self.scrapers.append(scraper) self.scrapers.append(scraper)
def register_scrapers(self, scraper: List[Scraper]): def register_scrapers(self, scraper: List[Scraper]):
"""Register a list of Scraper instances to the controller.
"""
self.scrapers.extend(scraper) self.scrapers.extend(scraper)
@logger.catch @logger.catch
def scrape_channels(self, channels: List[Channel], archive_media: bool = True): def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
"""Scrape all posts for all specified channels.
Parameters
----------
channels: list<Channel>
List of Channel instances to be scraped.
archive_media: bool
If ``True``, any media files (images, video, etc.) from posts are archived.
If ``False``, media files are not archived.
"""
if self.session is None: if self.session is None:
logger.error("No DB session") logger.error("No DB session")
return return
@@ -143,15 +268,11 @@ class ScraperController:
logger.warning(f"No handler found for Channel {channel}") logger.warning(f"No handler found for Channel {channel}")
def connect_to_db(self, engine): def connect_to_db(self, engine):
"""Connect the specified SQLAlchemy engine to the controller.
"""
# create tables # create tables
mapper_registry.metadata.create_all(bind=engine) mapper_registry.metadata.create_all(bind=engine)
self.session = sessionmaker() self.session = sessionmaker()
self.session.configure(bind=engine) self.session.configure(bind=engine)
class ETLController:
"""This class will transform the raw_data tables into a format more conducive to analysis."""
def __init__(self):
pass

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 7.0 KiB

BIN
docs/images/favicon.ico Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 614 B

View File

@@ -0,0 +1,8 @@
cisticola.base module
=====================
.. automodule:: cisticola.base
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -1,6 +1,12 @@
cisticola package cisticola package
================= =================
.. automodule:: cisticola
:members:
:undoc-members:
:show-inheritance:
:private-members:
Subpackages Subpackages
----------- -----------
@@ -13,18 +19,7 @@ Subpackages
Submodules Submodules
---------- ----------
cisticola.base module .. toctree::
--------------------- :maxdepth: 4
.. automodule:: cisticola.base cisticola.base
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: cisticola
:members:
:undoc-members:
:show-inheritance:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.base module
=============================
.. automodule:: cisticola.scraper.base
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.bitchute module
=================================
.. automodule:: cisticola.scraper.bitchute
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.gab module
============================
.. automodule:: cisticola.scraper.gab
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.gettr module
==============================
.. automodule:: cisticola.scraper.gettr
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.odysee module
===============================
.. automodule:: cisticola.scraper.odysee
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -1,37 +1,25 @@
cisticola.scraper package cisticola.scraper package
========================= =========================
Submodules
----------
cisticola.scraper.bitchute module
---------------------------------
.. automodule:: cisticola.scraper.bitchute
:members:
:undoc-members:
:show-inheritance:
cisticola.scraper.gettr module
------------------------------
.. automodule:: cisticola.scraper.gettr
:members:
:undoc-members:
:show-inheritance:
cisticola.scraper.twitter module
--------------------------------
.. automodule:: cisticola.scraper.twitter
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: cisticola.scraper .. automodule:: cisticola.scraper
:members: :members:
:undoc-members: :undoc-members:
:show-inheritance: :show-inheritance:
:private-members:
Submodules
----------
.. toctree::
:maxdepth: 4
cisticola.scraper.base
cisticola.scraper.bitchute
cisticola.scraper.gab
cisticola.scraper.gettr
cisticola.scraper.odysee
cisticola.scraper.rumble
cisticola.scraper.telegram_snscrape
cisticola.scraper.telegram_telethon
cisticola.scraper.twitter
cisticola.scraper.utils

View File

@@ -0,0 +1,8 @@
cisticola.scraper.rumble module
===============================
.. automodule:: cisticola.scraper.rumble
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.telegram\_snscrape module
===========================================
.. automodule:: cisticola.scraper.telegram_snscrape
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.telegram\_telethon module
===========================================
.. automodule:: cisticola.scraper.telegram_telethon
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.twitter module
================================
.. automodule:: cisticola.scraper.twitter
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.utils module
==============================
.. automodule:: cisticola.scraper.utils
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.transformer.base module
=================================
.. automodule:: cisticola.transformer.base
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -1,21 +1,17 @@
cisticola.transformer package cisticola.transformer package
============================= =============================
Submodules
----------
cisticola.transformer.twitter module
------------------------------------
.. automodule:: cisticola.transformer.twitter
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: cisticola.transformer .. automodule:: cisticola.transformer
:members: :members:
:undoc-members: :undoc-members:
:show-inheritance: :show-inheritance:
:private-members:
Submodules
----------
.. toctree::
:maxdepth: 4
cisticola.transformer.base
cisticola.transformer.twitter

View File

@@ -0,0 +1,8 @@
cisticola.transformer.twitter module
====================================
.. automodule:: cisticola.transformer.twitter
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -43,9 +43,18 @@ exclude_patterns = []
# The theme to use for HTML and HTML Help pages. See the documentation for # The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes. # a list of builtin themes.
# #
html_theme = 'alabaster' html_theme = 'sphinx_rtd_theme'
# Add any paths that contain custom static files (such as style sheets) here, # Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files, # relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css". # so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = [] html_static_path = []
# -- Default flags for autodoc -----------------------------------------------
autodoc_default_options = {'exclude-members': '_sa_class_manager'}
html_favicon = '../images/favicon.ico'
html_logo = '../images/cisticola_logo.svg'
html_theme_options = {'style_nav_header_background': '#000000'}

View File

@@ -5,7 +5,7 @@ Welcome to Cisticola's documentation!
:maxdepth: 2 :maxdepth: 2
:caption: Contents: :caption: Contents:
modules cisticola

View File

@@ -1,7 +0,0 @@
cisticola
=========
.. toctree::
:maxdepth: 4
cisticola