Merge branch 'main' into channel-db

This commit is contained in:
Logan Williams
2022-03-22 11:49:07 +01:00
committed by GitHub
41 changed files with 970 additions and 313 deletions

View File

@@ -10,7 +10,6 @@ gogettr = "*"
requests = "*" requests = "*"
bs4 = "*" bs4 = "*"
dateparser = "*" dateparser = "*"
sphinx = "*"
boto3 = "*" boto3 = "*"
snscrape = {git = "https://github.com/bellingcat/snscrape.git"} snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
ffmpeg-python = "*" ffmpeg-python = "*"
@@ -29,6 +28,8 @@ pytest-cov = "*"
pytest-html = "*" pytest-html = "*"
pytest-metadata = "*" pytest-metadata = "*"
black = "*" black = "*"
sphinx = "*"
sphinx_rtd_theme = "*"
[requires] [requires]
python_version = "3.9" python_version = "3.9"

418
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "d465f2d09a728ee76cb0af521890ecc1e1bce672acbd1caf2e4d01b6567480d5" "sha256": "e3b96b0ac8c80d4817f9adac4ab171bf4b7e07e80927c7b152a24e8bbdbf7faa"
}, },
"pipfile-spec": 6, "pipfile-spec": 6,
"requires": { "requires": {
@@ -16,13 +16,6 @@
] ]
}, },
"default": { "default": {
"alabaster": {
"hashes": [
"sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359",
"sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02"
],
"version": "==0.7.12"
},
"attrs": { "attrs": {
"hashes": [ "hashes": [
"sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4", "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",
@@ -31,14 +24,6 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==21.4.0" "version": "==21.4.0"
}, },
"babel": {
"hashes": [
"sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9",
"sha256:bc0c176f9f6a994582230df350aa6e05ba2ebe4b3ac317eab29d9be5d2768da0"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.9.1"
},
"beautifulsoup4": { "beautifulsoup4": {
"hashes": [ "hashes": [
"sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf", "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf",
@@ -49,19 +34,19 @@
}, },
"boto3": { "boto3": {
"hashes": [ "hashes": [
"sha256:8d6f3c548f0ee03d742f404c96515e7579fc6968135aaa50dd855a046698ff79", "sha256:76d5b90400c54b25278150768e946edf166acce2c1597c0ecfbebb1dbe9acf2c",
"sha256:d857feb6af9932e1ee3a748060a2cd9fd6043dbbccf66976eda54586597efdc0" "sha256:7bb2e6506a6ad44d111dd20a5d510374b6958fe989b4ef887109c79d812f926f"
], ],
"index": "pypi", "index": "pypi",
"version": "==1.21.18" "version": "==1.21.19"
}, },
"botocore": { "botocore": {
"hashes": [ "hashes": [
"sha256:7ea8ef1ff7c4882ab59b337662f90ddf5ea860e95e7e209dca593a34ea585b1b", "sha256:5ed2be0e413961134f4c17eab16396d41a5b4b73a637588260c04d20806d52ea",
"sha256:d2da7ccbc5ddd61fe3cd45fcbd3de380d9e3a15bfa8fbfd2d9259a93dcc60c56" "sha256:d0d77bce152ca51f3c2cd0f9bf05cb3b623e719406ad58b4c20444e237fe82eb"
], ],
"markers": "python_version >= '3.6'", "markers": "python_version >= '3.6'",
"version": "==1.24.18" "version": "==1.24.19"
}, },
"brotli": { "brotli": {
"hashes": [ "hashes": [
@@ -169,14 +154,6 @@
"index": "pypi", "index": "pypi",
"version": "==1.1.0" "version": "==1.1.0"
}, },
"docutils": {
"hashes": [
"sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125",
"sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==0.17.1"
},
"ffmpeg-python": { "ffmpeg-python": {
"hashes": [ "hashes": [
"sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127", "sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127",
@@ -284,22 +261,6 @@
"markers": "python_version >= '3'", "markers": "python_version >= '3'",
"version": "==3.3" "version": "==3.3"
}, },
"imagesize": {
"hashes": [
"sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c",
"sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.3.0"
},
"importlib-metadata": {
"hashes": [
"sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6",
"sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539"
],
"markers": "python_version < '3.10'",
"version": "==4.11.3"
},
"iniconfig": { "iniconfig": {
"hashes": [ "hashes": [
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
@@ -314,14 +275,6 @@
"index": "pypi", "index": "pypi",
"version": "==4.8.4" "version": "==4.8.4"
}, },
"jinja2": {
"hashes": [
"sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8",
"sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"
],
"markers": "python_version >= '3.6'",
"version": "==3.0.3"
},
"jmespath": { "jmespath": {
"hashes": [ "hashes": [
"sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9", "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9",
@@ -405,52 +358,6 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==4.8.0" "version": "==4.8.0"
}, },
"markupsafe": {
"hashes": [
"sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3",
"sha256:09c86c9643cceb1d87ca08cdc30160d1b7ab49a8a21564868921959bd16441b8",
"sha256:142119fb14a1ef6d758912b25c4e803c3ff66920635c44078666fe7cc3f8f759",
"sha256:1d1fb9b2eec3c9714dd936860850300b51dbaa37404209c8d4cb66547884b7ed",
"sha256:204730fd5fe2fe3b1e9ccadb2bd18ba8712b111dcabce185af0b3b5285a7c989",
"sha256:24c3be29abb6b34052fd26fc7a8e0a49b1ee9d282e3665e8ad09a0a68faee5b3",
"sha256:290b02bab3c9e216da57c1d11d2ba73a9f73a614bbdcc027d299a60cdfabb11a",
"sha256:3028252424c72b2602a323f70fbf50aa80a5d3aa616ea6add4ba21ae9cc9da4c",
"sha256:30c653fde75a6e5eb814d2a0a89378f83d1d3f502ab710904ee585c38888816c",
"sha256:3cace1837bc84e63b3fd2dfce37f08f8c18aeb81ef5cf6bb9b51f625cb4e6cd8",
"sha256:4056f752015dfa9828dce3140dbadd543b555afb3252507348c493def166d454",
"sha256:454ffc1cbb75227d15667c09f164a0099159da0c1f3d2636aa648f12675491ad",
"sha256:598b65d74615c021423bd45c2bc5e9b59539c875a9bdb7e5f2a6b92dfcfc268d",
"sha256:599941da468f2cf22bf90a84f6e2a65524e87be2fce844f96f2dd9a6c9d1e635",
"sha256:5ddea4c352a488b5e1069069f2f501006b1a4362cb906bee9a193ef1245a7a61",
"sha256:62c0285e91414f5c8f621a17b69fc0088394ccdaa961ef469e833dbff64bd5ea",
"sha256:679cbb78914ab212c49c67ba2c7396dc599a8479de51b9a87b174700abd9ea49",
"sha256:6e104c0c2b4cd765b4e83909cde7ec61a1e313f8a75775897db321450e928cce",
"sha256:736895a020e31b428b3382a7887bfea96102c529530299f426bf2e636aacec9e",
"sha256:75bb36f134883fdbe13d8e63b8675f5f12b80bb6627f7714c7d6c5becf22719f",
"sha256:7d2f5d97fcbd004c03df8d8fe2b973fe2b14e7bfeb2cfa012eaa8759ce9a762f",
"sha256:80beaf63ddfbc64a0452b841d8036ca0611e049650e20afcb882f5d3c266d65f",
"sha256:84ad5e29bf8bab3ad70fd707d3c05524862bddc54dc040982b0dbcff36481de7",
"sha256:8da5924cb1f9064589767b0f3fc39d03e3d0fb5aa29e0cb21d43106519bd624a",
"sha256:961eb86e5be7d0973789f30ebcf6caab60b844203f4396ece27310295a6082c7",
"sha256:96de1932237abe0a13ba68b63e94113678c379dca45afa040a17b6e1ad7ed076",
"sha256:a0a0abef2ca47b33fb615b491ce31b055ef2430de52c5b3fb19a4042dbc5cadb",
"sha256:b2a5a856019d2833c56a3dcac1b80fe795c95f401818ea963594b345929dffa7",
"sha256:b8811d48078d1cf2a6863dafb896e68406c5f513048451cd2ded0473133473c7",
"sha256:c532d5ab79be0199fa2658e24a02fce8542df196e60665dd322409a03db6a52c",
"sha256:d3b64c65328cb4cd252c94f83e66e3d7acf8891e60ebf588d7b493a55a1dbf26",
"sha256:d4e702eea4a2903441f2735799d217f4ac1b55f7d8ad96ab7d4e25417cb0827c",
"sha256:d5653619b3eb5cbd35bfba3c12d575db2a74d15e0e1c08bf1db788069d410ce8",
"sha256:d66624f04de4af8bbf1c7f21cc06649c1c69a7f84109179add573ce35e46d448",
"sha256:e67ec74fada3841b8c5f4c4f197bea916025cb9aa3fe5abf7d52b655d042f956",
"sha256:e6f7f3f41faffaea6596da86ecc2389672fa949bd035251eab26dc6697451d05",
"sha256:f02cf7221d5cd915d7fa58ab64f7ee6dd0f6cddbb48683debf5d04ae9b1c2cc1",
"sha256:f0eddfcabd6936558ec020130f932d479930581171368fd728efcfb6ef0dd357",
"sha256:fabbe18087c3d33c5824cb145ffca52eccd053061df1d79d4b66dafa5ad2a5ea",
"sha256:fc3150f85e2dbcf99e65238c842d1cfe69d3e7649b19864c1cc043213d9cd730"
],
"markers": "python_version >= '3.7'",
"version": "==2.1.0"
},
"mutagen": { "mutagen": {
"hashes": [ "hashes": [
"sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1", "sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1",
@@ -642,14 +549,6 @@
"git": "https://github.com/smarnach/pyexiftool.git", "git": "https://github.com/smarnach/pyexiftool.git",
"ref": "3db3764895e687d75b42d3ae4e554ca8664a7f6f" "ref": "3db3764895e687d75b42d3ae4e554ca8664a7f6f"
}, },
"pygments": {
"hashes": [
"sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65",
"sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a"
],
"markers": "python_version >= '3.5'",
"version": "==2.11.2"
},
"pyparsing": { "pyparsing": {
"hashes": [ "hashes": [
"sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea", "sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea",
@@ -786,6 +685,9 @@
"version": "==2022.3.2" "version": "==2022.3.2"
}, },
"requests": { "requests": {
"extras": [
"socks"
],
"hashes": [ "hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
@@ -817,13 +719,6 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.16.0" "version": "==1.16.0"
}, },
"snowballstemmer": {
"hashes": [
"sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1",
"sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"
],
"version": "==2.2.0"
},
"snscrape": { "snscrape": {
"git": "https://github.com/bellingcat/snscrape.git", "git": "https://github.com/bellingcat/snscrape.git",
"ref": "de4ebed81f3f6a4bb4c65630daab6ec63784959b" "ref": "de4ebed81f3f6a4bb4c65630daab6ec63784959b"
@@ -836,62 +731,6 @@
"markers": "python_version >= '3.6'", "markers": "python_version >= '3.6'",
"version": "==2.3.1" "version": "==2.3.1"
}, },
"sphinx": {
"hashes": [
"sha256:5da895959511473857b6d0200f56865ed62c31e8f82dd338063b84ec022701fe",
"sha256:6caad9786055cb1fa22b4a365c1775816b876f91966481765d7d50e9f0dd35cc"
],
"index": "pypi",
"version": "==4.4.0"
},
"sphinxcontrib-applehelp": {
"hashes": [
"sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a",
"sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.2"
},
"sphinxcontrib-devhelp": {
"hashes": [
"sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e",
"sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.2"
},
"sphinxcontrib-htmlhelp": {
"hashes": [
"sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07",
"sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2"
],
"markers": "python_version >= '3.6'",
"version": "==2.0.0"
},
"sphinxcontrib-jsmath": {
"hashes": [
"sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178",
"sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.1"
},
"sphinxcontrib-qthelp": {
"hashes": [
"sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72",
"sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.3"
},
"sphinxcontrib-serializinghtml": {
"hashes": [
"sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd",
"sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952"
],
"markers": "python_version >= '3.5'",
"version": "==1.1.5"
},
"sqlalchemy": { "sqlalchemy": {
"hashes": [ "hashes": [
"sha256:04164e0063feb7aedd9d073db0fd496edb244be40d46ea1f0d8990815e4b8c34", "sha256:04164e0063feb7aedd9d073db0fd496edb244be40d46ea1f0d8990815e4b8c34",
@@ -1034,17 +873,16 @@
], ],
"index": "pypi", "index": "pypi",
"version": "==2022.3.8.2" "version": "==2022.3.8.2"
},
"zipp": {
"hashes": [
"sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d",
"sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375"
],
"markers": "python_version >= '3.7'",
"version": "==3.7.0"
} }
}, },
"develop": { "develop": {
"alabaster": {
"hashes": [
"sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359",
"sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02"
],
"version": "==0.7.12"
},
"attrs": { "attrs": {
"hashes": [ "hashes": [
"sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4", "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",
@@ -1053,6 +891,29 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==21.4.0" "version": "==21.4.0"
}, },
"babel": {
"hashes": [
"sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9",
"sha256:bc0c176f9f6a994582230df350aa6e05ba2ebe4b3ac317eab29d9be5d2768da0"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.9.1"
},
"certifi": {
"hashes": [
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
],
"version": "==2021.10.8"
},
"charset-normalizer": {
"hashes": [
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
"sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
],
"markers": "python_version >= '3'",
"version": "==2.0.12"
},
"coverage": { "coverage": {
"extras": [ "extras": [
"toml" "toml"
@@ -1103,6 +964,38 @@
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==6.3.2" "version": "==6.3.2"
}, },
"docutils": {
"hashes": [
"sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125",
"sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==0.17.1"
},
"idna": {
"hashes": [
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
],
"markers": "python_version >= '3'",
"version": "==3.3"
},
"imagesize": {
"hashes": [
"sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c",
"sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.3.0"
},
"importlib-metadata": {
"hashes": [
"sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6",
"sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539"
],
"markers": "python_version < '3.10'",
"version": "==4.11.3"
},
"iniconfig": { "iniconfig": {
"hashes": [ "hashes": [
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
@@ -1110,6 +1003,60 @@
], ],
"version": "==1.1.1" "version": "==1.1.1"
}, },
"jinja2": {
"hashes": [
"sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8",
"sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"
],
"markers": "python_version >= '3.6'",
"version": "==3.0.3"
},
"markupsafe": {
"hashes": [
"sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003",
"sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88",
"sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5",
"sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7",
"sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a",
"sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603",
"sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1",
"sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135",
"sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247",
"sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6",
"sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601",
"sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77",
"sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02",
"sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e",
"sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63",
"sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f",
"sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980",
"sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b",
"sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812",
"sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff",
"sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96",
"sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1",
"sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925",
"sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a",
"sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6",
"sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e",
"sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f",
"sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4",
"sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f",
"sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3",
"sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c",
"sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a",
"sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417",
"sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a",
"sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a",
"sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37",
"sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452",
"sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933",
"sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a",
"sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"
],
"markers": "python_version >= '3.7'",
"version": "==2.1.1"
},
"packaging": { "packaging": {
"hashes": [ "hashes": [
"sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb",
@@ -1134,6 +1081,14 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==1.11.0" "version": "==1.11.0"
}, },
"pygments": {
"hashes": [
"sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65",
"sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a"
],
"markers": "python_version >= '3.5'",
"version": "==2.11.2"
},
"pyparsing": { "pyparsing": {
"hashes": [ "hashes": [
"sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea", "sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea",
@@ -1174,6 +1129,95 @@
"index": "pypi", "index": "pypi",
"version": "==1.11.0" "version": "==1.11.0"
}, },
"pytz": {
"hashes": [
"sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c",
"sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"
],
"version": "==2021.3"
},
"requests": {
"extras": [
"socks"
],
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
],
"index": "pypi",
"version": "==2.27.1"
},
"snowballstemmer": {
"hashes": [
"sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1",
"sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"
],
"version": "==2.2.0"
},
"sphinx": {
"hashes": [
"sha256:5da895959511473857b6d0200f56865ed62c31e8f82dd338063b84ec022701fe",
"sha256:6caad9786055cb1fa22b4a365c1775816b876f91966481765d7d50e9f0dd35cc"
],
"index": "pypi",
"version": "==4.4.0"
},
"sphinx-rtd-theme": {
"hashes": [
"sha256:4d35a56f4508cfee4c4fb604373ede6feae2a306731d533f409ef5c3496fdbd8",
"sha256:eec6d497e4c2195fa0e8b2016b337532b8a699a68bcb22a512870e16925c6a5c"
],
"index": "pypi",
"version": "==1.0.0"
},
"sphinxcontrib-applehelp": {
"hashes": [
"sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a",
"sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.2"
},
"sphinxcontrib-devhelp": {
"hashes": [
"sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e",
"sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.2"
},
"sphinxcontrib-htmlhelp": {
"hashes": [
"sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07",
"sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2"
],
"markers": "python_version >= '3.6'",
"version": "==2.0.0"
},
"sphinxcontrib-jsmath": {
"hashes": [
"sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178",
"sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.1"
},
"sphinxcontrib-qthelp": {
"hashes": [
"sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72",
"sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.3"
},
"sphinxcontrib-serializinghtml": {
"hashes": [
"sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd",
"sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952"
],
"markers": "python_version >= '3.5'",
"version": "==1.1.5"
},
"tomli": { "tomli": {
"hashes": [ "hashes": [
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
@@ -1181,6 +1225,22 @@
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==2.0.1" "version": "==2.0.1"
},
"urllib3": {
"hashes": [
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.8"
},
"zipp": {
"hashes": [
"sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d",
"sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375"
],
"markers": "python_version >= '3.7'",
"version": "==3.7.0"
} }
} }
} }

View File

@@ -1,33 +1,47 @@
from typing import List from typing import List
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime from datetime import datetime
import tempfile
import json
import io
from sqlalchemy.orm import registry from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean
import pytesseract import pytesseract
import PIL import PIL
import io
import exiftool import exiftool
import json
import os
from .utils import make_request from .utils import make_request
mapper_registry = registry()
@dataclass @dataclass
class ScraperResult: class ScraperResult:
"""A minimally processed result from a scraper""" """A minimally processed result from a scraper
"""
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
scraper: str scraper: str
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
platform: str platform: str
#: Foreign key of channel ID that this was scraped from
channel: int channel: int
#: String that uniquely identifies the scraped post on the given platform, e.g. ``"1503397267675533313"``
platform_id: str platform_id: str
#: Datetime (relative to UTC) that the scraped post was created at.
date: datetime date: datetime
#: JSON dump of dict that contains all data scraped for the post.
raw_data: str raw_data: str
#: Datetime (relative to UTC) that the scraped post was archived at.
date_archived: datetime date_archived: datetime
#: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files.
archived_urls: dict archived_urls: dict
raw_data_table = Table('raw_data', mapper_registry.metadata, raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('id', Integer, primary_key=True, Column('id', Integer, primary_key=True,
autoincrement=True), autoincrement=True),
@@ -40,22 +54,45 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('date_archived', DateTime), Column('date_archived', DateTime),
Column('archived_urls', JSON)) Column('archived_urls', JSON))
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
@dataclass @dataclass
class Channel: class Channel:
"""Information about a specific channel to be scraped.
"""
#: Name of channel (different from username because it can be non-unique and contain emojis), e.g. ``"T🕊Редакция Президент Гордон🕊"``.
name: str name: str
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
platform_id: str platform_id: str
#: User-specified category for the channel, e.g. ``"explicit_qanon"``.
category: str category: str
#: Name of platform the given channel is on, e.g. ``"Telegram"``.
platform: str platform: str
#: URL for the given channel on the platform, e.g. ``"https://t.me/prezidentgordonteam"``
url: str url: str
#: Screen name/username of channel.
screenname: str screenname: str
#: 2 digit country code for the country of origin for the channel, e.g. ``"RU"``.
country: str = None country: str = None
#: Name of influencer, if channel belongs to an influencer that operates on multiple platforms.
influencer: str = None influencer: str = None
#: Whether or not the channel is publicly-accessible.
public: bool = None public: bool = None
#: Whether or not the channel is a chat (i.e. allows users who are not the channel creator to post/message)
chat: bool = None chat: bool = None
#: Any other additional notes about the channel.
notes: str = "" notes: str = ""
#: Did the channel come from a researcher or a scraping process?
source: str = None source: str = None
def hydrate(self): def hydrate(self):
@@ -82,26 +119,52 @@ mapper_registry.map_imperatively(Channel, channel_table)
@dataclass @dataclass
class Post: class Post:
"""An object with fields for columns in the analysis table""" """An object with fields for columns in the analysis table"""
#: ID number of the scraped post in the ``raw_data`` table
raw_id: int raw_id: int
#: Platform specific post ID
platform_id: str platform_id: str
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
scraper: str scraper: str
#: String specifying name and version of transformer used to transform result, e.g. ``"TwitterTransformer 0.0.1"``.
transformer: str transformer: str
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
platform: str platform: str
#: User-specified integer that uniquely identifies a channel, e.g. ``15``.
channel: int channel: int
#: Datetime (relative to UTC) that the scraped post was created at.
date: datetime date: datetime
#: Datetime (relative to UTC) that the scraped post was archived at.
date_archived: datetime date_archived: datetime
#: URL of the original post
url: str url: str
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
author_id: str author_id: str
#: Username of author who made post.
author_username: str author_username: str
#: Text of the original post
content: str content: str
#: The ID of the Channel that the post was forwarded or quoted from
forwarded_from: int = None forwarded_from: int = None
#: The ID of the Post that this Post is a reply to or reblog of
reply_to: int = None reply_to: int = None
def hydrate(self): def hydrate(self):
pass pass
post_table = Table('posts', mapper_registry.metadata, post_table = Table('posts', mapper_registry.metadata,
Column('id', Integer, primary_key=True, Column('id', Integer, primary_key=True,
autoincrement=True), autoincrement=True),
@@ -125,39 +188,64 @@ mapper_registry.map_imperatively(Post, post_table)
@dataclass @dataclass
class Media: class Media:
"""Base class for organizing information about a media file.
"""
#: ID number of the media's corresponding scraped post in the ``raw_data`` table.
raw_id: int raw_id: int
#: ID number of the media's corresponding scraped post in the ``analysis`` table.
post: int post: int
#: URL of the original post.
url: str url: str
#: Original URL of the media from the original post.
original_url: str original_url: str
#: JSON dump of the dict containing metadata information for the media file.
exif: str = None exif: str = None
def get_blob(self): def get_blob(self):
"""Download media file as bytes blob.
"""
blob = make_request(self.url) blob = make_request(self.url)
return blob.content return blob.content
def hydrate(self, blob = None): def hydrate(self, blob = None):
"""Download media file as bytes blob and extract data from content.
"""
if blob is None: if blob is None:
blob = self.get_blob() blob = self.get_blob()
self.hydrate_exif(blob) self.hydrate_exif(blob)
def hydrate_exif(self, blob): def hydrate_exif(self, blob):
f = open('tmp', 'wb') """Extract Exif metadata from bytes blob.
f.write(blob) """
f.close()
with exiftool.ExifTool() as et: with tempfile.NamedTemporaryFile() as temp_file:
exif = et.get_metadata('tmp') temp_file.write(blob)
self.exif = json.dumps(exif)
os.remove('tmp') with exiftool.ExifTool() as et:
exif = et.get_metadata(temp_file.name)
self.exif = json.dumps(exif)
@dataclass @dataclass
class Image(Media): class Image(Media):
"""Class for organizing information about an image file.
"""
#: Extracted OCR content from image
ocr: str = None ocr: str = None
def hydrate(self, blob=None): def hydrate(self, blob=None):
"""Download image file as bytes blob and extract Exif and OCR content
from the image.
"""
if blob is None: if blob is None:
blob = self.get_blob() blob = self.get_blob()
@@ -165,25 +253,62 @@ class Image(Media):
self.hydrate_ocr(blob) self.hydrate_ocr(blob)
def hydrate_ocr(self, blob): def hydrate_ocr(self, blob):
"""Extract OCR (optical character recognition) data from image bytes blob.
"""
image = PIL.Image.open(io.BytesIO(blob)) image = PIL.Image.open(io.BytesIO(blob))
self.ocr = pytesseract.image_to_string(image) self.ocr = pytesseract.image_to_string(image)
@dataclass @dataclass
class Video(Media): class Video(Media):
"""Class for organizing information about a video file.
"""
pass pass
mapper_registry = registry()
raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('scraper', String),
Column('platform', String),
Column('channel', Integer),
Column('platform_id', String),
Column('date', DateTime),
Column('raw_data', String),
Column('date_archived', DateTime),
Column('archived_urls', JSON))
analysis_table = Table('analysis', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('raw_id', Integer, ForeignKey('raw_data.id')),
Column('scraper', String),
Column('transformer', String),
Column('platform', String),
Column('channel', Integer),
Column('date', DateTime),
Column('date_archived', DateTime),
Column('url', String),
Column('author_id', String),
Column('author_username', String),
Column('content', String))
media_table = Table('media', mapper_registry.metadata, media_table = Table('media', mapper_registry.metadata,
Column('id', Integer, primary_key=True, Column('id', Integer, primary_key=True,
autoincrement=True), autoincrement=True),
Column('type', String), Column('type', String),
Column('raw_id', Integer, ForeignKey('raw_data.id')), Column('raw_id', Integer, ForeignKey('raw_data.id')),
Column('post', Integer, ForeignKey('posts.id')), Column('post', Integer, ForeignKey('posts.id')),
Column('url', String), Column('url', String),
Column('original_url', String), Column('original_url', String),
Column('exif', String), Column('exif', String),
Column('ocr', String) Column('ocr', String))
)
mapper_registry.map_imperatively(TransformedResult, analysis_table)
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media') mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image') mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video') mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')

View File

@@ -14,29 +14,91 @@ from cisticola.base import Channel, ScraperResult, mapper_registry
from cisticola.utils import make_request from cisticola.utils import make_request
class Scraper: class Scraper:
"""Base class for defining platform-specific scrapers for scraping all posts
from a given channel on that specific platform.
"""
__version__ = "Scraper 0.0.0" __version__ = "Scraper 0.0.0"
def __init__(self): def __init__(self):
self.s3_client = boto3.client('s3',
region_name=os.environ['DO_SPACES_REGION'],
endpoint_url='https://{}.digitaloceanspaces.com'.format(
os.environ['DO_SPACES_REGION']),
aws_access_key_id=os.environ['DO_SPACES_KEY'],
aws_secret_access_key=os.environ['DO_SPACES_SECRET'])
# Initialize client to transfer files to the storage archive
self.s3_client = boto3.client(
service_name='s3',
region_name=os.environ['DO_SPACES_REGION'],
endpoint_url=f'https://{os.environ["DO_SPACES_REGION"]}.digitaloceanspaces.com',
aws_access_key_id=os.environ['DO_SPACES_KEY'],
aws_secret_access_key=os.environ['DO_SPACES_SECRET'])
# Define request headers (necessary to bypass scraping protection
# for several platform scrapers)
self.headers = { self.headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'} 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}
pass
def __str__(self): def __str__(self):
return self.__version__ return self.__version__
def get_username_from_url(self, url: str) -> str:
"""Extract a channel's username from its URL.
Parameters
----------
url: str
URL of the channel on a given platform
e.g. ``"https://twitter.com/EliotHiggins"``
Returns
-------
username: str
Extracted username of the channel.
e.g. ``"EliotHiggins"``
"""
raise NotImplementedError
def url_to_key(self, url: str, content_type: str) -> str: def url_to_key(self, url: str, content_type: str) -> str:
"""Generate a unique identifier for media from a specified post.
Parameters
----------
url: str
URL of original post.
e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
content_type: str
Content-Type of media.
e.g. ``"image/jpeg"``
Returns
-------
key: str
Unique identifier for the media file from a specified post based on
the original post URL and the media's Content-Type.
"""
key = urlparse(url).path.split('/')[-1] key = urlparse(url).path.split('/')[-1]
return key return key
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
"""Download media file from a specified media file URL.
Parameters
----------
url: str
URL of media file from original post.
e.g. ``"https://pbs.twimg.com/media/FN0j0dYWUAcQxfK?format=png&name=medium"``
key: str or None
Pre-defined unique identifier for the media file.
Returns
-------
blob: bytes
Raw bytes of the downloaded media file.
content_type: str
Content-Type of media.
e.g. ``"image/jpeg"``.
key: str
Unique identifier for the media file.
"""
r = make_request(url, headers = self.headers) r = make_request(url, headers = self.headers)
@@ -49,6 +111,27 @@ class Scraper:
return blob, content_type, key return blob, content_type, key
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
"""Download media file from a specified media URL, where the media file
is formatted as an m3u8 playlist, which is then decoded to an mp4 file.
Parameters
----------
url: str
URL of m3u8 playlist file from original post.
e.g. ``"https://media.gettr.com/group47/origin/2022/03/15/01/cbc436c1-1a1a-4b97-671d-c42109f3ec9b/out.m3u8"``
key: str or None
Pre-defined unique identifier for the media file.
Returns
-------
blob: bytes
Raw bytes of the downloaded media file.
content_type: str
Content-Type of media.
e.g. ``"video/mp4"``.
key: str
Unique identifier for the media file.
"""
content_type = 'video/mp4' content_type = 'video/mp4'
ext = '.' + content_type.split('/')[-1] ext = '.' + content_type.split('/')[-1]
@@ -71,7 +154,28 @@ class Scraper:
return blob, content_type, key return blob, content_type, key
def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
"""Download media file from a specified media URL, using a fork of
youtube-dl that enables faster downloading.
Parameters
----------
url: str
URL of media file from original post.
e.g. ``"https://rumble.com/embed/vgt7gh/"``
key: str or None
Pre-defined unique identifier for the media file.
Returns
-------
blob: bytes
Raw bytes of the downloaded media file.
content_type: str
Content-Type of media.
e.g. ``"video/mp4"``.
key: str
Unique identifier for the media file.
"""
content_type = 'video/mp4' content_type = 'video/mp4'
with tempfile.TemporaryDirectory() as temp_dir: with tempfile.TemporaryDirectory() as temp_dir:
@@ -103,6 +207,23 @@ class Scraper:
return blob, content_type, key return blob, content_type, key
def archive_blob(self, blob: bytes, content_type: str, key: str) -> str: def archive_blob(self, blob: bytes, content_type: str, key: str) -> str:
"""Upload raw bytes of a media file to the storage archive.
Parameters
----------
blob: bytes
Raw bytes of the media file to be archived.
content_type: str
Content-Type of media.
e.g. ``"video/mp4"``.
key: str
Unique identifier for the media file.
Returns
-------
archived_url: str
URL specifying the file on the storage archive.
"""
filename = self.__version__.replace(' ', '_') + '/' + key filename = self.__version__.replace(' ', '_') + '/' + key
@@ -114,9 +235,42 @@ class Scraper:
return archived_url return archived_url
def can_handle(self, channel: Channel) -> bool: def can_handle(self, channel: Channel) -> bool:
"""Whether or not the scraper can scrape the specified channel.
Parameters
----------
channel: Channel
Channel to be scraped.
Returns
-------
bool
``True`` if the scraper is capable of scraping ``channel``,
``False`` if not.
"""
raise NotImplementedError raise NotImplementedError
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
"""Scrape all posts from the specified Channel.
Parameters
----------
channel: Channel
Channel to be scraped.
since: ScraperResult or None
Most recently scraped ScraperResult from a previous scrape, or
``None`` if scraper has not run before.
archive_media: bool
If ``True``, any media files (images, video, etc.) from posts are archived.
If ``False``, media files are not archived.
Yields
------
ScraperResult
Scraper result from a single post/comment from the specified Channel.
"""
raise NotImplementedError raise NotImplementedError
@@ -129,9 +283,13 @@ class ScraperController:
self.session = None self.session = None
def register_scraper(self, scraper: Scraper): def register_scraper(self, scraper: Scraper):
"""Register a single Scraper instance to the controller.
"""
self.scrapers.append(scraper) self.scrapers.append(scraper)
def register_scrapers(self, scraper: List[Scraper]): def register_scrapers(self, scraper: List[Scraper]):
"""Register a list of Scraper instances to the controller.
"""
self.scrapers.extend(scraper) self.scrapers.extend(scraper)
def scrape_all_channels(self, archive_media: bool = True): def scrape_all_channels(self, archive_media: bool = True):
@@ -147,6 +305,17 @@ class ScraperController:
@logger.catch(reraise = True) @logger.catch(reraise = True)
def scrape_channels(self, channels: List[Channel], archive_media: bool = True): def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
"""Scrape all posts for all specified channels.
Parameters
----------
channels: list<Channel>
List of Channel instances to be scraped
archive_media: bool
If ``True``, any media files (images, video, etc.) from posts are archived.
If ``False``, media files are not archived.
"""
if self.session is None: if self.session is None:
logger.error("No DB session") logger.error("No DB session")
return return
@@ -185,6 +354,9 @@ class ScraperController:
logger.warning(f"No handler found for Channel {channel}") logger.warning(f"No handler found for Channel {channel}")
def connect_to_db(self, engine): def connect_to_db(self, engine):
"""Connect the specified SQLAlchemy engine to the controller.
"""
# create tables # create tables
mapper_registry.metadata.create_all(bind=engine) mapper_registry.metadata.create_all(bind=engine)
@@ -193,8 +365,8 @@ class ScraperController:
self.session.configure(bind=self.engine) self.session.configure(bind=self.engine)
def reset_db(self): def reset_db(self):
"""Drop all data from the connected SQLAlchemy database.
"""
mapper_registry.metadata.drop_all(bind=self.engine) mapper_registry.metadata.drop_all(bind=self.engine)
self.connect_to_db(self.engine) self.connect_to_db(self.engine)

View File

@@ -17,7 +17,7 @@ class BitchuteScraper(Scraper):
library""" library"""
__version__ = "BitchuteScraper 0.0.1" __version__ = "BitchuteScraper 0.0.1"
def get_username_from_url(url): def get_username_from_url(self, url):
username = url.split('bitchute.com/channel/')[-1].strip('/') username = url.split('bitchute.com/channel/')[-1].strip('/')
return username return username
@@ -33,7 +33,7 @@ class BitchuteScraper(Scraper):
detail = 'comments' detail = 'comments'
username = BitchuteScraper.get_username_from_url(channel.url) username = self.get_username_from_url(channel.url)
scraper = get_videos_user(session, username, csrftoken, detail) scraper = get_videos_user(session, username, csrftoken, detail)
for post in scraper: for post in scraper:
@@ -61,7 +61,7 @@ class BitchuteScraper(Scraper):
archived_urls=archived_urls) archived_urls=archived_urls)
def can_handle(self, channel): def can_handle(self, channel):
if channel.platform == "Bitchute" and BitchuteScraper.get_username_from_url(channel.url) is not None: if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
return True return True
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -11,14 +11,14 @@ class GabScraper(Scraper):
"""An implementation of a Scraper for Gab, using GARC library""" """An implementation of a Scraper for Gab, using GARC library"""
__version__ = "GabScraper 0.0.1" __version__ = "GabScraper 0.0.1"
def get_username_from_url(url): def get_username_from_url(self, url):
username = url.split('https://gab.com/')[-1] username = url.split('https://gab.com/')[-1]
return username return username
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
client = Garc(profile = 'main') client = Garc(profile = 'main')
username = GabScraper.get_username_from_url(channel.url) username = self.get_username_from_url(channel.url)
scraper = client.userposts(username) scraper = client.userposts(username)
@@ -52,5 +52,5 @@ class GabScraper(Scraper):
archived_urls=archived_urls) archived_urls=archived_urls)
def can_handle(self, channel): def can_handle(self, channel):
if channel.platform == "Gab" and GabScraper.get_username_from_url(channel.url) is not None: if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:
return True return True

View File

@@ -12,7 +12,7 @@ class GettrScraper(Scraper):
"""An implementation of a Scraper for Gettr, using gogettr library""" """An implementation of a Scraper for Gettr, using gogettr library"""
__version__ = "GettrScraper 0.0.1" __version__ = "GettrScraper 0.0.1"
def get_username_from_url(url): def get_username_from_url(self, url):
username = url.split("gettr.com/user/")[1] username = url.split("gettr.com/user/")[1]
if len(username.split("/")) > 1: if len(username.split("/")) > 1:
return None return None
@@ -21,7 +21,7 @@ class GettrScraper(Scraper):
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
client = PublicClient() client = PublicClient()
username = GettrScraper.get_username_from_url(channel.url) username = self.get_username_from_url(channel.url)
scraper = client.user_activity(username=username, type="posts") scraper = client.user_activity(username=username, type="posts")
for post in scraper: for post in scraper:
@@ -62,7 +62,7 @@ class GettrScraper(Scraper):
archived_urls=archived_urls) archived_urls=archived_urls)
def can_handle(self, channel): def can_handle(self, channel):
if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None: if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None:
return True return True
def url_to_key(self, url: str, content_type: str) -> str: def url_to_key(self, url: str, content_type: str) -> str:

View File

@@ -18,6 +18,7 @@ CONTENT_TYPES = {
'mp4' : 'video/mp4'} 'mp4' : 'video/mp4'}
class InstagramScraper(Scraper): class InstagramScraper(Scraper):
"""An implementation of a Scraper for Instagram, using instaloader library"""
__version__ = "InstagramScraper 0.0.1" __version__ = "InstagramScraper 0.0.1"
def get_username_from_url(self, url): def get_username_from_url(self, url):

View File

@@ -13,7 +13,7 @@ class OdyseeScraper(Scraper):
"""An implementation of a Scraper for Odysee, using polyphemus library""" """An implementation of a Scraper for Odysee, using polyphemus library"""
__version__ = "OdyseeScraper 0.0.1" __version__ = "OdyseeScraper 0.0.1"
def get_username_from_url(url): def get_username_from_url(self, url):
username = url.split('odysee.com/')[-1].strip('@').split(':')[0] username = url.split('odysee.com/')[-1].strip('@').split(':')[0]
@@ -21,7 +21,7 @@ class OdyseeScraper(Scraper):
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = OdyseeScraper.get_username_from_url(channel.url) username = self.get_username_from_url(channel.url)
odysee_channel = OdyseeChannel(channel_name = username) odysee_channel = OdyseeChannel(channel_name = username)
all_videos = odysee_channel.get_all_videos() all_videos = odysee_channel.get_all_videos()
@@ -70,7 +70,7 @@ class OdyseeScraper(Scraper):
archived_urls={}) archived_urls={})
def can_handle(self, channel): def can_handle(self, channel):
if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None: if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None:
return True return True
def url_to_key(self, url: str, content_type: str) -> str: def url_to_key(self, url: str, content_type: str) -> str:

View File

@@ -14,14 +14,14 @@ class RumbleScraper(Scraper):
"""An implementation of a Scraper for Rumble, using custom functions""" """An implementation of a Scraper for Rumble, using custom functions"""
__version__ = "RumbleScraper 0.0.1" __version__ = "RumbleScraper 0.0.1"
def get_username_from_url(url): def get_username_from_url(self, url):
username = url.split('https://rumble.com/c/')[1] username = url.split('https://rumble.com/c/')[1]
return username return username
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = RumbleScraper.get_username_from_url(channel.url) username = self.get_username_from_url(channel.url)
scraper = get_channel_videos(username) scraper = get_channel_videos(username)
for post in scraper: for post in scraper:
@@ -54,7 +54,7 @@ class RumbleScraper(Scraper):
return key return key
def can_handle(self, channel): def can_handle(self, channel):
if channel.platform == "Rumble" and RumbleScraper.get_username_from_url(channel.url) is not None: if channel.platform == "Rumble" and self.get_username_from_url(channel.url) is not None:
return True return True
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -8,6 +8,7 @@ from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper from cisticola.scraper.base import Scraper
class TelegramSnscrapeScraper(Scraper): class TelegramSnscrapeScraper(Scraper):
"""An implementation of a Scraper for Telegram, using snscrape library"""
__version__ = "TelegramSnscrapeScraper 0.0.1" __version__ = "TelegramSnscrapeScraper 0.0.1"
def can_handle(self, channel): def can_handle(self, channel):

View File

@@ -14,6 +14,7 @@ from cisticola.scraper.base import Scraper
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage'] MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
class TelegramTelethonScraper(Scraper): class TelegramTelethonScraper(Scraper):
"""An implementation of a Scraper for Telegram, using Telethon library"""
__version__ = "TelegramTelethonScraper 0.0.1" __version__ = "TelegramTelethonScraper 0.0.1"
def get_username_from_url(self, url): def get_username_from_url(self, url):
@@ -30,9 +31,9 @@ class TelegramTelethonScraper(Scraper):
username = self.get_username_from_url(channel.url) username = self.get_username_from_url(channel.url)
api_id = os.environ['TELEGRAM_API_ID_1'] api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH_1'] api_hash = os.environ['TELEGRAM_API_HASH']
phone = os.environ['TELEGRAM_PHONE_1'] phone = os.environ['TELEGRAM_PHONE']
with TelegramClient(phone, api_id, api_hash) as client: with TelegramClient(phone, api_id, api_hash) as client:

View File

@@ -8,12 +8,24 @@ SPHINXBUILD ?= sphinx-build
SOURCEDIR = source SOURCEDIR = source
BUILDDIR = build BUILDDIR = build
SPHINXAPIDOC = sphinx-apidoc
APIDOCFLAGS = --separate --private --module-first
MODULEPATH = ../cisticola
SOURCEFILES = cisticola.*
MODULEFILE = modules.rst
# Put it first so that "make" without argument is like "make help". # Put it first so that "make" without argument is like "make help".
help: help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile .PHONY: help Makefile
# Custom process and flags for generating Sphinx sources
apidoc:
rm $(SOURCEDIR)/$(SOURCEFILES)
$(SPHINXAPIDOC) $(APIDOCFLAGS) -o "$(SOURCEDIR)" "$(MODULEPATH)"
rm $(SOURCEDIR)/$(MODULEFILE)
# Catch-all target: route all unknown targets to Sphinx using the new # Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile %: Makefile

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 7.0 KiB

BIN
docs/images/favicon.ico Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 614 B

View File

@@ -10,6 +10,12 @@ if "%SPHINXBUILD%" == "" (
set SOURCEDIR=source set SOURCEDIR=source
set BUILDDIR=build set BUILDDIR=build
set SPHINXAPIDOC=sphinx-apidoc
set APIDOCFLAGS=--separate --private --module-first
set MODULEPATH=../cisticola
set SOURCEFILES=cisticola.*
set MODULEFILE=modules.rst
if "%1" == "" goto help if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL %SPHINXBUILD% >NUL 2>NUL
@@ -28,6 +34,11 @@ if errorlevel 9009 (
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end goto end
:apidoc
del %SOURCEDIR%\%SOURCEFILES%
%SPHINXAPIDOC% %APIDOCFLAGS% -o %SOURCEDIR% %MODULEPATH%
del %SOURCEDIR%\%MODULEFILE%
:help :help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

View File

@@ -0,0 +1,8 @@
cisticola.base module
=====================
.. automodule:: cisticola.base
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -1,6 +1,12 @@
cisticola package cisticola package
================= =================
.. automodule:: cisticola
:members:
:undoc-members:
:show-inheritance:
:private-members:
Subpackages Subpackages
----------- -----------
@@ -13,18 +19,8 @@ Subpackages
Submodules Submodules
---------- ----------
cisticola.base module .. toctree::
--------------------- :maxdepth: 4
.. automodule:: cisticola.base cisticola.base
:members: cisticola.utils
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: cisticola
:members:
:undoc-members:
:show-inheritance:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.base module
=============================
.. automodule:: cisticola.scraper.base
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.bitchute module
=================================
.. automodule:: cisticola.scraper.bitchute
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.gab module
============================
.. automodule:: cisticola.scraper.gab
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.gettr module
==============================
.. automodule:: cisticola.scraper.gettr
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.instagram module
==================================
.. automodule:: cisticola.scraper.instagram
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.odysee module
===============================
.. automodule:: cisticola.scraper.odysee
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -1,37 +1,27 @@
cisticola.scraper package cisticola.scraper package
========================= =========================
Submodules
----------
cisticola.scraper.bitchute module
---------------------------------
.. automodule:: cisticola.scraper.bitchute
:members:
:undoc-members:
:show-inheritance:
cisticola.scraper.gettr module
------------------------------
.. automodule:: cisticola.scraper.gettr
:members:
:undoc-members:
:show-inheritance:
cisticola.scraper.twitter module
--------------------------------
.. automodule:: cisticola.scraper.twitter
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: cisticola.scraper .. automodule:: cisticola.scraper
:members: :members:
:undoc-members: :undoc-members:
:show-inheritance: :show-inheritance:
:private-members:
Submodules
----------
.. toctree::
:maxdepth: 4
cisticola.scraper.base
cisticola.scraper.bitchute
cisticola.scraper.gab
cisticola.scraper.gettr
cisticola.scraper.instagram
cisticola.scraper.odysee
cisticola.scraper.rumble
cisticola.scraper.telegram_snscrape
cisticola.scraper.telegram_telethon
cisticola.scraper.twitter
cisticola.scraper.vkontakte
cisticola.scraper.youtube

View File

@@ -0,0 +1,8 @@
cisticola.scraper.rumble module
===============================
.. automodule:: cisticola.scraper.rumble
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.telegram\_snscrape module
===========================================
.. automodule:: cisticola.scraper.telegram_snscrape
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.telegram\_telethon module
===========================================
.. automodule:: cisticola.scraper.telegram_telethon
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.twitter module
================================
.. automodule:: cisticola.scraper.twitter
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.vkontakte module
==================================
.. automodule:: cisticola.scraper.vkontakte
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.youtube module
================================
.. automodule:: cisticola.scraper.youtube
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.transformer.base module
=================================
.. automodule:: cisticola.transformer.base
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.transformer.bitchute module
=====================================
.. automodule:: cisticola.transformer.bitchute
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -1,21 +1,18 @@
cisticola.transformer package cisticola.transformer package
============================= =============================
Submodules
----------
cisticola.transformer.twitter module
------------------------------------
.. automodule:: cisticola.transformer.twitter
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: cisticola.transformer .. automodule:: cisticola.transformer
:members: :members:
:undoc-members: :undoc-members:
:show-inheritance: :show-inheritance:
:private-members:
Submodules
----------
.. toctree::
:maxdepth: 4
cisticola.transformer.base
cisticola.transformer.bitchute
cisticola.transformer.twitter

View File

@@ -0,0 +1,8 @@
cisticola.transformer.twitter module
====================================
.. automodule:: cisticola.transformer.twitter
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.utils module
======================
.. automodule:: cisticola.utils
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -43,9 +43,18 @@ exclude_patterns = []
# The theme to use for HTML and HTML Help pages. See the documentation for # The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes. # a list of builtin themes.
# #
html_theme = 'alabaster' html_theme = 'sphinx_rtd_theme'
# Add any paths that contain custom static files (such as style sheets) here, # Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files, # relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css". # so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = [] html_static_path = []
# -- Default flags for autodoc------------------------------------------------
autodoc_default_options = {'exclude-members': '_sa_class_manager'}
html_favicon = '../images/favicon.ico'
html_logo = '../images/cisticola_logo.svg'
html_theme_options = {'style_nav_header_background': '#000000'}

View File

@@ -2,16 +2,7 @@ Welcome to Cisticola's documentation!
===================================== =====================================
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 1
:caption: Contents:
modules quickstart
cisticola
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

View File

@@ -1,7 +0,0 @@
cisticola
=========
.. toctree::
:maxdepth: 4
cisticola

View File

@@ -0,0 +1,96 @@
Quickstart
==========
Installation
------------
The *cisticola* application uses pipenv_ for dependency management. To install the dependencies of *cisticola*, first install pipenv using the following command:
.. code-block::
pip install pipenv
and then install the dependencies using the following command from the package root directory:
.. code-block::
pipenv install
To install the necessary dependencies for building the documentation and running unit tests, run the following command from the package root directory:
.. code-block::
pipenv install --dev
Environment Variables
---------------------
Three of the scrapers in *cisticola* (:py:class:`~cisticola.scraper.gab.GabScraper`, :py:class:`~cisticola.scraper.instagram.InstagramScraper`, and :py:class:`~cisticola.scraper.telegram_telethon.TelegramTelethonScraper`) require platform credentials to work correctly.
Gab
"""
The Gab credentials can be configured by running the following command from the root directory:
.. code-block::
pipenv run garc configure
which will direct you to provide the username and password for your Gab account.
Instagram
"""""""""
The Instagram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment:
- ``INSTAGRAM_USERNAME``: username of your Instagram account
- ``INSTAGRAM_PASSWORD``: password of your Instagram account
Telegram Telethon
"""""""""""""""""
The Telegram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment:
- ``TELEGRAM_API_ID``: API ID number for your Telegram application
- ``TELEGRAM_API_HASH``: API hash for your Telegram application
- ``TELEGRAM_PHONE``: phone number for the account corresponding to your Telegram application
If you do not already have a Telegram application, you can create one by following the instructions on `this page`_.
Documentation
-------------
The *cisticola* application uses Sphinx_ to generate and display its documentation. To build the documentation in the HTML format, run the following command from the ``docs/`` directory:
.. code-block::
pipenv run make html
For developers, if changes are made to the package structure or additional modules are created, you can update the Sphinx source ``*.rst`` files by running the following command from the ``docs/`` directory:
.. code-block::
pipenv run make apidoc
Testing
-------
The *cisticola* application uses pytest_ for unit testing. To run the test suite, run the following command from the package root directory:
.. code-block::
pipenv run pytest
Examples
--------
An example of a *cisticola* ingest file ``russian_telegram_ingest.py`` is included in the package root directory, showing how the list of channels to scrape is defined, and how the :py:class:`~cisticola.scraper.base.ScraperController` and :py:class:`~cisticola.transformer.base.Transformer` classes are used. To run the ingest script, run the following command from the package root directory:
.. code-block::
pipenv run python russian_telegram_ingest.py
.. _pipenv: https://pipenv.pypa.io/en/latest/
.. _Sphinx: https://www.sphinx-doc.org/en/master/
.. _pytest: https://docs.pytest.org/en/7.1.x/
.. _this page: https://core.telegram.org/api/obtaining_api_id

View File

@@ -1,6 +1,6 @@
[pytest] [pytest]
minversion = minversion =
6.0.2 7.0.0
testpaths = testpaths =
tests/ tests/
python_files = python_files =
@@ -13,4 +13,5 @@ addopts =
--self-contained-html --self-contained-html
filterwarnings = filterwarnings =
ignore:the imp module is deprecated:DeprecationWarning ignore:the imp module is deprecated:DeprecationWarning
ignore:The localize method is no longer necessary, as this time zone supports the fold attribute ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
ignore:invalid escape sequence:DeprecationWarning