diff --git a/Pipfile b/Pipfile index ba81a5c..47ccb65 100644 --- a/Pipfile +++ b/Pipfile @@ -10,7 +10,6 @@ gogettr = "*" requests = "*" bs4 = "*" dateparser = "*" -sphinx = "*" boto3 = "*" snscrape = {git = "https://github.com/bellingcat/snscrape.git"} ffmpeg-python = "*" @@ -29,6 +28,8 @@ pytest-cov = "*" pytest-html = "*" pytest-metadata = "*" black = "*" +sphinx = "*" +sphinx_rtd_theme = "*" [requires] python_version = "3.9" diff --git a/Pipfile.lock b/Pipfile.lock index 50622b1..b83b155 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "d465f2d09a728ee76cb0af521890ecc1e1bce672acbd1caf2e4d01b6567480d5" + "sha256": "e3b96b0ac8c80d4817f9adac4ab171bf4b7e07e80927c7b152a24e8bbdbf7faa" }, "pipfile-spec": 6, "requires": { @@ -16,13 +16,6 @@ ] }, "default": { - "alabaster": { - "hashes": [ - "sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359", - "sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02" - ], - "version": "==0.7.12" - }, "attrs": { "hashes": [ "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4", @@ -31,14 +24,6 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==21.4.0" }, - "babel": { - "hashes": [ - "sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9", - "sha256:bc0c176f9f6a994582230df350aa6e05ba2ebe4b3ac317eab29d9be5d2768da0" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==2.9.1" - }, "beautifulsoup4": { "hashes": [ "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf", @@ -49,19 +34,19 @@ }, "boto3": { "hashes": [ - "sha256:8d6f3c548f0ee03d742f404c96515e7579fc6968135aaa50dd855a046698ff79", - "sha256:d857feb6af9932e1ee3a748060a2cd9fd6043dbbccf66976eda54586597efdc0" + "sha256:76d5b90400c54b25278150768e946edf166acce2c1597c0ecfbebb1dbe9acf2c", + 
"sha256:7bb2e6506a6ad44d111dd20a5d510374b6958fe989b4ef887109c79d812f926f" ], "index": "pypi", - "version": "==1.21.18" + "version": "==1.21.19" }, "botocore": { "hashes": [ - "sha256:7ea8ef1ff7c4882ab59b337662f90ddf5ea860e95e7e209dca593a34ea585b1b", - "sha256:d2da7ccbc5ddd61fe3cd45fcbd3de380d9e3a15bfa8fbfd2d9259a93dcc60c56" + "sha256:5ed2be0e413961134f4c17eab16396d41a5b4b73a637588260c04d20806d52ea", + "sha256:d0d77bce152ca51f3c2cd0f9bf05cb3b623e719406ad58b4c20444e237fe82eb" ], "markers": "python_version >= '3.6'", - "version": "==1.24.18" + "version": "==1.24.19" }, "brotli": { "hashes": [ @@ -169,14 +154,6 @@ "index": "pypi", "version": "==1.1.0" }, - "docutils": { - "hashes": [ - "sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125", - "sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==0.17.1" - }, "ffmpeg-python": { "hashes": [ "sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127", @@ -284,22 +261,6 @@ "markers": "python_version >= '3'", "version": "==3.3" }, - "imagesize": { - "hashes": [ - "sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c", - "sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==1.3.0" - }, - "importlib-metadata": { - "hashes": [ - "sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6", - "sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539" - ], - "markers": "python_version < '3.10'", - "version": "==4.11.3" - }, "iniconfig": { "hashes": [ "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", @@ -314,14 +275,6 @@ "index": "pypi", "version": "==4.8.4" }, - "jinja2": { - "hashes": [ - "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8", - 
"sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7" - ], - "markers": "python_version >= '3.6'", - "version": "==3.0.3" - }, "jmespath": { "hashes": [ "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9", @@ -405,52 +358,6 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==4.8.0" }, - "markupsafe": { - "hashes": [ - "sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3", - "sha256:09c86c9643cceb1d87ca08cdc30160d1b7ab49a8a21564868921959bd16441b8", - "sha256:142119fb14a1ef6d758912b25c4e803c3ff66920635c44078666fe7cc3f8f759", - "sha256:1d1fb9b2eec3c9714dd936860850300b51dbaa37404209c8d4cb66547884b7ed", - "sha256:204730fd5fe2fe3b1e9ccadb2bd18ba8712b111dcabce185af0b3b5285a7c989", - "sha256:24c3be29abb6b34052fd26fc7a8e0a49b1ee9d282e3665e8ad09a0a68faee5b3", - "sha256:290b02bab3c9e216da57c1d11d2ba73a9f73a614bbdcc027d299a60cdfabb11a", - "sha256:3028252424c72b2602a323f70fbf50aa80a5d3aa616ea6add4ba21ae9cc9da4c", - "sha256:30c653fde75a6e5eb814d2a0a89378f83d1d3f502ab710904ee585c38888816c", - "sha256:3cace1837bc84e63b3fd2dfce37f08f8c18aeb81ef5cf6bb9b51f625cb4e6cd8", - "sha256:4056f752015dfa9828dce3140dbadd543b555afb3252507348c493def166d454", - "sha256:454ffc1cbb75227d15667c09f164a0099159da0c1f3d2636aa648f12675491ad", - "sha256:598b65d74615c021423bd45c2bc5e9b59539c875a9bdb7e5f2a6b92dfcfc268d", - "sha256:599941da468f2cf22bf90a84f6e2a65524e87be2fce844f96f2dd9a6c9d1e635", - "sha256:5ddea4c352a488b5e1069069f2f501006b1a4362cb906bee9a193ef1245a7a61", - "sha256:62c0285e91414f5c8f621a17b69fc0088394ccdaa961ef469e833dbff64bd5ea", - "sha256:679cbb78914ab212c49c67ba2c7396dc599a8479de51b9a87b174700abd9ea49", - "sha256:6e104c0c2b4cd765b4e83909cde7ec61a1e313f8a75775897db321450e928cce", - "sha256:736895a020e31b428b3382a7887bfea96102c529530299f426bf2e636aacec9e", - "sha256:75bb36f134883fdbe13d8e63b8675f5f12b80bb6627f7714c7d6c5becf22719f", - 
"sha256:7d2f5d97fcbd004c03df8d8fe2b973fe2b14e7bfeb2cfa012eaa8759ce9a762f", - "sha256:80beaf63ddfbc64a0452b841d8036ca0611e049650e20afcb882f5d3c266d65f", - "sha256:84ad5e29bf8bab3ad70fd707d3c05524862bddc54dc040982b0dbcff36481de7", - "sha256:8da5924cb1f9064589767b0f3fc39d03e3d0fb5aa29e0cb21d43106519bd624a", - "sha256:961eb86e5be7d0973789f30ebcf6caab60b844203f4396ece27310295a6082c7", - "sha256:96de1932237abe0a13ba68b63e94113678c379dca45afa040a17b6e1ad7ed076", - "sha256:a0a0abef2ca47b33fb615b491ce31b055ef2430de52c5b3fb19a4042dbc5cadb", - "sha256:b2a5a856019d2833c56a3dcac1b80fe795c95f401818ea963594b345929dffa7", - "sha256:b8811d48078d1cf2a6863dafb896e68406c5f513048451cd2ded0473133473c7", - "sha256:c532d5ab79be0199fa2658e24a02fce8542df196e60665dd322409a03db6a52c", - "sha256:d3b64c65328cb4cd252c94f83e66e3d7acf8891e60ebf588d7b493a55a1dbf26", - "sha256:d4e702eea4a2903441f2735799d217f4ac1b55f7d8ad96ab7d4e25417cb0827c", - "sha256:d5653619b3eb5cbd35bfba3c12d575db2a74d15e0e1c08bf1db788069d410ce8", - "sha256:d66624f04de4af8bbf1c7f21cc06649c1c69a7f84109179add573ce35e46d448", - "sha256:e67ec74fada3841b8c5f4c4f197bea916025cb9aa3fe5abf7d52b655d042f956", - "sha256:e6f7f3f41faffaea6596da86ecc2389672fa949bd035251eab26dc6697451d05", - "sha256:f02cf7221d5cd915d7fa58ab64f7ee6dd0f6cddbb48683debf5d04ae9b1c2cc1", - "sha256:f0eddfcabd6936558ec020130f932d479930581171368fd728efcfb6ef0dd357", - "sha256:fabbe18087c3d33c5824cb145ffca52eccd053061df1d79d4b66dafa5ad2a5ea", - "sha256:fc3150f85e2dbcf99e65238c842d1cfe69d3e7649b19864c1cc043213d9cd730" - ], - "markers": "python_version >= '3.7'", - "version": "==2.1.0" - }, "mutagen": { "hashes": [ "sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1", @@ -642,14 +549,6 @@ "git": "https://github.com/smarnach/pyexiftool.git", "ref": "3db3764895e687d75b42d3ae4e554ca8664a7f6f" }, - "pygments": { - "hashes": [ - "sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65", - 
"sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a" - ], - "markers": "python_version >= '3.5'", - "version": "==2.11.2" - }, "pyparsing": { "hashes": [ "sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea", @@ -786,6 +685,9 @@ "version": "==2022.3.2" }, "requests": { + "extras": [ + "socks" + ], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" @@ -817,13 +719,6 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.16.0" }, - "snowballstemmer": { - "hashes": [ - "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1", - "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a" - ], - "version": "==2.2.0" - }, "snscrape": { "git": "https://github.com/bellingcat/snscrape.git", "ref": "de4ebed81f3f6a4bb4c65630daab6ec63784959b" @@ -836,62 +731,6 @@ "markers": "python_version >= '3.6'", "version": "==2.3.1" }, - "sphinx": { - "hashes": [ - "sha256:5da895959511473857b6d0200f56865ed62c31e8f82dd338063b84ec022701fe", - "sha256:6caad9786055cb1fa22b4a365c1775816b876f91966481765d7d50e9f0dd35cc" - ], - "index": "pypi", - "version": "==4.4.0" - }, - "sphinxcontrib-applehelp": { - "hashes": [ - "sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a", - "sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58" - ], - "markers": "python_version >= '3.5'", - "version": "==1.0.2" - }, - "sphinxcontrib-devhelp": { - "hashes": [ - "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e", - "sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4" - ], - "markers": "python_version >= '3.5'", - "version": "==1.0.2" - }, - "sphinxcontrib-htmlhelp": { - "hashes": [ - "sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07", - 
"sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2" - ], - "markers": "python_version >= '3.6'", - "version": "==2.0.0" - }, - "sphinxcontrib-jsmath": { - "hashes": [ - "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", - "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8" - ], - "markers": "python_version >= '3.5'", - "version": "==1.0.1" - }, - "sphinxcontrib-qthelp": { - "hashes": [ - "sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72", - "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6" - ], - "markers": "python_version >= '3.5'", - "version": "==1.0.3" - }, - "sphinxcontrib-serializinghtml": { - "hashes": [ - "sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd", - "sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952" - ], - "markers": "python_version >= '3.5'", - "version": "==1.1.5" - }, "sqlalchemy": { "hashes": [ "sha256:04164e0063feb7aedd9d073db0fd496edb244be40d46ea1f0d8990815e4b8c34", @@ -1034,17 +873,16 @@ ], "index": "pypi", "version": "==2022.3.8.2" - }, - "zipp": { - "hashes": [ - "sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d", - "sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375" - ], - "markers": "python_version >= '3.7'", - "version": "==3.7.0" } }, "develop": { + "alabaster": { + "hashes": [ + "sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359", + "sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02" + ], + "version": "==0.7.12" + }, "attrs": { "hashes": [ "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4", @@ -1053,6 +891,29 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==21.4.0" }, + "babel": { + "hashes": [ + "sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9", + 
"sha256:bc0c176f9f6a994582230df350aa6e05ba2ebe4b3ac317eab29d9be5d2768da0" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.9.1" + }, + "certifi": { + "hashes": [ + "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872", + "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569" + ], + "version": "==2021.10.8" + }, + "charset-normalizer": { + "hashes": [ + "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597", + "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df" + ], + "markers": "python_version >= '3'", + "version": "==2.0.12" + }, "coverage": { "extras": [ "toml" @@ -1103,6 +964,38 @@ "markers": "python_version >= '3.7'", "version": "==6.3.2" }, + "docutils": { + "hashes": [ + "sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125", + "sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==0.17.1" + }, + "idna": { + "hashes": [ + "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", + "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" + ], + "markers": "python_version >= '3'", + "version": "==3.3" + }, + "imagesize": { + "hashes": [ + "sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c", + "sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.3.0" + }, + "importlib-metadata": { + "hashes": [ + "sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6", + "sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539" + ], + "markers": "python_version < '3.10'", + "version": "==4.11.3" + }, "iniconfig": { "hashes": [ 
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", @@ -1110,6 +1003,60 @@ ], "version": "==1.1.1" }, + "jinja2": { + "hashes": [ + "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8", + "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7" + ], + "markers": "python_version >= '3.6'", + "version": "==3.0.3" + }, + "markupsafe": { + "hashes": [ + "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003", + "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88", + "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5", + "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7", + "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a", + "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603", + "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1", + "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135", + "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247", + "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6", + "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601", + "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77", + "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02", + "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e", + "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63", + "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f", + "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980", + "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b", + "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812", + "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff", + 
"sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96", + "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1", + "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925", + "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a", + "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6", + "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e", + "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f", + "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4", + "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f", + "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3", + "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c", + "sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a", + "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417", + "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a", + "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a", + "sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37", + "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452", + "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933", + "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a", + "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7" + ], + "markers": "python_version >= '3.7'", + "version": "==2.1.1" + }, "packaging": { "hashes": [ "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", @@ -1134,6 +1081,14 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==1.11.0" }, + "pygments": { + "hashes": [ + "sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65", + 
"sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a" + ], + "markers": "python_version >= '3.5'", + "version": "==2.11.2" + }, "pyparsing": { "hashes": [ "sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea", @@ -1174,6 +1129,95 @@ "index": "pypi", "version": "==1.11.0" }, + "pytz": { + "hashes": [ + "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c", + "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326" + ], + "version": "==2021.3" + }, + "requests": { + "extras": [ + "socks" + ], + "hashes": [ + "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", + "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" + ], + "index": "pypi", + "version": "==2.27.1" + }, + "snowballstemmer": { + "hashes": [ + "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1", + "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a" + ], + "version": "==2.2.0" + }, + "sphinx": { + "hashes": [ + "sha256:5da895959511473857b6d0200f56865ed62c31e8f82dd338063b84ec022701fe", + "sha256:6caad9786055cb1fa22b4a365c1775816b876f91966481765d7d50e9f0dd35cc" + ], + "index": "pypi", + "version": "==4.4.0" + }, + "sphinx-rtd-theme": { + "hashes": [ + "sha256:4d35a56f4508cfee4c4fb604373ede6feae2a306731d533f409ef5c3496fdbd8", + "sha256:eec6d497e4c2195fa0e8b2016b337532b8a699a68bcb22a512870e16925c6a5c" + ], + "index": "pypi", + "version": "==1.0.0" + }, + "sphinxcontrib-applehelp": { + "hashes": [ + "sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a", + "sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.2" + }, + "sphinxcontrib-devhelp": { + "hashes": [ + "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e", + "sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4" + ], + "markers": "python_version >= 
'3.5'", + "version": "==1.0.2" + }, + "sphinxcontrib-htmlhelp": { + "hashes": [ + "sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07", + "sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2" + ], + "markers": "python_version >= '3.6'", + "version": "==2.0.0" + }, + "sphinxcontrib-jsmath": { + "hashes": [ + "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", + "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.1" + }, + "sphinxcontrib-qthelp": { + "hashes": [ + "sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72", + "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.3" + }, + "sphinxcontrib-serializinghtml": { + "hashes": [ + "sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd", + "sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952" + ], + "markers": "python_version >= '3.5'", + "version": "==1.1.5" + }, "tomli": { "hashes": [ "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", @@ -1181,6 +1225,22 @@ ], "markers": "python_version >= '3.7'", "version": "==2.0.1" + }, + "urllib3": { + "hashes": [ + "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", + "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", + "version": "==1.26.8" + }, + "zipp": { + "hashes": [ + "sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d", + "sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375" + ], + "markers": "python_version >= '3.7'", + "version": "==3.7.0" } } } diff --git a/cisticola/base.py b/cisticola/base.py index b58926b..d5b10e8 100644 --- a/cisticola/base.py 
+++ b/cisticola/base.py @@ -1,33 +1,47 @@ from typing import List from dataclasses import dataclass from datetime import datetime +import tempfile +import json +import io + from sqlalchemy.orm import registry from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean import pytesseract import PIL -import io import exiftool -import json -import os from .utils import make_request -mapper_registry = registry() - @dataclass class ScraperResult: - """A minimally processed result from a scraper""" + """A minimally processed result from a scraper + """ + #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``. scraper: str + + #: Name of platform from which result was scraped, e.g. ``"Twitter"``. platform: str + + #: Foreign key of channel ID that this was scraped from channel: int + + #: String that uniquely identifies the scraped post on the given platform, e.g. ``"1503397267675533313"`` platform_id: str + + #: Datetime (relative to UTC) that the scraped post was created at. date: datetime + + #: JSON dump of dict that contains all data scraped for the post. raw_data: str + + #: Datetime (relative to UTC) that the scraped post was archived at. date_archived: datetime + + #: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files. archived_urls: dict - - + raw_data_table = Table('raw_data', mapper_registry.metadata, Column('id', Integer, primary_key=True, autoincrement=True), @@ -40,22 +54,45 @@ raw_data_table = Table('raw_data', mapper_registry.metadata, Column('date_archived', DateTime), Column('archived_urls', JSON)) -mapper_registry.map_imperatively(ScraperResult, raw_data_table) - - @dataclass class Channel: + """Information about a specific channel to be scraped. + """ + + #: Name of channel (different from username because it can be non-unique and contain emojis), e.g. ``T🕊Редакция Президент Гордон🕊"``. 
name: str + + #: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``. platform_id: str + + #: User-specified category for the channel, e.g. ``"explicit_qanon"``. category: str + + #: Name of platform the given channel is on, e.g. ``"Telegram"``. platform: str + + #: URL for the given channel on the platform, e.g. ``"https://t.me/prezidentgordonteam"`` url: str + + #: Screen name/username of channel. screenname: str + + #: 2 digit country code for the country of origin for the channel, e.g. ``"RU"``. country: str = None + + #: Name of influencer, if channel belongs to an influencer that operates on multiple platforms. influencer: str = None + + #: Whether or not the channel is publicly-accessible. public: bool = None + + #: Whether or not the channel is a chat (i.e. allows users who are not the channel creator to post/message) chat: bool = None + + #: Any other additional notes about the channel. notes: str = "" + + #: Did the channel come from a researcher or a scraping process? source: str = None def hydrate(self): @@ -82,26 +119,52 @@ mapper_registry.map_imperatively(Channel, channel_table) @dataclass class Post: """An object with fields for columns in the analysis table""" + + #: ID number of the scraped post in the ``raw_data`` table raw_id: int + + #: Platform specific post ID platform_id: str + + #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``. scraper: str + + #: String specifying name and version of transformer used to transform result, e.g. ``"TwitterTransformer 0.0.1"``. transformer: str + + #: Name of platform from which result was scraped, e.g. ``"Twitter"``. platform: str + + #: User-specified integer that uniquely identifies a channel, e.g. ``15``. channel: int + + #: Datetime (relative to UTC) that the scraped post was created at. date: datetime + + #: Datetime (relative to UTC) that the scraped post was archived at. 
date_archived: datetime + + #: URL of the original post url: str + + #: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``. author_id: str + + #: Username of author who made post. author_username: str + + #: Text of the original post content: str + + #: The ID of the Channel that the post was forwarded or quoted from forwarded_from: int = None + + #: The ID of the Post that this Post is a reply to or reblog of reply_to: int = None def hydrate(self): pass - - post_table = Table('posts', mapper_registry.metadata, Column('id', Integer, primary_key=True, autoincrement=True), @@ -125,39 +188,64 @@ mapper_registry.map_imperatively(Post, post_table) @dataclass class Media: + """Base class for organizing information about a media file. + """ + + #: ID number of the media's corresponding scraped post in the ``raw_data`` table. raw_id: int + + #: ID number of the media's corresponding scraped post in the ``analysis`` table. post: int + + #: URL of the original post. url: str + + #: Original URL of the media from the original post. original_url: str + #: JSON dump of the dict containing metadata information for the media file. exif: str = None def get_blob(self): + """Download media file as bytes blob. + """ + blob = make_request(self.url) return blob.content def hydrate(self, blob = None): + """Download media file as bytes blob and extract data from content. + """ + if blob is None: blob = self.get_blob() self.hydrate_exif(blob) def hydrate_exif(self, blob): - f = open('tmp', 'wb') - f.write(blob) - f.close() + """Extract Exif metadata from bytes blob. 
+ """ - with exiftool.ExifTool() as et: - exif = et.get_metadata('tmp') - self.exif = json.dumps(exif) + with tempfile.NamedTemporaryFile() as temp_file: + temp_file.write(blob) - os.remove('tmp') + with exiftool.ExifTool() as et: + exif = et.get_metadata(temp_file.name) + self.exif = json.dumps(exif) @dataclass class Image(Media): + """Class for organizing information about an image file. + """ + + #: Extracted OCR content from image ocr: str = None def hydrate(self, blob=None): + """Download image file as bytes blob and extract Exif and OCR content + from the image. + """ + if blob is None: blob = self.get_blob() @@ -165,25 +253,62 @@ class Image(Media): self.hydrate_ocr(blob) def hydrate_ocr(self, blob): + """Extract OCR (optical character recognition) data from image bytes blob. + """ + image = PIL.Image.open(io.BytesIO(blob)) self.ocr = pytesseract.image_to_string(image) @dataclass class Video(Media): + """Class for organizing information about a video file. + """ + pass +mapper_registry = registry() + +raw_data_table = Table('raw_data', mapper_registry.metadata, + Column('id', Integer, primary_key=True, + autoincrement=True), + Column('scraper', String), + Column('platform', String), + Column('channel', Integer), + Column('platform_id', String), + Column('date', DateTime), + Column('raw_data', String), + Column('date_archived', DateTime), + Column('archived_urls', JSON)) + + +analysis_table = Table('analysis', mapper_registry.metadata, + Column('id', Integer, primary_key=True, + autoincrement=True), + Column('raw_id', Integer, ForeignKey('raw_data.id')), + Column('scraper', String), + Column('transformer', String), + Column('platform', String), + Column('channel', Integer), + Column('date', DateTime), + Column('date_archived', DateTime), + Column('url', String), + Column('author_id', String), + Column('author_username', String), + Column('content', String)) + media_table = Table('media', mapper_registry.metadata, Column('id', Integer, primary_key=True, 
autoincrement=True), - Column('type', String), + Column('type', String), Column('raw_id', Integer, ForeignKey('raw_data.id')), Column('post', Integer, ForeignKey('posts.id')), Column('url', String), Column('original_url', String), Column('exif', String), - Column('ocr', String) - ) + Column('ocr', String)) +mapper_registry.map_imperatively(TransformedResult, analysis_table) +mapper_registry.map_imperatively(ScraperResult, raw_data_table) mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media') mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image') mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video') \ No newline at end of file diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 4a3e57e..ddc5510 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -14,29 +14,91 @@ from cisticola.base import Channel, ScraperResult, mapper_registry from cisticola.utils import make_request class Scraper: + """Base class for defining platform-specific scrapers for scraping all posts + from a given channel on that specific platform. 
+ """ + __version__ = "Scraper 0.0.0" def __init__(self): - self.s3_client = boto3.client('s3', - region_name=os.environ['DO_SPACES_REGION'], - endpoint_url='https://{}.digitaloceanspaces.com'.format( - os.environ['DO_SPACES_REGION']), - aws_access_key_id=os.environ['DO_SPACES_KEY'], - aws_secret_access_key=os.environ['DO_SPACES_SECRET']) + # Initialize client to transfer files to the storage archive + self.s3_client = boto3.client( + service_name='s3', + region_name=os.environ['DO_SPACES_REGION'], + endpoint_url=f'https://{os.environ["DO_SPACES_REGION"]}.digitaloceanspaces.com', + aws_access_key_id=os.environ['DO_SPACES_KEY'], + aws_secret_access_key=os.environ['DO_SPACES_SECRET']) + + # Define request headers (necessary to bypass scraping protection + # for several platform scrapers) self.headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'} - pass - def __str__(self): return self.__version__ + def get_username_from_url(self, url: str) -> str: + """Extract a channel's username from its URL. + + Parameters + ---------- + url: str + URL of the channel on a given platform + e.g. ``"https://twitter.com/EliotHiggins"`` + + Returns + ------- + username: str + Extracted username of the channel. + e.g. ``"EliotHiggins"`` + """ + + raise NotImplementedError + def url_to_key(self, url: str, content_type: str) -> str: + """Generate a unique identifier for media from a specified post. + + Parameters + --------- + url: str + URL of original post. + e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"`` + content_type: str + Content-Type of media. + e.g. ``"image/jpeg"`` + + Returns + ------- + key: str + Unique identifier for the media file from a specified post based on + the original post URL and the media's Content-Type. 
+ """ + key = urlparse(url).path.split('/')[-1] return key def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: + """Download media file from a specified media file URL. + + Parameters + --------- + url: str + URL of media file from original post. + e.g. ``"https://pbs.twimg.com/media/FN0j0dYWUAcQxfK?format=png&name=medium"`` + key: str or None + Pre-defined unique identifier for the media file. + + Returns + ------- + blob: bytes + Raw bytes of the downloaded media file. + content_type: str + Content-Type of media. + e.g. ``"image/jpeg"``. + key: str + Unique identifier for the media file. + """ r = make_request(url, headers = self.headers) @@ -49,6 +111,27 @@ class Scraper: return blob, content_type, key def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: + """Download media file from a specified media URL, where the media file + is formatted as an m3u8 playlist, which is then decoded to an mp4 file. + + Parameters + --------- + url: str + URL of m3u8 playlist file from original post. + e.g. ``"https://media.gettr.com/group47/origin/2022/03/15/01/cbc436c1-1a1a-4b97-671d-c42109f3ec9b/out.m3u8"`` + key: str or None + Pre-defined unique identifier for the media file. + + Returns + ------- + blob: bytes + Raw bytes of the downloaded media file. + content_type: str + Content-Type of media. + e.g. ``"video/mp4"``. + key: str + Unique identifier for the media file. + """ content_type = 'video/mp4' ext = '.' + content_type.split('/')[-1] @@ -71,7 +154,28 @@ class Scraper: return blob, content_type, key def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: - + """Download media file from a specified media URL, using a fork of + youtube-dl that enables faster downloading. + + Parameters + --------- + url: str + URL of media file from original post. + e.g. ``"https://rumble.com/embed/vgt7gh/"`` + key: str or None + Pre-defined unique identifier for the media file. 
+ + Returns + ------- + blob: bytes + Raw bytes of the downloaded media file. + content_type: str + Content-Type of media. + e.g. ``"video/mp4"``. + key: str + Unique identifier for the media file. + """ + content_type = 'video/mp4' with tempfile.TemporaryDirectory() as temp_dir: @@ -103,6 +207,23 @@ class Scraper: return blob, content_type, key def archive_blob(self, blob: bytes, content_type: str, key: str) -> str: + """Upload raw bytes of a media file to the storage archive. + + Parameters + ---------- + blob: bytes + Raw bytes of the media file to be archived. + content_type: str + Content-Type of media. + e.g. ``"video/mp4"``. + key: str + Unique identifier for the media file. + + Returns + ------- + archived_url: str + URL specifying the file on the storage archive. + """ filename = self.__version__.replace(' ', '_') + '/' + key @@ -114,9 +235,42 @@ class Scraper: return archived_url def can_handle(self, channel: Channel) -> bool: + """Whether or not the scraper can scrape the specified channel. + + Parameters + ---------- + channel: Channel + Channel to be scraped. + + Returns + ------- + bool + ``True`` if the scraper is capable of scraping ``channel``, + ``False`` if not. + """ + raise NotImplementedError def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: + """Scrape all posts from the specified Channel. + + Parameters + ---------- + channel: Channel + Channel to be scraped. + since: ScraperResult or None + Most recently scraped ScraperResult from a previous scrape, or + ``None`` if scraper has not run before. + archive_media: bool + If ``True``, any media files (images, video, etc.) from posts are archived. + If ``False``, media files are not archived. + + Yields + ------ + ScraperResult + Scraper result from a single post/comment from the specified Channel. 
+ """ + raise NotImplementedError @@ -129,9 +283,13 @@ class ScraperController: self.session = None def register_scraper(self, scraper: Scraper): + """Register a single Scraper instance to the controller. + """ self.scrapers.append(scraper) def register_scrapers(self, scraper: List[Scraper]): + """Register a list of Scraper instances to the controller. + """ self.scrapers.extend(scraper) def scrape_all_channels(self, archive_media: bool = True): @@ -147,6 +305,17 @@ class ScraperController: @logger.catch(reraise = True) def scrape_channels(self, channels: List[Channel], archive_media: bool = True): + """Scrape all posts for all specified channels. + + Parameters + ---------- + channels: list + List of Channel instances to be scraped + archive_media: bool + If ``True``, any media files (images, video, etc.) from posts are archived. + If ``False``, media files are not archived. + """ + if self.session is None: logger.error("No DB session") return @@ -185,6 +354,9 @@ class ScraperController: logger.warning(f"No handler found for Channel {channel}") def connect_to_db(self, engine): + """Connect the specified SQLAlchemy engine to the controller. + """ + # create tables mapper_registry.metadata.create_all(bind=engine) @@ -193,8 +365,8 @@ class ScraperController: self.session.configure(bind=self.engine) def reset_db(self): + """Drop all data from the connected SQLAlchemy database. 
+ """ mapper_registry.metadata.drop_all(bind=self.engine) - self.connect_to_db(self.engine) - - + self.connect_to_db(self.engine) \ No newline at end of file diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index 8a365f4..b11d27a 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -17,7 +17,7 @@ class BitchuteScraper(Scraper): library""" __version__ = "BitchuteScraper 0.0.1" - def get_username_from_url(url): + def get_username_from_url(self, url): username = url.split('bitchute.com/channel/')[-1].strip('/') return username @@ -33,7 +33,7 @@ class BitchuteScraper(Scraper): detail = 'comments' - username = BitchuteScraper.get_username_from_url(channel.url) + username = self.get_username_from_url(channel.url) scraper = get_videos_user(session, username, csrftoken, detail) for post in scraper: @@ -61,7 +61,7 @@ class BitchuteScraper(Scraper): archived_urls=archived_urls) def can_handle(self, channel): - if channel.platform == "Bitchute" and BitchuteScraper.get_username_from_url(channel.url) is not None: + if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None: return True #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index 910ebc2..f90f2a3 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -11,14 +11,14 @@ class GabScraper(Scraper): """An implementation of a Scraper for Gab, using GARC library""" __version__ = "GabScraper 0.0.1" - def get_username_from_url(url): + def get_username_from_url(self, url): username = url.split('https://gab.com/')[-1] return username def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: client = Garc(profile = 'main') - username = GabScraper.get_username_from_url(channel.url) + username = self.get_username_from_url(channel.url) scraper = 
client.userposts(username) @@ -52,5 +52,5 @@ class GabScraper(Scraper): archived_urls=archived_urls) def can_handle(self, channel): - if channel.platform == "Gab" and GabScraper.get_username_from_url(channel.url) is not None: + if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None: return True \ No newline at end of file diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index 3cd069e..4fb15cc 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -12,7 +12,7 @@ class GettrScraper(Scraper): """An implementation of a Scraper for Gettr, using gogettr library""" __version__ = "GettrScraper 0.0.1" - def get_username_from_url(url): + def get_username_from_url(self, url): username = url.split("gettr.com/user/")[1] if len(username.split("/")) > 1: return None @@ -21,7 +21,7 @@ class GettrScraper(Scraper): def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: client = PublicClient() - username = GettrScraper.get_username_from_url(channel.url) + username = self.get_username_from_url(channel.url) scraper = client.user_activity(username=username, type="posts") for post in scraper: @@ -62,7 +62,7 @@ class GettrScraper(Scraper): archived_urls=archived_urls) def can_handle(self, channel): - if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None: + if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None: return True def url_to_key(self, url: str, content_type: str) -> str: diff --git a/cisticola/scraper/instagram.py b/cisticola/scraper/instagram.py index eb20ecb..f9ae76e 100644 --- a/cisticola/scraper/instagram.py +++ b/cisticola/scraper/instagram.py @@ -18,6 +18,7 @@ CONTENT_TYPES = { 'mp4' : 'video/mp4'} class InstagramScraper(Scraper): + """An implementation of a Scraper for Instagram, using instaloader library""" __version__ = "InstagramScraper 0.0.1" def 
get_username_from_url(self, url): diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index 61ed9ca..eb7ec04 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -13,7 +13,7 @@ class OdyseeScraper(Scraper): """An implementation of a Scraper for Odysee, using polyphemus library""" __version__ = "OdyseeScraper 0.0.1" - def get_username_from_url(url): + def get_username_from_url(self, url): username = url.split('odysee.com/')[-1].strip('@').split(':')[0] @@ -21,7 +21,7 @@ class OdyseeScraper(Scraper): def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: - username = OdyseeScraper.get_username_from_url(channel.url) + username = self.get_username_from_url(channel.url) odysee_channel = OdyseeChannel(channel_name = username) all_videos = odysee_channel.get_all_videos() @@ -70,7 +70,7 @@ class OdyseeScraper(Scraper): archived_urls={}) def can_handle(self, channel): - if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None: + if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None: return True def url_to_key(self, url: str, content_type: str) -> str: diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index 8546d6e..9863fb0 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -14,14 +14,14 @@ class RumbleScraper(Scraper): """An implementation of a Scraper for Rumble, using custom functions""" __version__ = "RumbleScraper 0.0.1" - def get_username_from_url(url): + def get_username_from_url(self, url): username = url.split('https://rumble.com/c/')[1] return username def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: - username = RumbleScraper.get_username_from_url(channel.url) + username = self.get_username_from_url(channel.url) scraper = 
get_channel_videos(username) for post in scraper: @@ -54,7 +54,7 @@ class RumbleScraper(Scraper): return key def can_handle(self, channel): - if channel.platform == "Rumble" and RumbleScraper.get_username_from_url(channel.url) is not None: + if channel.platform == "Rumble" and self.get_username_from_url(channel.url) is not None: return True #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index 3f3f45d..ec5b292 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -8,6 +8,7 @@ from cisticola.base import Channel, ScraperResult from cisticola.scraper.base import Scraper class TelegramSnscrapeScraper(Scraper): + """An implementation of a Scraper for Telegram, using snscrape library""" __version__ = "TelegramSnscrapeScraper 0.0.1" def can_handle(self, channel): diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py index 76d68f2..b8231bc 100644 --- a/cisticola/scraper/telegram_telethon.py +++ b/cisticola/scraper/telegram_telethon.py @@ -14,6 +14,7 @@ from cisticola.scraper.base import Scraper MEDIA_TYPES = ['photo', 'video', 'document', 'webpage'] class TelegramTelethonScraper(Scraper): + """An implementation of a Scraper for Telegram, using Telethon library""" __version__ = "TelegramTelethonScraper 0.0.1" def get_username_from_url(self, url): @@ -30,9 +31,9 @@ class TelegramTelethonScraper(Scraper): username = self.get_username_from_url(channel.url) - api_id = os.environ['TELEGRAM_API_ID_1'] - api_hash = os.environ['TELEGRAM_API_HASH_1'] - phone = os.environ['TELEGRAM_PHONE_1'] + api_id = os.environ['TELEGRAM_API_ID'] + api_hash = os.environ['TELEGRAM_API_HASH'] + phone = os.environ['TELEGRAM_PHONE'] with TelegramClient(phone, api_id, api_hash) as client: diff --git a/docs/Makefile b/docs/Makefile index d0c3cbf..ab3e9be 100644 --- a/docs/Makefile +++ 
b/docs/Makefile @@ -8,12 +8,24 @@ SPHINXBUILD ?= sphinx-build SOURCEDIR = source BUILDDIR = build +SPHINXAPIDOC = sphinx-apidoc +APIDOCFLAGS = --separate --private --module-first +MODULEPATH = ../cisticola +SOURCEFILES = cisticola.* +MODULEFILE = modules.rst + # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile +# Custom process and flags for generating Sphinx sources +apidoc: + rm $(SOURCEDIR)/$(SOURCEFILES) + $(SPHINXAPIDOC) $(APIDOCFLAGS) -o "$(SOURCEDIR)" "$(MODULEPATH)" + rm $(SOURCEDIR)/$(MODULEFILE) + # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile diff --git a/docs/images/cisticola_logo.svg b/docs/images/cisticola_logo.svg new file mode 100644 index 0000000..f570be8 --- /dev/null +++ b/docs/images/cisticola_logo.svg @@ -0,0 +1,64 @@ + + + + + + + + + + + + + diff --git a/docs/images/favicon.ico b/docs/images/favicon.ico new file mode 100644 index 0000000..75d9446 Binary files /dev/null and b/docs/images/favicon.ico differ diff --git a/docs/make.bat b/docs/make.bat index 6fcf05b..3ab2ef7 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -10,6 +10,12 @@ if "%SPHINXBUILD%" == "" ( set SOURCEDIR=source set BUILDDIR=build +set SPHINXAPIDOC=sphinx-apidoc +set APIDOCFLAGS=--separate --private --module-first +set MODULEPATH=../cisticola +set SOURCEFILES=cisticola.* +set MODULEFILE=modules.rst + if "%1" == "" goto help %SPHINXBUILD% >NUL 2>NUL @@ -28,6 +34,11 @@ if errorlevel 9009 ( %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% goto end +:apidoc + del %SOURCEDIR%\%SOURCEFILES% + %SPHINXAPIDOC% %APIDOCFLAGS% -o %SOURCEDIR% %MODULEPATH% + del %SOURCEDIR%\%MODULEFILE% + :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% diff --git a/docs/source/cisticola.base.rst b/docs/source/cisticola.base.rst new file mode 100644 index 
0000000..db91e8b --- /dev/null +++ b/docs/source/cisticola.base.rst @@ -0,0 +1,8 @@ +cisticola.base module +===================== + +.. automodule:: cisticola.base + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.rst b/docs/source/cisticola.rst index df2792c..22cdf67 100644 --- a/docs/source/cisticola.rst +++ b/docs/source/cisticola.rst @@ -1,6 +1,12 @@ cisticola package ================= +.. automodule:: cisticola + :members: + :undoc-members: + :show-inheritance: + :private-members: + Subpackages ----------- @@ -13,18 +19,8 @@ Subpackages Submodules ---------- -cisticola.base module ---------------------- +.. toctree:: + :maxdepth: 4 -.. automodule:: cisticola.base - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: cisticola - :members: - :undoc-members: - :show-inheritance: + cisticola.base + cisticola.utils diff --git a/docs/source/cisticola.scraper.base.rst b/docs/source/cisticola.scraper.base.rst new file mode 100644 index 0000000..1c6f6e2 --- /dev/null +++ b/docs/source/cisticola.scraper.base.rst @@ -0,0 +1,8 @@ +cisticola.scraper.base module +============================= + +.. automodule:: cisticola.scraper.base + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.scraper.bitchute.rst b/docs/source/cisticola.scraper.bitchute.rst new file mode 100644 index 0000000..dc44b13 --- /dev/null +++ b/docs/source/cisticola.scraper.bitchute.rst @@ -0,0 +1,8 @@ +cisticola.scraper.bitchute module +================================= + +.. 
automodule:: cisticola.scraper.bitchute + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.scraper.gab.rst b/docs/source/cisticola.scraper.gab.rst new file mode 100644 index 0000000..b0777c7 --- /dev/null +++ b/docs/source/cisticola.scraper.gab.rst @@ -0,0 +1,8 @@ +cisticola.scraper.gab module +============================ + +.. automodule:: cisticola.scraper.gab + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.scraper.gettr.rst b/docs/source/cisticola.scraper.gettr.rst new file mode 100644 index 0000000..3275e9a --- /dev/null +++ b/docs/source/cisticola.scraper.gettr.rst @@ -0,0 +1,8 @@ +cisticola.scraper.gettr module +============================== + +.. automodule:: cisticola.scraper.gettr + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.scraper.instagram.rst b/docs/source/cisticola.scraper.instagram.rst new file mode 100644 index 0000000..53ddc43 --- /dev/null +++ b/docs/source/cisticola.scraper.instagram.rst @@ -0,0 +1,8 @@ +cisticola.scraper.instagram module +================================== + +.. automodule:: cisticola.scraper.instagram + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.scraper.odysee.rst b/docs/source/cisticola.scraper.odysee.rst new file mode 100644 index 0000000..491b0db --- /dev/null +++ b/docs/source/cisticola.scraper.odysee.rst @@ -0,0 +1,8 @@ +cisticola.scraper.odysee module +=============================== + +.. 
automodule:: cisticola.scraper.odysee + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.scraper.rst b/docs/source/cisticola.scraper.rst index dcd8bca..b93592c 100644 --- a/docs/source/cisticola.scraper.rst +++ b/docs/source/cisticola.scraper.rst @@ -1,37 +1,27 @@ cisticola.scraper package ========================= -Submodules ----------- - -cisticola.scraper.bitchute module ---------------------------------- - -.. automodule:: cisticola.scraper.bitchute - :members: - :undoc-members: - :show-inheritance: - -cisticola.scraper.gettr module ------------------------------- - -.. automodule:: cisticola.scraper.gettr - :members: - :undoc-members: - :show-inheritance: - -cisticola.scraper.twitter module --------------------------------- - -.. automodule:: cisticola.scraper.twitter - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - .. automodule:: cisticola.scraper :members: :undoc-members: :show-inheritance: + :private-members: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + cisticola.scraper.base + cisticola.scraper.bitchute + cisticola.scraper.gab + cisticola.scraper.gettr + cisticola.scraper.instagram + cisticola.scraper.odysee + cisticola.scraper.rumble + cisticola.scraper.telegram_snscrape + cisticola.scraper.telegram_telethon + cisticola.scraper.twitter + cisticola.scraper.vkontakte + cisticola.scraper.youtube diff --git a/docs/source/cisticola.scraper.rumble.rst b/docs/source/cisticola.scraper.rumble.rst new file mode 100644 index 0000000..726c493 --- /dev/null +++ b/docs/source/cisticola.scraper.rumble.rst @@ -0,0 +1,8 @@ +cisticola.scraper.rumble module +=============================== + +.. 
automodule:: cisticola.scraper.rumble + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.scraper.telegram_snscrape.rst b/docs/source/cisticola.scraper.telegram_snscrape.rst new file mode 100644 index 0000000..ffc9a7c --- /dev/null +++ b/docs/source/cisticola.scraper.telegram_snscrape.rst @@ -0,0 +1,8 @@ +cisticola.scraper.telegram\_snscrape module +=========================================== + +.. automodule:: cisticola.scraper.telegram_snscrape + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.scraper.telegram_telethon.rst b/docs/source/cisticola.scraper.telegram_telethon.rst new file mode 100644 index 0000000..a41db65 --- /dev/null +++ b/docs/source/cisticola.scraper.telegram_telethon.rst @@ -0,0 +1,8 @@ +cisticola.scraper.telegram\_telethon module +=========================================== + +.. automodule:: cisticola.scraper.telegram_telethon + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.scraper.twitter.rst b/docs/source/cisticola.scraper.twitter.rst new file mode 100644 index 0000000..9e557aa --- /dev/null +++ b/docs/source/cisticola.scraper.twitter.rst @@ -0,0 +1,8 @@ +cisticola.scraper.twitter module +================================ + +.. automodule:: cisticola.scraper.twitter + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.scraper.vkontakte.rst b/docs/source/cisticola.scraper.vkontakte.rst new file mode 100644 index 0000000..405d70d --- /dev/null +++ b/docs/source/cisticola.scraper.vkontakte.rst @@ -0,0 +1,8 @@ +cisticola.scraper.vkontakte module +================================== + +.. 
automodule:: cisticola.scraper.vkontakte + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.scraper.youtube.rst b/docs/source/cisticola.scraper.youtube.rst new file mode 100644 index 0000000..e990195 --- /dev/null +++ b/docs/source/cisticola.scraper.youtube.rst @@ -0,0 +1,8 @@ +cisticola.scraper.youtube module +================================ + +.. automodule:: cisticola.scraper.youtube + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.transformer.base.rst b/docs/source/cisticola.transformer.base.rst new file mode 100644 index 0000000..0f57e13 --- /dev/null +++ b/docs/source/cisticola.transformer.base.rst @@ -0,0 +1,8 @@ +cisticola.transformer.base module +================================= + +.. automodule:: cisticola.transformer.base + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.transformer.bitchute.rst b/docs/source/cisticola.transformer.bitchute.rst new file mode 100644 index 0000000..7427e9f --- /dev/null +++ b/docs/source/cisticola.transformer.bitchute.rst @@ -0,0 +1,8 @@ +cisticola.transformer.bitchute module +===================================== + +.. automodule:: cisticola.transformer.bitchute + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.transformer.rst b/docs/source/cisticola.transformer.rst index a2eb71e..358d955 100644 --- a/docs/source/cisticola.transformer.rst +++ b/docs/source/cisticola.transformer.rst @@ -1,21 +1,18 @@ cisticola.transformer package ============================= -Submodules ----------- - -cisticola.transformer.twitter module ------------------------------------- - -.. automodule:: cisticola.transformer.twitter - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - .. 
automodule:: cisticola.transformer :members: :undoc-members: :show-inheritance: + :private-members: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + cisticola.transformer.base + cisticola.transformer.bitchute + cisticola.transformer.twitter diff --git a/docs/source/cisticola.transformer.twitter.rst b/docs/source/cisticola.transformer.twitter.rst new file mode 100644 index 0000000..05f29fa --- /dev/null +++ b/docs/source/cisticola.transformer.twitter.rst @@ -0,0 +1,8 @@ +cisticola.transformer.twitter module +==================================== + +.. automodule:: cisticola.transformer.twitter + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.utils.rst b/docs/source/cisticola.utils.rst new file mode 100644 index 0000000..6e5872a --- /dev/null +++ b/docs/source/cisticola.utils.rst @@ -0,0 +1,8 @@ +cisticola.utils module +====================== + +.. automodule:: cisticola.utils + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/conf.py b/docs/source/conf.py index 4af6aa1..c291fb8 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -43,9 +43,18 @@ exclude_patterns = [] # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'alabaster' +html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". 
html_static_path = [] + +# -- Default flags for autodoc------------------------------------------------ + +autodoc_default_options = {'exclude-members': '_sa_class_manager'} + +html_favicon = '../images/favicon.ico' +html_logo = '../images/cisticola_logo.svg' + +html_theme_options = {'style_nav_header_background': '#000000'} \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index 67fd022..3c12d81 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -2,16 +2,7 @@ Welcome to Cisticola's documentation! ===================================== .. toctree:: - :maxdepth: 2 - :caption: Contents: + :maxdepth: 1 - modules - - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` + quickstart + cisticola \ No newline at end of file diff --git a/docs/source/modules.rst b/docs/source/modules.rst deleted file mode 100644 index 9af7d5f..0000000 --- a/docs/source/modules.rst +++ /dev/null @@ -1,7 +0,0 @@ -cisticola -========= - -.. toctree:: - :maxdepth: 4 - - cisticola diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst new file mode 100644 index 0000000..a6c5643 --- /dev/null +++ b/docs/source/quickstart.rst @@ -0,0 +1,96 @@ +Quickstart +========== + +Installation +------------ + +The *cisticola* application uses pipenv_ for dependency management. To install the dependencies of *cisticola*, first install pipenv using the following command: + +.. code-block:: + + pip install pipenv + +and then install the dependencies using the following command from the package root directory: + +.. code-block:: + + pipenv install + +To install the necessary dependencies for building the documentation and running unit tests, run the following command from the package root directory: + +.. 
code-block:: + + pipenv install --dev + +Environment Variables +--------------------- + +Three of the scrapers in *cisticola* (:py:class:`~cisticola.scraper.gab.GabScraper`, :py:class:`~cisticola.scraper.instagram.InstagramScraper`, and :py:class:`~cisticola.scraper.telegram_telethon.TelegramTelethonScraper`) require platform credentials to work correctly. + +Gab +""" + +The Gab credentials can be configured by running the following command from the root directory: + +.. code-block:: + + pipenv run garc configure + +which will direct you to provide the username and password for your Gab account. + +Instagram +""""""""" + +The Instagram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment: + +- ``INSTAGRAM_USERNAME``: username of your Instagram account +- ``INSTAGRAM_PASSWORD``: password of your Instagram account + +Telegram Telethon +""""""""""""""""" + +The Telegram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment: + +- ``TELEGRAM_API_ID``: API ID number for your Telegram application +- ``TELEGRAM_API_HASH``: API hash for your Telegram application +- ``TELEGRAM_PHONE``: phone number for the account corresponding to your Telegram application + +If you do not already have a Telegram application, you can create one by following the instructions on `this page`_. + +Documentation +------------- + +The *cisticola* application uses Sphinx_ to generate and display its documentation. To build the documentation in the HTML format, run the following command from the ``docs/`` directory: + +.. code-block:: + + pipenv run make html + +For developers, if changes are made to the package structure or additional modules are created, you can update the Sphinx source ``*.rst`` files by running the following command from the ``docs/`` directory: + +..
code-block:: + + pipenv run make apidoc + +Testing +------- + +The *cisticola* application uses pytest_ for unit testing. To run the test suite, run the following command from the package root directory: + +.. code-block:: + + pipenv run pytest + +Examples +-------- + +An example of a *cisticola* ingest file ``russian_telegram_ingest.py`` is included in the package root directory, showing how the list of channels to scrape is defined, and how the :py:class:`~cisticola.scraper.base.ScraperController` and :py:class:`~cisticola.transformer.base.Transformer` classes are used. To run the ingest script, run the following command from the package root directory: + +.. code-block:: + + pipenv run python russian_telegram_ingest.py + +.. _pipenv: https://pipenv.pypa.io/en/latest/ +.. _Sphinx: https://www.sphinx-doc.org/en/master/ +.. _pytest: https://docs.pytest.org/en/7.1.x/ +.. _this page: https://core.telegram.org/api/obtaining_api_id \ No newline at end of file diff --git a/pytest.ini b/pytest.ini index 09a94e1..f3545f6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,6 +1,6 @@ [pytest] minversion = - 6.0.2 + 7.0.0 testpaths = tests/ python_files = @@ -13,4 +13,5 @@ addopts = --self-contained-html filterwarnings = ignore:the imp module is deprecated:DeprecationWarning - ignore:The localize method is no longer necessary, as this time zone supports the fold attribute \ No newline at end of file + ignore:The localize method is no longer necessary, as this time zone supports the fold attribute + ignore:invalid escape sequence:DeprecationWarning \ No newline at end of file