From e4cf9daf73265fd36ccc243bae8f25432e5bb58f Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Mon, 14 Mar 2022 18:04:27 -0500 Subject: [PATCH 1/4] added docstrings, improved Sphinx docs --- Pipfile | 3 +- Pipfile.lock | 433 ++++++++++-------- cisticola/base.py | 139 ++++-- cisticola/scraper/base.py | 153 ++++++- docs/images/cisticola_logo.svg | 64 +++ docs/images/favicon.ico | Bin 0 -> 614 bytes docs/source/cisticola.base.rst | 8 + docs/source/cisticola.rst | 23 +- docs/source/cisticola.scraper.base.rst | 8 + docs/source/cisticola.scraper.bitchute.rst | 8 + docs/source/cisticola.scraper.gab.rst | 8 + docs/source/cisticola.scraper.gettr.rst | 8 + docs/source/cisticola.scraper.odysee.rst | 8 + docs/source/cisticola.scraper.rst | 48 +- docs/source/cisticola.scraper.rumble.rst | 8 + .../cisticola.scraper.telegram_snscrape.rst | 8 + .../cisticola.scraper.telegram_telethon.rst | 8 + docs/source/cisticola.scraper.twitter.rst | 8 + docs/source/cisticola.scraper.utils.rst | 8 + docs/source/cisticola.transformer.base.rst | 8 + docs/source/cisticola.transformer.rst | 24 +- docs/source/cisticola.transformer.twitter.rst | 8 + docs/source/conf.py | 11 +- docs/source/index.rst | 2 +- docs/source/modules.rst | 7 - 25 files changed, 700 insertions(+), 311 deletions(-) create mode 100644 docs/images/cisticola_logo.svg create mode 100644 docs/images/favicon.ico create mode 100644 docs/source/cisticola.base.rst create mode 100644 docs/source/cisticola.scraper.base.rst create mode 100644 docs/source/cisticola.scraper.bitchute.rst create mode 100644 docs/source/cisticola.scraper.gab.rst create mode 100644 docs/source/cisticola.scraper.gettr.rst create mode 100644 docs/source/cisticola.scraper.odysee.rst create mode 100644 docs/source/cisticola.scraper.rumble.rst create mode 100644 docs/source/cisticola.scraper.telegram_snscrape.rst create mode 100644 docs/source/cisticola.scraper.telegram_telethon.rst create mode 100644 docs/source/cisticola.scraper.twitter.rst create mode 100644 
docs/source/cisticola.scraper.utils.rst create mode 100644 docs/source/cisticola.transformer.base.rst create mode 100644 docs/source/cisticola.transformer.twitter.rst delete mode 100644 docs/source/modules.rst diff --git a/Pipfile b/Pipfile index 62f2c74..328faea 100644 --- a/Pipfile +++ b/Pipfile @@ -10,7 +10,6 @@ gogettr = "*" requests = "*" bs4 = "*" dateparser = "*" -sphinx = "*" boto3 = "*" snscrape = {git = "https://github.com/bellingcat/snscrape.git"} ffmpeg-python = "*" @@ -24,6 +23,8 @@ pytest = "*" pytest-cov = "*" pytest-html = "*" pytest-metadata = "*" +sphinx = "*" +sphinx_rtd_theme = "*" [requires] python_version = "3.9" diff --git a/Pipfile.lock b/Pipfile.lock index 0ca0eda..f75c07f 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "3d293e1f3802d64ae7a8fbfc4c1d742cc33cd4c520a6263f93e566f89faa7013" + "sha256": "495ba305ca55a0ac5754037ba133518b47324965dd3ab0b8db8b69206524d68e" }, "pipfile-spec": 6, "requires": { @@ -16,13 +16,6 @@ ] }, "default": { - "alabaster": { - "hashes": [ - "sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359", - "sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02" - ], - "version": "==0.7.12" - }, "attrs": { "hashes": [ "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4", @@ -31,14 +24,6 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==21.4.0" }, - "babel": { - "hashes": [ - "sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9", - "sha256:bc0c176f9f6a994582230df350aa6e05ba2ebe4b3ac317eab29d9be5d2768da0" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==2.9.1" - }, "beautifulsoup4": { "hashes": [ "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf", @@ -49,19 +34,19 @@ }, "boto3": { "hashes": [ - 
"sha256:30394729b38d5ce2f845440428a55161c6d45478044e553a12ca1acf56d7278a", - "sha256:895489900eb882777124c3b64a13df49785cf77f7bd1504e783464fb3b4c8163" + "sha256:8d6f3c548f0ee03d742f404c96515e7579fc6968135aaa50dd855a046698ff79", + "sha256:d857feb6af9932e1ee3a748060a2cd9fd6043dbbccf66976eda54586597efdc0" ], "index": "pypi", - "version": "==1.21.15" + "version": "==1.21.18" }, "botocore": { "hashes": [ - "sha256:405082f92a9e524e1aee96cbc90134668026d7da3c12f86990c91a12620ca28b", - "sha256:fa4816e94e72111a9341204061e760bcbde74ca5d900d3f2206c2c2e8e4b56e4" + "sha256:7ea8ef1ff7c4882ab59b337662f90ddf5ea860e95e7e209dca593a34ea585b1b", + "sha256:d2da7ccbc5ddd61fe3cd45fcbd3de380d9e3a15bfa8fbfd2d9259a93dcc60c56" ], "markers": "python_version >= '3.6'", - "version": "==1.24.15" + "version": "==1.24.18" }, "bs4": { "hashes": [ @@ -101,14 +86,6 @@ "index": "pypi", "version": "==1.1.0" }, - "docutils": { - "hashes": [ - "sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125", - "sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==0.17.1" - }, "ffmpeg-python": { "hashes": [ "sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127", @@ -216,22 +193,6 @@ "markers": "python_version >= '3'", "version": "==3.3" }, - "imagesize": { - "hashes": [ - "sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c", - "sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==1.3.0" - }, - "importlib-metadata": { - "hashes": [ - "sha256:b36ffa925fe3139b2f6ff11d6925ffd4fa7bc47870165e3ac260ac7b4f91e6ac", - "sha256:d16e8c1deb60de41b8e8ed21c1a7b947b0bc62fab7e1d470bcdf331cea2e6735" - ], - "markers": "python_version < '3.10'", - "version": "==4.11.2" - }, "iniconfig": { "hashes": [ 
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", @@ -239,14 +200,6 @@ ], "version": "==1.1.1" }, - "jinja2": { - "hashes": [ - "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8", - "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7" - ], - "markers": "python_version >= '3.6'", - "version": "==3.0.3" - }, "jmespath": { "hashes": [ "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9", @@ -330,52 +283,6 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==4.8.0" }, - "markupsafe": { - "hashes": [ - "sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3", - "sha256:09c86c9643cceb1d87ca08cdc30160d1b7ab49a8a21564868921959bd16441b8", - "sha256:142119fb14a1ef6d758912b25c4e803c3ff66920635c44078666fe7cc3f8f759", - "sha256:1d1fb9b2eec3c9714dd936860850300b51dbaa37404209c8d4cb66547884b7ed", - "sha256:204730fd5fe2fe3b1e9ccadb2bd18ba8712b111dcabce185af0b3b5285a7c989", - "sha256:24c3be29abb6b34052fd26fc7a8e0a49b1ee9d282e3665e8ad09a0a68faee5b3", - "sha256:290b02bab3c9e216da57c1d11d2ba73a9f73a614bbdcc027d299a60cdfabb11a", - "sha256:3028252424c72b2602a323f70fbf50aa80a5d3aa616ea6add4ba21ae9cc9da4c", - "sha256:30c653fde75a6e5eb814d2a0a89378f83d1d3f502ab710904ee585c38888816c", - "sha256:3cace1837bc84e63b3fd2dfce37f08f8c18aeb81ef5cf6bb9b51f625cb4e6cd8", - "sha256:4056f752015dfa9828dce3140dbadd543b555afb3252507348c493def166d454", - "sha256:454ffc1cbb75227d15667c09f164a0099159da0c1f3d2636aa648f12675491ad", - "sha256:598b65d74615c021423bd45c2bc5e9b59539c875a9bdb7e5f2a6b92dfcfc268d", - "sha256:599941da468f2cf22bf90a84f6e2a65524e87be2fce844f96f2dd9a6c9d1e635", - "sha256:5ddea4c352a488b5e1069069f2f501006b1a4362cb906bee9a193ef1245a7a61", - "sha256:62c0285e91414f5c8f621a17b69fc0088394ccdaa961ef469e833dbff64bd5ea", - "sha256:679cbb78914ab212c49c67ba2c7396dc599a8479de51b9a87b174700abd9ea49", - 
"sha256:6e104c0c2b4cd765b4e83909cde7ec61a1e313f8a75775897db321450e928cce", - "sha256:736895a020e31b428b3382a7887bfea96102c529530299f426bf2e636aacec9e", - "sha256:75bb36f134883fdbe13d8e63b8675f5f12b80bb6627f7714c7d6c5becf22719f", - "sha256:7d2f5d97fcbd004c03df8d8fe2b973fe2b14e7bfeb2cfa012eaa8759ce9a762f", - "sha256:80beaf63ddfbc64a0452b841d8036ca0611e049650e20afcb882f5d3c266d65f", - "sha256:84ad5e29bf8bab3ad70fd707d3c05524862bddc54dc040982b0dbcff36481de7", - "sha256:8da5924cb1f9064589767b0f3fc39d03e3d0fb5aa29e0cb21d43106519bd624a", - "sha256:961eb86e5be7d0973789f30ebcf6caab60b844203f4396ece27310295a6082c7", - "sha256:96de1932237abe0a13ba68b63e94113678c379dca45afa040a17b6e1ad7ed076", - "sha256:a0a0abef2ca47b33fb615b491ce31b055ef2430de52c5b3fb19a4042dbc5cadb", - "sha256:b2a5a856019d2833c56a3dcac1b80fe795c95f401818ea963594b345929dffa7", - "sha256:b8811d48078d1cf2a6863dafb896e68406c5f513048451cd2ded0473133473c7", - "sha256:c532d5ab79be0199fa2658e24a02fce8542df196e60665dd322409a03db6a52c", - "sha256:d3b64c65328cb4cd252c94f83e66e3d7acf8891e60ebf588d7b493a55a1dbf26", - "sha256:d4e702eea4a2903441f2735799d217f4ac1b55f7d8ad96ab7d4e25417cb0827c", - "sha256:d5653619b3eb5cbd35bfba3c12d575db2a74d15e0e1c08bf1db788069d410ce8", - "sha256:d66624f04de4af8bbf1c7f21cc06649c1c69a7f84109179add573ce35e46d448", - "sha256:e67ec74fada3841b8c5f4c4f197bea916025cb9aa3fe5abf7d52b655d042f956", - "sha256:e6f7f3f41faffaea6596da86ecc2389672fa949bd035251eab26dc6697451d05", - "sha256:f02cf7221d5cd915d7fa58ab64f7ee6dd0f6cddbb48683debf5d04ae9b1c2cc1", - "sha256:f0eddfcabd6936558ec020130f932d479930581171368fd728efcfb6ef0dd357", - "sha256:fabbe18087c3d33c5824cb145ffca52eccd053061df1d79d4b66dafa5ad2a5ea", - "sha256:fc3150f85e2dbcf99e65238c842d1cfe69d3e7649b19864c1cc043213d9cd730" - ], - "markers": "python_version >= '3.7'", - "version": "==2.1.0" - }, "numpy": { "hashes": [ "sha256:07a8c89a04997625236c5ecb7afe35a02af3896c8aa01890a849913a2309c676", @@ -395,6 +302,7 @@ 
"sha256:dbc7601a3b7472d559dc7b933b18b4b66f9aa7452c120e87dfb33d02008c8a18", "sha256:e7927a589df200c5e23c57970bafbd0cd322459aa7b1ff73b7c2e84d6e3eae62", "sha256:f8c1f39caad2c896bc0018f699882b345b2a63708008be29b1f355ebf6f933fe", + "sha256:f950f8845b480cffe522913d35567e29dd381b0dc7e4ce6a4a9f9156417d2430", "sha256:fade0d4f4d292b6f39951b6836d7a3c7ef5b2347f3c420cd9820a1d90d794802", "sha256:fdf3c08bce27132395d3c3ba1503cac12e17282358cb4bddc25cc46b0aca07aa" ], @@ -480,14 +388,6 @@ ], "version": "==0.4.8" }, - "pygments": { - "hashes": [ - "sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65", - "sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a" - ], - "markers": "python_version >= '3.5'", - "version": "==2.11.2" - }, "pyparsing": { "hashes": [ "sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea", @@ -506,11 +406,11 @@ }, "pytest": { "hashes": [ - "sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db", - "sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171" + "sha256:b555252a95bbb2a37a97b5ac2eb050c436f7989993565f5e0c9128fcaacadd0e", + "sha256:f1089d218cfcc63a212c42896f1b7fbf096874d045e1988186861a1a87d27b47" ], - "markers": "python_version >= '3.6'", - "version": "==7.0.1" + "markers": "python_version >= '3.7'", + "version": "==7.1.0" }, "python-dateutil": { "hashes": [ @@ -628,7 +528,7 @@ "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb" ], - "markers": "python_version >= '3.6' and python_version < '4.0'", + "markers": "python_version >= '3.6' and python_version < '4'", "version": "==4.8" }, "s3transfer": { @@ -647,13 +547,6 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.16.0" }, - "snowballstemmer": { - "hashes": [ - "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1", - 
"sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a" - ], - "version": "==2.2.0" - }, "snscrape": { "git": "https://github.com/bellingcat/snscrape.git", "ref": "de4ebed81f3f6a4bb4c65630daab6ec63784959b" @@ -666,62 +559,6 @@ "markers": "python_version >= '3.6'", "version": "==2.3.1" }, - "sphinx": { - "hashes": [ - "sha256:5da895959511473857b6d0200f56865ed62c31e8f82dd338063b84ec022701fe", - "sha256:6caad9786055cb1fa22b4a365c1775816b876f91966481765d7d50e9f0dd35cc" - ], - "index": "pypi", - "version": "==4.4.0" - }, - "sphinxcontrib-applehelp": { - "hashes": [ - "sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a", - "sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58" - ], - "markers": "python_version >= '3.5'", - "version": "==1.0.2" - }, - "sphinxcontrib-devhelp": { - "hashes": [ - "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e", - "sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4" - ], - "markers": "python_version >= '3.5'", - "version": "==1.0.2" - }, - "sphinxcontrib-htmlhelp": { - "hashes": [ - "sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07", - "sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2" - ], - "markers": "python_version >= '3.6'", - "version": "==2.0.0" - }, - "sphinxcontrib-jsmath": { - "hashes": [ - "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", - "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8" - ], - "markers": "python_version >= '3.5'", - "version": "==1.0.1" - }, - "sphinxcontrib-qthelp": { - "hashes": [ - "sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72", - "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6" - ], - "markers": "python_version >= '3.5'", - "version": "==1.0.3" - }, - "sphinxcontrib-serializinghtml": { - "hashes": [ - 
"sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd", - "sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952" - ], - "markers": "python_version >= '3.5'", - "version": "==1.1.5" - }, "sqlalchemy": { "hashes": [ "sha256:04164e0063feb7aedd9d073db0fd496edb244be40d46ea1f0d8990815e4b8c34", @@ -800,7 +637,7 @@ "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", "version": "==1.26.8" }, "youtube-dl": { @@ -810,17 +647,16 @@ ], "index": "pypi", "version": "==2021.12.17" - }, - "zipp": { - "hashes": [ - "sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d", - "sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375" - ], - "markers": "python_version >= '3.7'", - "version": "==3.7.0" } }, "develop": { + "alabaster": { + "hashes": [ + "sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359", + "sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02" + ], + "version": "==0.7.12" + }, "attrs": { "hashes": [ "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4", @@ -829,6 +665,29 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==21.4.0" }, + "babel": { + "hashes": [ + "sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9", + "sha256:bc0c176f9f6a994582230df350aa6e05ba2ebe4b3ac317eab29d9be5d2768da0" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.9.1" + }, + "certifi": { + "hashes": [ + "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872", + 
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569" + ], + "version": "==2021.10.8" + }, + "charset-normalizer": { + "hashes": [ + "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597", + "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df" + ], + "markers": "python_version >= '3'", + "version": "==2.0.12" + }, "coverage": { "extras": [ "toml" @@ -879,6 +738,38 @@ "markers": "python_version >= '3.7'", "version": "==6.3.2" }, + "docutils": { + "hashes": [ + "sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125", + "sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==0.17.1" + }, + "idna": { + "hashes": [ + "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", + "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" + ], + "markers": "python_version >= '3'", + "version": "==3.3" + }, + "imagesize": { + "hashes": [ + "sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c", + "sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.3.0" + }, + "importlib-metadata": { + "hashes": [ + "sha256:1208431ca90a8cca1a6b8af391bb53c1a2db74e5d1cef6ddced95d4b2062edc6", + "sha256:ea4c597ebf37142f827b8f39299579e31685c31d3a438b59f469406afd0f2539" + ], + "markers": "python_version < '3.10'", + "version": "==4.11.3" + }, "iniconfig": { "hashes": [ "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", @@ -886,6 +777,60 @@ ], "version": "==1.1.1" }, + "jinja2": { + "hashes": [ + "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8", + "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7" + ], + "markers": "python_version >= '3.6'", + 
"version": "==3.0.3" + }, + "markupsafe": { + "hashes": [ + "sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3", + "sha256:09c86c9643cceb1d87ca08cdc30160d1b7ab49a8a21564868921959bd16441b8", + "sha256:142119fb14a1ef6d758912b25c4e803c3ff66920635c44078666fe7cc3f8f759", + "sha256:1d1fb9b2eec3c9714dd936860850300b51dbaa37404209c8d4cb66547884b7ed", + "sha256:204730fd5fe2fe3b1e9ccadb2bd18ba8712b111dcabce185af0b3b5285a7c989", + "sha256:24c3be29abb6b34052fd26fc7a8e0a49b1ee9d282e3665e8ad09a0a68faee5b3", + "sha256:290b02bab3c9e216da57c1d11d2ba73a9f73a614bbdcc027d299a60cdfabb11a", + "sha256:3028252424c72b2602a323f70fbf50aa80a5d3aa616ea6add4ba21ae9cc9da4c", + "sha256:30c653fde75a6e5eb814d2a0a89378f83d1d3f502ab710904ee585c38888816c", + "sha256:3cace1837bc84e63b3fd2dfce37f08f8c18aeb81ef5cf6bb9b51f625cb4e6cd8", + "sha256:4056f752015dfa9828dce3140dbadd543b555afb3252507348c493def166d454", + "sha256:454ffc1cbb75227d15667c09f164a0099159da0c1f3d2636aa648f12675491ad", + "sha256:598b65d74615c021423bd45c2bc5e9b59539c875a9bdb7e5f2a6b92dfcfc268d", + "sha256:599941da468f2cf22bf90a84f6e2a65524e87be2fce844f96f2dd9a6c9d1e635", + "sha256:5ddea4c352a488b5e1069069f2f501006b1a4362cb906bee9a193ef1245a7a61", + "sha256:62c0285e91414f5c8f621a17b69fc0088394ccdaa961ef469e833dbff64bd5ea", + "sha256:679cbb78914ab212c49c67ba2c7396dc599a8479de51b9a87b174700abd9ea49", + "sha256:6e104c0c2b4cd765b4e83909cde7ec61a1e313f8a75775897db321450e928cce", + "sha256:736895a020e31b428b3382a7887bfea96102c529530299f426bf2e636aacec9e", + "sha256:75bb36f134883fdbe13d8e63b8675f5f12b80bb6627f7714c7d6c5becf22719f", + "sha256:7d2f5d97fcbd004c03df8d8fe2b973fe2b14e7bfeb2cfa012eaa8759ce9a762f", + "sha256:80beaf63ddfbc64a0452b841d8036ca0611e049650e20afcb882f5d3c266d65f", + "sha256:84ad5e29bf8bab3ad70fd707d3c05524862bddc54dc040982b0dbcff36481de7", + "sha256:8da5924cb1f9064589767b0f3fc39d03e3d0fb5aa29e0cb21d43106519bd624a", + "sha256:961eb86e5be7d0973789f30ebcf6caab60b844203f4396ece27310295a6082c7", + 
"sha256:96de1932237abe0a13ba68b63e94113678c379dca45afa040a17b6e1ad7ed076", + "sha256:a0a0abef2ca47b33fb615b491ce31b055ef2430de52c5b3fb19a4042dbc5cadb", + "sha256:b2a5a856019d2833c56a3dcac1b80fe795c95f401818ea963594b345929dffa7", + "sha256:b8811d48078d1cf2a6863dafb896e68406c5f513048451cd2ded0473133473c7", + "sha256:c532d5ab79be0199fa2658e24a02fce8542df196e60665dd322409a03db6a52c", + "sha256:d3b64c65328cb4cd252c94f83e66e3d7acf8891e60ebf588d7b493a55a1dbf26", + "sha256:d4e702eea4a2903441f2735799d217f4ac1b55f7d8ad96ab7d4e25417cb0827c", + "sha256:d5653619b3eb5cbd35bfba3c12d575db2a74d15e0e1c08bf1db788069d410ce8", + "sha256:d66624f04de4af8bbf1c7f21cc06649c1c69a7f84109179add573ce35e46d448", + "sha256:e67ec74fada3841b8c5f4c4f197bea916025cb9aa3fe5abf7d52b655d042f956", + "sha256:e6f7f3f41faffaea6596da86ecc2389672fa949bd035251eab26dc6697451d05", + "sha256:f02cf7221d5cd915d7fa58ab64f7ee6dd0f6cddbb48683debf5d04ae9b1c2cc1", + "sha256:f0eddfcabd6936558ec020130f932d479930581171368fd728efcfb6ef0dd357", + "sha256:fabbe18087c3d33c5824cb145ffca52eccd053061df1d79d4b66dafa5ad2a5ea", + "sha256:fc3150f85e2dbcf99e65238c842d1cfe69d3e7649b19864c1cc043213d9cd730" + ], + "markers": "python_version >= '3.7'", + "version": "==2.1.0" + }, "packaging": { "hashes": [ "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", @@ -910,6 +855,14 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==1.11.0" }, + "pygments": { + "hashes": [ + "sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65", + "sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a" + ], + "markers": "python_version >= '3.5'", + "version": "==2.11.2" + }, "pyparsing": { "hashes": [ "sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea", @@ -920,11 +873,11 @@ }, "pytest": { "hashes": [ - "sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db", - 
"sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171" + "sha256:b555252a95bbb2a37a97b5ac2eb050c436f7989993565f5e0c9128fcaacadd0e", + "sha256:f1089d218cfcc63a212c42896f1b7fbf096874d045e1988186861a1a87d27b47" ], - "markers": "python_version >= '3.6'", - "version": "==7.0.1" + "markers": "python_version >= '3.7'", + "version": "==7.1.0" }, "pytest-cov": { "hashes": [ @@ -950,6 +903,92 @@ "index": "pypi", "version": "==1.11.0" }, + "pytz": { + "hashes": [ + "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c", + "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326" + ], + "version": "==2021.3" + }, + "requests": { + "hashes": [ + "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", + "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" + ], + "index": "pypi", + "version": "==2.27.1" + }, + "snowballstemmer": { + "hashes": [ + "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1", + "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a" + ], + "version": "==2.2.0" + }, + "sphinx": { + "hashes": [ + "sha256:5da895959511473857b6d0200f56865ed62c31e8f82dd338063b84ec022701fe", + "sha256:6caad9786055cb1fa22b4a365c1775816b876f91966481765d7d50e9f0dd35cc" + ], + "index": "pypi", + "version": "==4.4.0" + }, + "sphinx-rtd-theme": { + "hashes": [ + "sha256:4d35a56f4508cfee4c4fb604373ede6feae2a306731d533f409ef5c3496fdbd8", + "sha256:eec6d497e4c2195fa0e8b2016b337532b8a699a68bcb22a512870e16925c6a5c" + ], + "index": "pypi", + "version": "==1.0.0" + }, + "sphinxcontrib-applehelp": { + "hashes": [ + "sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a", + "sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.2" + }, + "sphinxcontrib-devhelp": { + "hashes": [ + "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e", + 
"sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.2" + }, + "sphinxcontrib-htmlhelp": { + "hashes": [ + "sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07", + "sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2" + ], + "markers": "python_version >= '3.6'", + "version": "==2.0.0" + }, + "sphinxcontrib-jsmath": { + "hashes": [ + "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", + "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.1" + }, + "sphinxcontrib-qthelp": { + "hashes": [ + "sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72", + "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.3" + }, + "sphinxcontrib-serializinghtml": { + "hashes": [ + "sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd", + "sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952" + ], + "markers": "python_version >= '3.5'", + "version": "==1.1.5" + }, "tomli": { "hashes": [ "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", @@ -957,6 +996,22 @@ ], "markers": "python_version >= '3.7'", "version": "==2.0.1" + }, + "urllib3": { + "hashes": [ + "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", + "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", + "version": "==1.26.8" + }, + "zipp": { + "hashes": [ + "sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d", + "sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375" + ], + "markers": "python_version >= '3.7'", + "version": "==3.7.0" } } 
} diff --git a/cisticola/base.py b/cisticola/base.py index 97a18df..2c9ad83 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -5,21 +5,118 @@ from datetime import datetime from sqlalchemy.orm import registry from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey -mapper_registry = registry() - @dataclass class ScraperResult: - """A minimally processed result from a scraper""" + """A minimally processed result from a scraper + """ + #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``. scraper: str + + #: Name of platform from which result was scraped, e.g. ``"Twitter"``. platform: str - channel: int #TODO there is probably a way of making this a Channel object foreign key + + #TODO there is probably a way of making this a Channel object foreign key + #: User-specified integer that uniquely identifies a channel, e.g. ``15``. + channel: int + + #: String that uniquely identifies the scraped post on the given platform, e.g. ``"1503397267675533313"`` platform_id: str + + #: Datetime (relative to UTC) that the scraped post was created at. date: datetime + + #: JSON dump of dict that contains all data scraped for the post. raw_data: str + + #: Datetime (relative to UTC) that the scraped post was archived at. date_archived: datetime + + #: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files. archived_urls: dict +@dataclass +class Channel: + """Information about a specific channel to be scraped. + """ + + #: User-specified integer that uniquely identifies a channel, e.g. ``15``. + id: int + + #: Name of channel (different from username because it can be non-unique and contain emojis), e.g. ``"T🕊Редакция Президент Гордон🕊"``. + name: str + + #: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``. 
+ platform_id: str + + #: User-specified category for the channel, e.g. ``"qanon-adjacent"``. + category: str + + #: Number of followers the channel has on the given platform, e.g. ``1465``. + followers: int + + #: Name of platform the given channel is on, e.g. ``"Telegram"``. + platform: str + + #: URL for the given channel on the platform, e.g. ``"https://t.me/prezidentgordonteam"`` + url: str + + #: Screen name/username of channel. + screenname: str + + #: 2 digit country code for the country of origin for the channel, e.g. ``"RU"``. + country: str + + #: Name of influencer, if channel belongs to an influencer that operates on multiple platforms. + influencer: str + + #: Whether or not the channel is publicly-accessible. + public: bool + + #: Whether or not the channel is a chat (i.e. allows users who are not the channel creator to post/message) + chat: bool + + #: Any other additional notes about the channel. + notes: str + +@dataclass +class TransformedResult: + """An object with fields for columns in the analysis table""" + + #: ID number of the scraped post in the ``raw_data`` table + raw_id: int + + #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``. + scraper: str + + #: String specifying name and version of transformer used to transform result, e.g. ``"TwitterTransformer 0.0.1"``. + transformer: str + + #: Name of platform from which result was scraped, e.g. ``"Twitter"``. + platform: str + + #: User-specified integer that uniquely identifies a channel, e.g. ``15``. + channel: str + + #: Datetime (relative to UTC) that the scraped post was created at. + date: datetime + + #: Datetime (relative to UTC) that the scraped post was archived at. + date_archived: datetime + + #: URL of the original post + url: str + + #: Text of the original post + content: str + + #: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``. 
+ author_id: str + + #: Username of author who made post. + author_username: str + +mapper_registry = registry() raw_data_table = Table('raw_data', mapper_registry.metadata, Column('id', Integer, primary_key=True, @@ -35,40 +132,6 @@ raw_data_table = Table('raw_data', mapper_registry.metadata, mapper_registry.map_imperatively(ScraperResult, raw_data_table) - -@dataclass -class Channel: - id: int - name: str - platform_id: str - category: str - followers: int - platform: str - url: str - screenname: str - country: str - influencer: str - public: bool - chat: bool - notes: str - - -@dataclass -class TransformedResult: - """An object with fields for columns in the analysis table""" - raw_id: int - scraper: str - transformer: str - platform: str - channel: str - date: datetime - date_archived: datetime - url: str - content: str - author_id: str - author_username: str - - analysis_table = Table('analysis', mapper_registry.metadata, Column('id', Integer, primary_key=True, autoincrement=True), diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index ea68f70..524a729 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -13,29 +13,73 @@ from cisticola.base import Channel, ScraperResult, mapper_registry from cisticola.scraper import make_request class Scraper: + """Base class for defining platform-specific scrapers for scraping all posts + from a given channel on that specific platform. 
+ """ + __version__ = "Scraper 0.0.0" def __init__(self): - self.s3_client = boto3.client('s3', - region_name=os.environ['DO_SPACES_REGION'], - endpoint_url='https://{}.digitaloceanspaces.com'.format( - os.environ['DO_SPACES_REGION']), - aws_access_key_id=os.environ['DO_SPACES_KEY'], - aws_secret_access_key=os.environ['DO_SPACES_SECRET']) + # Initialize client to transfer files to the storage archive + self.s3_client = boto3.client( + service_name='s3', + region_name=os.environ['DO_SPACES_REGION'], + endpoint_url=f'https://{os.environ["DO_SPACES_REGION"]}.digitaloceanspaces.com', + aws_access_key_id=os.environ['DO_SPACES_KEY'], + aws_secret_access_key=os.environ['DO_SPACES_SECRET']) + + # Define request headers (necessary to bypass scraping protection + # for several platform scrapers) self.headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'} - pass - def __str__(self): return self.__version__ def url_to_key(self, url: str, content_type: str) -> str: + """Generate a unique identifier for media from a specified post. + + Parameters + --------- + url: str + URL of original post. + e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"`` + content_type: str + Content-Type of media. + e.g. ``"image/jpeg"`` + + Returns + ------- + key: str + Unique identifier for the media file from a specified post based on + the original post URL and the media's Content-Type. + """ + key = urlparse(url).path.split('/')[-1] return key def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: + """Download media file from a specified post URL. + + Parameters + --------- + url: str + URL of original post. + e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"`` + key: str or None + Pre-defined unique identifier for the media file. + + Returns + ------- + blob: bytes + Raw bytes of the downloaded media file. + content_type: str + Content-Type of media. + e.g. ``"image/jpeg"``. 
+ key: str + Unique identifier for the media file. + """ r = make_request(url, headers = self.headers) @@ -48,6 +92,27 @@ class Scraper: return blob, content_type, key def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: + """Download media file from a specified post URL, where the media file + is formatted as an m3u8 playlist, which is then decoded to an mp4 file. + + Parameters + --------- + url: str + URL of original post. + e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"`` + key: str or None + Pre-defined unique identifier for the media file. + + Returns + ------- + blob: bytes + Raw bytes of the downloaded media file. + content_type: str + Content-Type of media. + e.g. ``"video/mp4"``. + key: str + Unique identifier for the media file. + """ content_type = 'video/mp4' ext = '.' + content_type.split('/')[-1] @@ -70,6 +135,23 @@ class Scraper: return blob, content_type, key def archive_blob(self, blob: bytes, content_type: str, key: str) -> str: + """Upload raw bytes of a media file to the storage archive. + + Parameters + ---------- + blob: bytes + Raw bytes of the media file to be archived. + content_type: str + Content-Type of media. + e.g. ``"video/mp4"``. + key: str + Unique identifier for the media file. + + Returns + ------- + archived_url: str + URL specifying the file on the storage archive. + """ filename = self.__version__.replace(' ', '_') + '/' + key @@ -81,9 +163,37 @@ class Scraper: return archived_url def can_handle(self, channel: Channel) -> bool: + """Whether or not the scraper can scrape the specified channel. + + Parameters + ---------- + channel: Channel + Channel to be scraped. + + Returns + ------- + bool + ``True`` if the scraper is capable of scraping ``channel``, + ``False`` if not. 
+ """ + raise NotImplementedError def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: + """Scrape all posts from the specified Channel. + + Parameters + ---------- + channel: Channel + Channel to be scraped. + since: ScraperResult or None + Most recently scraped ScraperResult from a previous scrape, or + ``None`` if scraper has not run before. + archive_media: bool + If ``True``, any media files (images, video, etc.) from posts are archived. + If ``False``, media files are not archived. + """ + raise NotImplementedError @@ -97,13 +207,28 @@ class ScraperController: self.mapper_registry = None def register_scraper(self, scraper: Scraper): + """Register a single Scraper instance to the controller. + """ self.scrapers.append(scraper) def register_scrapers(self, scraper: List[Scraper]): + """Register a list of Scraper instances to the controller. + """ self.scrapers.extend(scraper) @logger.catch def scrape_channels(self, channels: List[Channel], archive_media: bool = True): + """Scrape all posts for all specified channels. + + Parameters + ---------- + channels: list + List of Channel instances to be scraped + archive_media: bool + If ``True``, any media files (images, video, etc.) from posts are archived. + If ``False``, media files are not archived. + """ + if self.session is None: logger.error("No DB session") return @@ -143,15 +268,11 @@ class ScraperController: logger.warning(f"No handler found for Channel {channel}") def connect_to_db(self, engine): + """Connect the specified SQLAlchemy engine to the controller. 
+ """ + # create tables mapper_registry.metadata.create_all(bind=engine) self.session = sessionmaker() - self.session.configure(bind=engine) - - -class ETLController: - """This class will transform the raw_data tables into a format more conducive to analysis.""" - - def __init__(self): - pass + self.session.configure(bind=engine) \ No newline at end of file diff --git a/docs/images/cisticola_logo.svg b/docs/images/cisticola_logo.svg new file mode 100644 index 0000000..f570be8 --- /dev/null +++ b/docs/images/cisticola_logo.svg @@ -0,0 +1,64 @@ + + + + + + + + + + + + + diff --git a/docs/images/favicon.ico b/docs/images/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..75d94461c1835dd1a4c39511a04c96999bbc465b GIT binary patch literal 614 zcmV-s0-61ZP)zG)YHEs^nHgLd9BglIv$wa03xk7DC`7Z_#D&2pV+_$~lt!b03xf|L z!rIyzkw}E=>uX#Xx+Eg3tgLWya>B*M1vfW0xbWzLh_JP_#nI6bxm=FT%}rDl7ao>G zgq@uove_&f8yn9(9U6Ie{QB{76 z!NTn9EMsG1oS&bgs&q0pH^=1UBsL5d;_*0bl}d$9M1(@2Kq{5OhQUH#UmvQ<_xCrQ zjE;^nI5^1b>nk=47T(|A5fP%%D4opD&m$s~N+oOU0p>)_@Ck7VfOd; zxxc^X>FEg@JwTQYK0iMRhr^`PX+oh8<#L&ypCA4ei^aISykux-h=qj(K0ZFMA%J4R z^E?g@4_RMd=j-c>VzJ2E+Z&$e5sSr`o}Q*!t+KqlOs!VKmH Date: Mon, 14 Mar 2022 19:38:33 -0500 Subject: [PATCH 2/4] added more docstrings and comments --- cisticola/base.py | 148 ++++++++++++++++++++++---------------- cisticola/scraper/base.py | 6 +- pytest.ini | 5 +- 3 files changed, 94 insertions(+), 65 deletions(-) diff --git a/cisticola/base.py b/cisticola/base.py index fc07846..dfaaee8 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -1,14 +1,15 @@ from typing import List from dataclasses import dataclass from datetime import datetime +import tempfile +import json +import io + from sqlalchemy.orm import registry from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey import pytesseract import PIL -import io import exiftool -import json -import os from .utils import make_request @@ -123,6 +124,85 @@ class TransformedResult: #: 
Text of the original post content: str +@dataclass +class Media: + """Base class for organizing information about a media file. + """ + + #: ID number of the media's corresponding scraped post in the ``raw_data`` table. + raw_id: int + + #: ID number of the media's corresponding scraped post in the ``analysis`` table. + post: int + + #: URL of the original post. + url: str + + #: Original URL of the media from the original post. + original_url: str + + #: JSON dump of the dict containing metadata information for the media file. + exif: str = None + + def get_blob(self): + """Download media file as bytes blob. + """ + + blob = make_request(self.url) + return blob.content + + def hydrate(self, blob = None): + """Download media file as bytes blob and extract data from content. + """ + + if blob is None: + blob = self.get_blob() + + self.hydrate_exif(blob) + + def hydrate_exif(self, blob): + """Extract Exif metadata from bytes blob. + """ + + with tempfile.NamedTemporaryFile() as temp_file: + temp_file.write(blob) + + with exiftool.ExifTool() as et: + exif = et.get_metadata(temp_file.name) + self.exif = json.dumps(exif) + +@dataclass +class Image(Media): + """Class for organizing information about an image file. + """ + + #: Extracted OCR content from image + ocr: str = None + + def hydrate(self, blob=None): + """Download image file as bytes blob and extract Exif and OCR content + from the image. + """ + + if blob is None: + blob = self.get_blob() + + super().hydrate(blob) + self.hydrate_ocr(blob) + + def hydrate_ocr(self, blob): + """Extract OCR (optical character recognition) data from image bytes blob. + """ + + image = PIL.Image.open(io.BytesIO(blob)) + self.ocr = pytesseract.image_to_string(image) + +@dataclass +class Video(Media): + """Class for organizing information about a video file. 
+ """ + + pass mapper_registry = registry() @@ -138,7 +218,6 @@ raw_data_table = Table('raw_data', mapper_registry.metadata, Column('date_archived', DateTime), Column('archived_urls', JSON)) -mapper_registry.map_imperatively(ScraperResult, raw_data_table) analysis_table = Table('analysis', mapper_registry.metadata, Column('id', Integer, primary_key=True, @@ -153,72 +232,21 @@ analysis_table = Table('analysis', mapper_registry.metadata, Column('url', String), Column('author_id', String), Column('author_username', String), - Column('content', String) - ) - -mapper_registry.map_imperatively(TransformedResult, analysis_table) - -@dataclass -class Media: - raw_id: int - post: int - url: str - original_url: str - - exif: str = None - - def get_blob(self): - blob = make_request(self.url) - return blob.content - - def hydrate(self, blob = None): - if blob is None: - blob = self.get_blob() - - self.hydrate_exif(blob) - - def hydrate_exif(self, blob): - f = open('tmp', 'wb') - f.write(blob) - f.close() - - with exiftool.ExifTool() as et: - exif = et.get_metadata('tmp') - self.exif = json.dumps(exif) - - os.remove('tmp') - -@dataclass -class Image(Media): - ocr: str = None - - def hydrate(self, blob=None): - if blob is None: - blob = self.get_blob() - - super().hydrate(blob) - self.hydrate_ocr(blob) - - def hydrate_ocr(self, blob): - image = PIL.Image.open(io.BytesIO(blob)) - self.ocr = pytesseract.image_to_string(image) - -@dataclass -class Video(Media): - pass + Column('content', String)) media_table = Table('media', mapper_registry.metadata, Column('id', Integer, primary_key=True, autoincrement=True), - Column('type', String), + Column('type', String), Column('raw_id', Integer, ForeignKey('raw_data.id')), Column('post', Integer, ForeignKey('analysis.id')), Column('url', String), Column('original_url', String), Column('exif', String), - Column('ocr', String) - ) + Column('ocr', String)) +mapper_registry.map_imperatively(TransformedResult, analysis_table) 
+mapper_registry.map_imperatively(ScraperResult, raw_data_table) mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media') mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image') mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video') \ No newline at end of file diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index f35a13e..6f853c6 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -278,8 +278,8 @@ class ScraperController: self.session.configure(bind=self.engine) def reset_db(self): + """Drop all data from the SQLAlchemy database. + """ mapper_registry.metadata.drop_all(bind=self.engine) - self.connect_to_db(self.engine) - - + self.connect_to_db(self.engine) \ No newline at end of file diff --git a/pytest.ini b/pytest.ini index 09a94e1..f3545f6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,6 +1,6 @@ [pytest] minversion = - 6.0.2 + 7.0.0 testpaths = tests/ python_files = @@ -13,4 +13,5 @@ addopts = --self-contained-html filterwarnings = ignore:the imp module is deprecated:DeprecationWarning - ignore:The localize method is no longer necessary, as this time zone supports the fold attribute \ No newline at end of file + ignore:The localize method is no longer necessary, as this time zone supports the fold attribute + ignore:invalid escape sequence:DeprecationWarning \ No newline at end of file From d68d76c0ab92dc6f351a0a27fcd4811868141790 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Tue, 15 Mar 2022 12:40:18 -0500 Subject: [PATCH 3/4] added missing docstrings, created Makefile target for sphinx-apidoc, added quickstart page for installation and configuration instructions --- cisticola/scraper/base.py | 60 ++++++++++-- cisticola/scraper/bitchute.py | 8 +- cisticola/scraper/gab.py | 6 +- cisticola/scraper/gettr.py | 6 +- 
cisticola/scraper/instagram.py | 1 + cisticola/scraper/odysee.py | 6 +- cisticola/scraper/rumble.py | 6 +- cisticola/scraper/telegram_snscrape.py | 1 + cisticola/scraper/telegram_telethon.py | 7 +- docs/Makefile | 12 +++ docs/make.bat | 11 +++ docs/source/cisticola.rst | 1 + docs/source/cisticola.scraper.instagram.rst | 8 ++ docs/source/cisticola.scraper.rst | 4 +- docs/source/cisticola.scraper.utils.rst | 8 -- docs/source/cisticola.scraper.vkontakte.rst | 8 ++ docs/source/cisticola.scraper.youtube.rst | 8 ++ .../source/cisticola.transformer.bitchute.rst | 8 ++ docs/source/cisticola.transformer.rst | 1 + docs/source/cisticola.utils.rst | 8 ++ docs/source/index.rst | 15 +-- docs/source/quickstart.rst | 96 +++++++++++++++++++ 22 files changed, 241 insertions(+), 48 deletions(-) create mode 100644 docs/source/cisticola.scraper.instagram.rst delete mode 100644 docs/source/cisticola.scraper.utils.rst create mode 100644 docs/source/cisticola.scraper.vkontakte.rst create mode 100644 docs/source/cisticola.scraper.youtube.rst create mode 100644 docs/source/cisticola.transformer.bitchute.rst create mode 100644 docs/source/cisticola.utils.rst create mode 100644 docs/source/quickstart.rst diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index a2f921f..28dbe76 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -38,6 +38,24 @@ class Scraper: def __str__(self): return self.__version__ + def get_username_from_url(self, url: str) -> str: + """Extract a channel's username from its URL. + + Parameters + ---------- + url: str + URL of the channel on a given platform + e.g. ``"https://twitter.com/EliotHiggins"`` + + Returns + ------- + username: str + Extracted username of the channel. + e.g. ``"EliotHiggins"`` + """ + + raise NotImplementedError + def url_to_key(self, url: str, content_type: str) -> str: """Generate a unique identifier for media from a specified post. 
@@ -61,13 +79,13 @@ class Scraper: return key def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: - """Download media file from a specified post URL. + """Download media file from a specified media file URL. Parameters --------- url: str - URL of original post. - e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"`` + URL of media file from original post. + e.g. ``"https://pbs.twimg.com/media/FN0j0dYWUAcQxfK?format=png&name=medium"`` key: str or None Pre-defined unique identifier for the media file. @@ -93,14 +111,14 @@ class Scraper: return blob, content_type, key def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: - """Download media file from a specified post URL, where the media file + """Download media file from a specified media URL, where the media file is formatted as an m3u8 playlist, which is then decoded to an mp4 file. Parameters --------- url: str - URL of original post. - e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"`` + URL of m3u8 playlist file from original post. + e.g. ``"https://media.gettr.com/group47/origin/2022/03/15/01/cbc436c1-1a1a-4b97-671d-c42109f3ec9b/out.m3u8"`` key: str or None Pre-defined unique identifier for the media file. @@ -136,7 +154,28 @@ class Scraper: return blob, content_type, key def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: - + """Download media file from a specified media URL, using a fork of + youtube-dl that enables faster downloading. + + Parameters + --------- + url: str + URL of media file from original post. + e.g. ``"https://rumble.com/embed/vgt7gh/"`` + key: str or None + Pre-defined unique identifier for the media file. + + Returns + ------- + blob: bytes + Raw bytes of the downloaded media file. + content_type: str + Content-Type of media. + e.g. ``"video/mp4"``. + key: str + Unique identifier for the media file. 
+ """ + content_type = 'video/mp4' with tempfile.TemporaryDirectory() as temp_dir: @@ -225,6 +264,11 @@ class Scraper: archive_media: bool If ``True``, any media files (images, video, etc.) from posts are archived. If ``False``, media files are not archived. + + Yields + ------ + ScraperResult + Scraper result from a single post/comment from the specified Channel. """ raise NotImplementedError @@ -311,7 +355,7 @@ class ScraperController: self.session.configure(bind=self.engine) def reset_db(self): - """Drop all data from the SQLAlchemy database. + """Drop all data from the connected SQLAlchemy database. """ mapper_registry.metadata.drop_all(bind=self.engine) diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index 8a365f4..47a822e 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -1,4 +1,4 @@ -from datetime import datetime, timezone + from datetime import datetime, timezone import time import re from html.parser import HTMLParser @@ -17,7 +17,7 @@ class BitchuteScraper(Scraper): library""" __version__ = "BitchuteScraper 0.0.1" - def get_username_from_url(url): + def get_username_from_url(self, url): username = url.split('bitchute.com/channel/')[-1].strip('/') return username @@ -33,7 +33,7 @@ class BitchuteScraper(Scraper): detail = 'comments' - username = BitchuteScraper.get_username_from_url(channel.url) + username = self.get_username_from_url(channel.url) scraper = get_videos_user(session, username, csrftoken, detail) for post in scraper: @@ -61,7 +61,7 @@ class BitchuteScraper(Scraper): archived_urls=archived_urls) def can_handle(self, channel): - if channel.platform == "Bitchute" and BitchuteScraper.get_username_from_url(channel.url) is not None: + if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None: return True #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index 
910ebc2..f90f2a3 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -11,14 +11,14 @@ class GabScraper(Scraper): """An implementation of a Scraper for Gab, using GARC library""" __version__ = "GabScraper 0.0.1" - def get_username_from_url(url): + def get_username_from_url(self, url): username = url.split('https://gab.com/')[-1] return username def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: client = Garc(profile = 'main') - username = GabScraper.get_username_from_url(channel.url) + username = self.get_username_from_url(channel.url) scraper = client.userposts(username) @@ -52,5 +52,5 @@ class GabScraper(Scraper): archived_urls=archived_urls) def can_handle(self, channel): - if channel.platform == "Gab" and GabScraper.get_username_from_url(channel.url) is not None: + if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None: return True \ No newline at end of file diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index 3cd069e..4fb15cc 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -12,7 +12,7 @@ class GettrScraper(Scraper): """An implementation of a Scraper for Gettr, using gogettr library""" __version__ = "GettrScraper 0.0.1" - def get_username_from_url(url): + def get_username_from_url(self, url): username = url.split("gettr.com/user/")[1] if len(username.split("/")) > 1: return None @@ -21,7 +21,7 @@ class GettrScraper(Scraper): def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: client = PublicClient() - username = GettrScraper.get_username_from_url(channel.url) + username = self.get_username_from_url(channel.url) scraper = client.user_activity(username=username, type="posts") for post in scraper: @@ -62,7 +62,7 @@ class GettrScraper(Scraper): archived_urls=archived_urls) def can_handle(self, 
channel): - if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None: + if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None: return True def url_to_key(self, url: str, content_type: str) -> str: diff --git a/cisticola/scraper/instagram.py b/cisticola/scraper/instagram.py index eb20ecb..f9ae76e 100644 --- a/cisticola/scraper/instagram.py +++ b/cisticola/scraper/instagram.py @@ -18,6 +18,7 @@ CONTENT_TYPES = { 'mp4' : 'video/mp4'} class InstagramScraper(Scraper): + """An implementation of a Scraper for Instagram, using instaloader library""" __version__ = "InstagramScraper 0.0.1" def get_username_from_url(self, url): diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index 61ed9ca..eb7ec04 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -13,7 +13,7 @@ class OdyseeScraper(Scraper): """An implementation of a Scraper for Odysee, using polyphemus library""" __version__ = "OdyseeScraper 0.0.1" - def get_username_from_url(url): + def get_username_from_url(self, url): username = url.split('odysee.com/')[-1].strip('@').split(':')[0] @@ -21,7 +21,7 @@ class OdyseeScraper(Scraper): def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: - username = OdyseeScraper.get_username_from_url(channel.url) + username = self.get_username_from_url(channel.url) odysee_channel = OdyseeChannel(channel_name = username) all_videos = odysee_channel.get_all_videos() @@ -70,7 +70,7 @@ class OdyseeScraper(Scraper): archived_urls={}) def can_handle(self, channel): - if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None: + if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None: return True def url_to_key(self, url: str, content_type: str) -> str: diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index 
8546d6e..9863fb0 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -14,14 +14,14 @@ class RumbleScraper(Scraper): """An implementation of a Scraper for Rumble, using custom functions""" __version__ = "RumbleScraper 0.0.1" - def get_username_from_url(url): + def get_username_from_url(self, url): username = url.split('https://rumble.com/c/')[1] return username def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: - username = RumbleScraper.get_username_from_url(channel.url) + username = self.get_username_from_url(channel.url) scraper = get_channel_videos(username) for post in scraper: @@ -54,7 +54,7 @@ class RumbleScraper(Scraper): return key def can_handle(self, channel): - if channel.platform == "Rumble" and RumbleScraper.get_username_from_url(channel.url) is not None: + if channel.platform == "Rumble" and self.get_username_from_url(channel.url) is not None: return True #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index 3f3f45d..ec5b292 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -8,6 +8,7 @@ from cisticola.base import Channel, ScraperResult from cisticola.scraper.base import Scraper class TelegramSnscrapeScraper(Scraper): + """An implementation of a Scraper for Telegram, using snscrape library""" __version__ = "TelegramSnscrapeScraper 0.0.1" def can_handle(self, channel): diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py index 76d68f2..b8231bc 100644 --- a/cisticola/scraper/telegram_telethon.py +++ b/cisticola/scraper/telegram_telethon.py @@ -14,6 +14,7 @@ from cisticola.scraper.base import Scraper MEDIA_TYPES = ['photo', 'video', 'document', 'webpage'] class TelegramTelethonScraper(Scraper): + """An implementation of a Scraper for 
Telegram, using Telethon library""" __version__ = "TelegramTelethonScraper 0.0.1" def get_username_from_url(self, url): @@ -30,9 +31,9 @@ class TelegramTelethonScraper(Scraper): username = self.get_username_from_url(channel.url) - api_id = os.environ['TELEGRAM_API_ID_1'] - api_hash = os.environ['TELEGRAM_API_HASH_1'] - phone = os.environ['TELEGRAM_PHONE_1'] + api_id = os.environ['TELEGRAM_API_ID'] + api_hash = os.environ['TELEGRAM_API_HASH'] + phone = os.environ['TELEGRAM_PHONE'] with TelegramClient(phone, api_id, api_hash) as client: diff --git a/docs/Makefile b/docs/Makefile index d0c3cbf..ab3e9be 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -8,12 +8,24 @@ SPHINXBUILD ?= sphinx-build SOURCEDIR = source BUILDDIR = build +SPHINXAPIDOC = sphinx-apidoc +APIDOCFLAGS = --separate --private --module-first +MODULEPATH = ../cisticola +SOURCEFILES = cisticola.* +MODULEFILE = modules.rst + # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile +# Custom process and flags for generating Sphinx sources +apidoc: + rm $(SOURCEDIR)/$(SOURCEFILES) + $(SPHINXAPIDOC) $(APIDOCFLAGS) -o "$(SOURCEDIR)" "$(MODULEPATH)" + rm $(SOURCEDIR)/$(MODULEFILE) + # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
%: Makefile diff --git a/docs/make.bat b/docs/make.bat index 6fcf05b..3ab2ef7 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -10,6 +10,12 @@ if "%SPHINXBUILD%" == "" ( set SOURCEDIR=source set BUILDDIR=build +set SPHINXAPIDOC=sphinx-apidoc +set APIDOCFLAGS=--separate --private --module-first +set MODULEPATH=../cisticola +set SOURCEFILES=cisticola.* +set MODULEFILE=modules.rst + if "%1" == "" goto help %SPHINXBUILD% >NUL 2>NUL @@ -28,6 +34,11 @@ if errorlevel 9009 ( %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% goto end +:apidoc + del %SOURCEDIR%\%SOURCEFILES% + %SPHINXAPIDOC% %APIDOCFLAGS% -o %SOURCEDIR% %MODULEPATH% + del %SOURCEDIR%\%MODULEFILE% + :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% diff --git a/docs/source/cisticola.rst b/docs/source/cisticola.rst index 6857abd..22cdf67 100644 --- a/docs/source/cisticola.rst +++ b/docs/source/cisticola.rst @@ -23,3 +23,4 @@ Submodules :maxdepth: 4 cisticola.base + cisticola.utils diff --git a/docs/source/cisticola.scraper.instagram.rst b/docs/source/cisticola.scraper.instagram.rst new file mode 100644 index 0000000..53ddc43 --- /dev/null +++ b/docs/source/cisticola.scraper.instagram.rst @@ -0,0 +1,8 @@ +cisticola.scraper.instagram module +================================== + +.. 
automodule:: cisticola.scraper.instagram + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.scraper.rst b/docs/source/cisticola.scraper.rst index 5e3d9a1..b93592c 100644 --- a/docs/source/cisticola.scraper.rst +++ b/docs/source/cisticola.scraper.rst @@ -17,9 +17,11 @@ Submodules cisticola.scraper.bitchute cisticola.scraper.gab cisticola.scraper.gettr + cisticola.scraper.instagram cisticola.scraper.odysee cisticola.scraper.rumble cisticola.scraper.telegram_snscrape cisticola.scraper.telegram_telethon cisticola.scraper.twitter - cisticola.scraper.utils + cisticola.scraper.vkontakte + cisticola.scraper.youtube diff --git a/docs/source/cisticola.scraper.utils.rst b/docs/source/cisticola.scraper.utils.rst deleted file mode 100644 index ceefb4d..0000000 --- a/docs/source/cisticola.scraper.utils.rst +++ /dev/null @@ -1,8 +0,0 @@ -cisticola.scraper.utils module -============================== - -.. automodule:: cisticola.scraper.utils - :members: - :undoc-members: - :show-inheritance: - :private-members: diff --git a/docs/source/cisticola.scraper.vkontakte.rst b/docs/source/cisticola.scraper.vkontakte.rst new file mode 100644 index 0000000..405d70d --- /dev/null +++ b/docs/source/cisticola.scraper.vkontakte.rst @@ -0,0 +1,8 @@ +cisticola.scraper.vkontakte module +================================== + +.. automodule:: cisticola.scraper.vkontakte + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.scraper.youtube.rst b/docs/source/cisticola.scraper.youtube.rst new file mode 100644 index 0000000..e990195 --- /dev/null +++ b/docs/source/cisticola.scraper.youtube.rst @@ -0,0 +1,8 @@ +cisticola.scraper.youtube module +================================ + +.. 
automodule:: cisticola.scraper.youtube + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.transformer.bitchute.rst b/docs/source/cisticola.transformer.bitchute.rst new file mode 100644 index 0000000..7427e9f --- /dev/null +++ b/docs/source/cisticola.transformer.bitchute.rst @@ -0,0 +1,8 @@ +cisticola.transformer.bitchute module +===================================== + +.. automodule:: cisticola.transformer.bitchute + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.transformer.rst b/docs/source/cisticola.transformer.rst index 218e1ec..358d955 100644 --- a/docs/source/cisticola.transformer.rst +++ b/docs/source/cisticola.transformer.rst @@ -14,4 +14,5 @@ Submodules :maxdepth: 4 cisticola.transformer.base + cisticola.transformer.bitchute cisticola.transformer.twitter diff --git a/docs/source/cisticola.utils.rst b/docs/source/cisticola.utils.rst new file mode 100644 index 0000000..6e5872a --- /dev/null +++ b/docs/source/cisticola.utils.rst @@ -0,0 +1,8 @@ +cisticola.utils module +====================== + +.. automodule:: cisticola.utils + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/index.rst b/docs/source/index.rst index e3f70a9..3c12d81 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -2,16 +2,7 @@ Welcome to Cisticola's documentation! ===================================== .. 
toctree:: - :maxdepth: 2 - :caption: Contents: + :maxdepth: 1 - cisticola - - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` + quickstart + cisticola \ No newline at end of file diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst new file mode 100644 index 0000000..a6c5643 --- /dev/null +++ b/docs/source/quickstart.rst @@ -0,0 +1,96 @@ +Quickstart +========== + +Installation +------------ + +The *cisticola* application uses pipenv_ for dependency management. To install the dependencies of *cisticola*, first install pipenv using the following command: + +.. code-block:: + + pip install pipenv + +and then install the dependencies using the following command from the package root directory: + +.. code-block:: + + pipenv install + +To install the necessary dependencies for building the documentation and running unit tests, run the following command from the package root directory: + +.. code-block:: + + pipenv install --dev + +Environment Variables +--------------------- + +Three of the scrapers in *cisticola* (:py:mod:`~cisticola.scraper.gab.GabScraper`, :py:mod:`~cisticola.scraper.instagram.InstagramScraper`, and :py:mod:`~cisticola.scraper.telegram_telethon.TelegramTelethonScraper`) require platform credentials to work correctly. + +Gab +""" + +The Gab credentials can be configured by running the following command from the root directory: + +.. code-block:: + + pipenv run garc configure + +which will direct you to provide the username and password for your Gab account. 
+ +Instagram +""""""""" + +The Instagram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment: + +- ``INSTAGRAM_USERNAME``: username of your Instagram account +- ``INSTAGRAM_PASSWORD``: password of your Instagram account + +Telegram Telethon +""""""""""""""""" + +The Telegram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment: + +- ``TELEGRAM_API_ID``: API ID number for your Telegram application +- ``TELEGRAM_API_HASH``: API hash for your Telegram application +- ``TELEGRAM_PHONE``: phone number for the account corresponding to your Telegram application + +If you do not already have a Telegram application, you can create one by following the instructions on `this page`_. + +Documentation +------------- + +The *cisticola* application uses Sphinx_ to generate and display its documentation. To build the documentation in the HTML format, run the following command from the ``docs/`` directory: + +.. code-block:: + + pipenv run make html + +For developers, if changes are made to the package structure or additional modules are created, you can update the Sphinx source ``*.rst`` files by running the following command from the ``docs/`` directory: + +.. code-block:: + + pipenv run make apidoc + +Testing +------- + +The *cisticola* application uses pytest_ for unit testing. To run the test suite, run the following command from the package root directory: + +.. code-block:: + + pipenv run pytest + +Examples +-------- + +An example of a *cisticola* ingest file ``russian_telegram_ingest.py`` is included in the package root directory, showing how the list of channels to scrape is defined, and how the :py:mod:`~cisticola.scraper.base.ScraperController` and :py:mod:`~cisticola.transformer.base.Transformer` classes are used. 
To run the ingest script, run the following command from the package root directory: + +.. code-block:: + + pipenv run python russian_telegram_ingest.py + +.. _pipenv: https://pipenv.pypa.io/en/latest/ +.. _Sphinx: https://www.sphinx-doc.org/en/master/ +.. _pytest: https://docs.pytest.org/en/7.1.x/ +.. _this page: https://core.telegram.org/api/obtaining_api_id \ No newline at end of file From 93554b19e90c8f70e785696e583b57740fdb53e5 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Tue, 15 Mar 2022 13:05:41 -0500 Subject: [PATCH 4/4] fixed typo --- cisticola/scraper/bitchute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index 47a822e..b11d27a 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -1,4 +1,4 @@ - from datetime import datetime, timezone +from datetime import datetime, timezone import time import re from html.parser import HTMLParser