From a7bd023c213e0091055f8537bea9f8f5262c3bd9 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Fri, 1 Sep 2023 17:05:13 -0500 Subject: [PATCH 1/6] simplified downloading logic (methods for keeping track of files less necessary since scraping can be done in Python), added functionality to use yt-dlp to download videos, added functionality to download TikTok image galleries --- Pipfile | 13 - Pipfile.lock | 416 ------------------ README.md | 44 +- requirements.txt | 2 - scripts/release.sh | 2 +- setup.py | 21 +- tiktok_hashtag_analysis/__init__.py | 1 + tiktok_hashtag_analysis/__main__.py | 139 +++--- tiktok_hashtag_analysis/base.py | 259 +++++++++++ tiktok_hashtag_analysis/data_methods.py | 161 ------- tiktok_hashtag_analysis/file_methods.py | 216 --------- tiktok_hashtag_analysis/global_data.py | 32 -- .../hashtag_frequencies.py | 99 ----- tiktok_hashtag_analysis/hashtag_list.txt | 5 - tiktok_hashtag_analysis/logging.config | 36 -- tiktok_hashtag_analysis/run_downloader.py | 150 ------- tiktok_hashtag_analysis/version.py | 12 - 17 files changed, 364 insertions(+), 1244 deletions(-) delete mode 100644 Pipfile delete mode 100644 Pipfile.lock delete mode 100644 requirements.txt create mode 100644 tiktok_hashtag_analysis/base.py delete mode 100644 tiktok_hashtag_analysis/data_methods.py delete mode 100644 tiktok_hashtag_analysis/file_methods.py delete mode 100644 tiktok_hashtag_analysis/global_data.py delete mode 100644 tiktok_hashtag_analysis/hashtag_frequencies.py delete mode 100644 tiktok_hashtag_analysis/hashtag_list.txt delete mode 100644 tiktok_hashtag_analysis/logging.config delete mode 100644 tiktok_hashtag_analysis/run_downloader.py delete mode 100644 tiktok_hashtag_analysis/version.py diff --git a/Pipfile b/Pipfile deleted file mode 100644 index 2fe1f28..0000000 --- a/Pipfile +++ /dev/null @@ -1,13 +0,0 @@ -[[source]] -url = "https://pypi.org/simple" -verify_ssl = true -name = "pypi" - -[packages] -matplotlib = "*" -seaborn = "*" - -[dev-packages] - 
-[requires] -python_version = "3.10" diff --git a/Pipfile.lock b/Pipfile.lock deleted file mode 100644 index 1518e4e..0000000 --- a/Pipfile.lock +++ /dev/null @@ -1,416 +0,0 @@ -{ - "_meta": { - "hash": { - "sha256": "97c5ef0126b17f586b5fa1d518cf359b7e984e48f8fc2310e9aa79bd384c2374" - }, - "pipfile-spec": 6, - "requires": { - "python_version": "3.10" - }, - "sources": [ - { - "name": "pypi", - "url": "https://pypi.org/simple", - "verify_ssl": true - } - ] - }, - "default": { - "contourpy": { - "hashes": [ - "sha256:031154ed61f7328ad7f97662e48660a150ef84ee1bc8876b6472af88bf5a9b98", - "sha256:0f9d350b639db6c2c233d92c7f213d94d2e444d8e8fc5ca44c9706cf72193772", - "sha256:130230b7e49825c98edf0b428b7aa1125503d91732735ef897786fe5452b1ec2", - "sha256:152fd8f730c31fd67fe0ffebe1df38ab6a669403da93df218801a893645c6ccc", - "sha256:1c71fdd8f1c0f84ffd58fca37d00ca4ebaa9e502fb49825484da075ac0b0b803", - "sha256:24847601071f740837aefb730e01bd169fbcaa610209779a78db7ebb6e6a7051", - "sha256:2e9ebb4425fc1b658e13bace354c48a933b842d53c458f02c86f371cecbedecc", - "sha256:30676ca45084ee61e9c3da589042c24a57592e375d4b138bd84d8709893a1ba4", - "sha256:31a55dccc8426e71817e3fe09b37d6d48ae40aae4ecbc8c7ad59d6893569c436", - "sha256:366a0cf0fc079af5204801786ad7a1c007714ee3909e364dbac1729f5b0849e5", - "sha256:38e2e577f0f092b8e6774459317c05a69935a1755ecfb621c0a98f0e3c09c9a5", - "sha256:3c184ad2433635f216645fdf0493011a4667e8d46b34082f5a3de702b6ec42e3", - "sha256:3caea6365b13119626ee996711ab63e0c9d7496f65641f4459c60a009a1f3e80", - "sha256:3e927b3868bd1e12acee7cc8f3747d815b4ab3e445a28d2e5373a7f4a6e76ba1", - "sha256:4ee3ee247f795a69e53cd91d927146fb16c4e803c7ac86c84104940c7d2cabf0", - "sha256:54d43960d809c4c12508a60b66cb936e7ed57d51fb5e30b513934a4a23874fae", - "sha256:57119b0116e3f408acbdccf9eb6ef19d7fe7baf0d1e9aaa5381489bc1aa56556", - "sha256:58569c491e7f7e874f11519ef46737cea1d6eda1b514e4eb5ac7dab6aa864d02", - "sha256:5a011cf354107b47c58ea932d13b04d93c6d1d69b8b6dce885e642531f847566", - 
"sha256:5caeacc68642e5f19d707471890f037a13007feba8427eb7f2a60811a1fc1350", - "sha256:5dd34c1ae752515318224cba7fc62b53130c45ac6a1040c8b7c1a223c46e8967", - "sha256:60835badb5ed5f4e194a6f21c09283dd6e007664a86101431bf870d9e86266c4", - "sha256:62398c80ef57589bdbe1eb8537127321c1abcfdf8c5f14f479dbbe27d0322e66", - "sha256:6381fa66866b0ea35e15d197fc06ac3840a9b2643a6475c8fff267db8b9f1e69", - "sha256:64757f6460fc55d7e16ed4f1de193f362104285c667c112b50a804d482777edd", - "sha256:69f8ff4db108815addd900a74df665e135dbbd6547a8a69333a68e1f6e368ac2", - "sha256:6c180d89a28787e4b73b07e9b0e2dac7741261dbdca95f2b489c4f8f887dd810", - "sha256:71b0bf0c30d432278793d2141362ac853859e87de0a7dee24a1cea35231f0d50", - "sha256:769eef00437edf115e24d87f8926955f00f7704bede656ce605097584f9966dc", - "sha256:7f6979d20ee5693a1057ab53e043adffa1e7418d734c1532e2d9e915b08d8ec2", - "sha256:87f4d8941a9564cda3f7fa6a6cd9b32ec575830780677932abdec7bcb61717b0", - "sha256:89ba9bb365446a22411f0673abf6ee1fea3b2cf47b37533b970904880ceb72f3", - "sha256:8acf74b5d383414401926c1598ed77825cd530ac7b463ebc2e4f46638f56cce6", - "sha256:9056c5310eb1daa33fc234ef39ebfb8c8e2533f088bbf0bc7350f70a29bde1ac", - "sha256:95c3acddf921944f241b6773b767f1cbce71d03307270e2d769fd584d5d1092d", - "sha256:9e20e5a1908e18aaa60d9077a6d8753090e3f85ca25da6e25d30dc0a9e84c2c6", - "sha256:a1e97b86f73715e8670ef45292d7cc033548266f07d54e2183ecb3c87598888f", - "sha256:a877ada905f7d69b2a31796c4b66e31a8068b37aa9b78832d41c82fc3e056ddd", - "sha256:a9d7587d2fdc820cc9177139b56795c39fb8560f540bba9ceea215f1f66e1566", - "sha256:abf298af1e7ad44eeb93501e40eb5a67abbf93b5d90e468d01fc0c4451971afa", - "sha256:ae90d5a8590e5310c32a7630b4b8618cef7563cebf649011da80874d0aa8f414", - "sha256:b6d0f9e1d39dbfb3977f9dd79f156c86eb03e57a7face96f199e02b18e58d32a", - "sha256:b8d587cc39057d0afd4166083d289bdeff221ac6d3ee5046aef2d480dc4b503c", - "sha256:c5210e5d5117e9aec8c47d9156d1d3835570dd909a899171b9535cb4a3f32693", - "sha256:cc331c13902d0f50845099434cd936d49d7a2ca76cb654b39691974cb1e4812d", 
- "sha256:ce41676b3d0dd16dbcfabcc1dc46090aaf4688fd6e819ef343dbda5a57ef0161", - "sha256:d8165a088d31798b59e91117d1f5fc3df8168d8b48c4acc10fc0df0d0bdbcc5e", - "sha256:e7281244c99fd7c6f27c1c6bfafba878517b0b62925a09b586d88ce750a016d2", - "sha256:e96a08b62bb8de960d3a6afbc5ed8421bf1a2d9c85cc4ea73f4bc81b4910500f", - "sha256:ed33433fc3820263a6368e532f19ddb4c5990855e4886088ad84fd7c4e561c71", - "sha256:efb8f6d08ca7998cf59eaf50c9d60717f29a1a0a09caa46460d33b2924839dbd", - "sha256:efe99298ba37e37787f6a2ea868265465410822f7bea163edcc1bd3903354ea9", - "sha256:f99e9486bf1bb979d95d5cffed40689cb595abb2b841f2991fc894b3452290e8", - "sha256:fc1464c97579da9f3ab16763c32e5c5d5bb5fa1ec7ce509a4ca6108b61b84fab", - "sha256:fd7dc0e6812b799a34f6d12fcb1000539098c249c8da54f3566c6a6461d0dbad" - ], - "markers": "python_version >= '3.8'", - "version": "==1.0.7" - }, - "cycler": { - "hashes": [ - "sha256:3a27e95f763a428a739d2add979fa7494c912a32c17c4c38c4d5f082cad165a3", - "sha256:9c87405839a19696e837b3b818fed3f5f69f16f1eec1a1ad77e043dcea9c772f" - ], - "markers": "python_version >= '3.6'", - "version": "==0.11.0" - }, - "fonttools": { - "hashes": [ - "sha256:2bb244009f9bf3fa100fc3ead6aeb99febe5985fa20afbfbaa2f8946c2fbdaf1", - "sha256:820466f43c8be8c3009aef8b87e785014133508f0de64ec469e4efb643ae54fb" - ], - "markers": "python_version >= '3.7'", - "version": "==4.38.0" - }, - "kiwisolver": { - "hashes": [ - "sha256:02f79693ec433cb4b5f51694e8477ae83b3205768a6fb48ffba60549080e295b", - "sha256:03baab2d6b4a54ddbb43bba1a3a2d1627e82d205c5cf8f4c924dc49284b87166", - "sha256:1041feb4cda8708ce73bb4dcb9ce1ccf49d553bf87c3954bdfa46f0c3f77252c", - "sha256:10ee06759482c78bdb864f4109886dff7b8a56529bc1609d4f1112b93fe6423c", - "sha256:1d1573129aa0fd901076e2bfb4275a35f5b7aa60fbfb984499d661ec950320b0", - "sha256:283dffbf061a4ec60391d51e6155e372a1f7a4f5b15d59c8505339454f8989e4", - "sha256:28bc5b299f48150b5f822ce68624e445040595a4ac3d59251703779836eceff9", - 
"sha256:2a66fdfb34e05b705620dd567f5a03f239a088d5a3f321e7b6ac3239d22aa286", - "sha256:2e307eb9bd99801f82789b44bb45e9f541961831c7311521b13a6c85afc09767", - "sha256:2e407cb4bd5a13984a6c2c0fe1845e4e41e96f183e5e5cd4d77a857d9693494c", - "sha256:2f5e60fabb7343a836360c4f0919b8cd0d6dbf08ad2ca6b9cf90bf0c76a3c4f6", - "sha256:36dafec3d6d6088d34e2de6b85f9d8e2324eb734162fba59d2ba9ed7a2043d5b", - "sha256:3fe20f63c9ecee44560d0e7f116b3a747a5d7203376abeea292ab3152334d004", - "sha256:41dae968a94b1ef1897cb322b39360a0812661dba7c682aa45098eb8e193dbdf", - "sha256:4bd472dbe5e136f96a4b18f295d159d7f26fd399136f5b17b08c4e5f498cd494", - "sha256:4ea39b0ccc4f5d803e3337dd46bcce60b702be4d86fd0b3d7531ef10fd99a1ac", - "sha256:5853eb494c71e267912275e5586fe281444eb5e722de4e131cddf9d442615626", - "sha256:5bce61af018b0cb2055e0e72e7d65290d822d3feee430b7b8203d8a855e78766", - "sha256:6295ecd49304dcf3bfbfa45d9a081c96509e95f4b9d0eb7ee4ec0530c4a96514", - "sha256:62ac9cc684da4cf1778d07a89bf5f81b35834cb96ca523d3a7fb32509380cbf6", - "sha256:70e7c2e7b750585569564e2e5ca9845acfaa5da56ac46df68414f29fea97be9f", - "sha256:7577c1987baa3adc4b3c62c33bd1118c3ef5c8ddef36f0f2c950ae0b199e100d", - "sha256:75facbe9606748f43428fc91a43edb46c7ff68889b91fa31f53b58894503a191", - "sha256:787518a6789009c159453da4d6b683f468ef7a65bbde796bcea803ccf191058d", - "sha256:78d6601aed50c74e0ef02f4204da1816147a6d3fbdc8b3872d263338a9052c51", - "sha256:7c43e1e1206cd421cd92e6b3280d4385d41d7166b3ed577ac20444b6995a445f", - "sha256:81e38381b782cc7e1e46c4e14cd997ee6040768101aefc8fa3c24a4cc58e98f8", - "sha256:841293b17ad704d70c578f1f0013c890e219952169ce8a24ebc063eecf775454", - "sha256:872b8ca05c40d309ed13eb2e582cab0c5a05e81e987ab9c521bf05ad1d5cf5cb", - "sha256:877272cf6b4b7e94c9614f9b10140e198d2186363728ed0f701c6eee1baec1da", - "sha256:8c808594c88a025d4e322d5bb549282c93c8e1ba71b790f539567932722d7bd8", - "sha256:8ed58b8acf29798b036d347791141767ccf65eee7f26bde03a71c944449e53de", - "sha256:91672bacaa030f92fc2f43b620d7b337fd9a5af28b0d6ed3f77afc43c4a64b5a", 
- "sha256:968f44fdbf6dd757d12920d63b566eeb4d5b395fd2d00d29d7ef00a00582aac9", - "sha256:9f85003f5dfa867e86d53fac6f7e6f30c045673fa27b603c397753bebadc3008", - "sha256:a553dadda40fef6bfa1456dc4be49b113aa92c2a9a9e8711e955618cd69622e3", - "sha256:a68b62a02953b9841730db7797422f983935aeefceb1679f0fc85cbfbd311c32", - "sha256:abbe9fa13da955feb8202e215c4018f4bb57469b1b78c7a4c5c7b93001699938", - "sha256:ad881edc7ccb9d65b0224f4e4d05a1e85cf62d73aab798943df6d48ab0cd79a1", - "sha256:b1792d939ec70abe76f5054d3f36ed5656021dcad1322d1cc996d4e54165cef9", - "sha256:b428ef021242344340460fa4c9185d0b1f66fbdbfecc6c63eff4b7c29fad429d", - "sha256:b533558eae785e33e8c148a8d9921692a9fe5aa516efbdff8606e7d87b9d5824", - "sha256:ba59c92039ec0a66103b1d5fe588fa546373587a7d68f5c96f743c3396afc04b", - "sha256:bc8d3bd6c72b2dd9decf16ce70e20abcb3274ba01b4e1c96031e0c4067d1e7cd", - "sha256:bc9db8a3efb3e403e4ecc6cd9489ea2bac94244f80c78e27c31dcc00d2790ac2", - "sha256:bf7d9fce9bcc4752ca4a1b80aabd38f6d19009ea5cbda0e0856983cf6d0023f5", - "sha256:c2dbb44c3f7e6c4d3487b31037b1bdbf424d97687c1747ce4ff2895795c9bf69", - "sha256:c79ebe8f3676a4c6630fd3f777f3cfecf9289666c84e775a67d1d358578dc2e3", - "sha256:c97528e64cb9ebeff9701e7938653a9951922f2a38bd847787d4a8e498cc83ae", - "sha256:d0611a0a2a518464c05ddd5a3a1a0e856ccc10e67079bb17f265ad19ab3c7597", - "sha256:d06adcfa62a4431d404c31216f0f8ac97397d799cd53800e9d3efc2fbb3cf14e", - "sha256:d41997519fcba4a1e46eb4a2fe31bc12f0ff957b2b81bac28db24744f333e955", - "sha256:d5b61785a9ce44e5a4b880272baa7cf6c8f48a5180c3e81c59553ba0cb0821ca", - "sha256:da152d8cdcab0e56e4f45eb08b9aea6455845ec83172092f09b0e077ece2cf7a", - "sha256:da7e547706e69e45d95e116e6939488d62174e033b763ab1496b4c29b76fabea", - "sha256:db5283d90da4174865d520e7366801a93777201e91e79bacbac6e6927cbceede", - "sha256:db608a6757adabb32f1cfe6066e39b3706d8c3aa69bbc353a5b61edad36a5cb4", - "sha256:e0ea21f66820452a3f5d1655f8704a60d66ba1191359b96541eaf457710a5fc6", - 
"sha256:e7da3fec7408813a7cebc9e4ec55afed2d0fd65c4754bc376bf03498d4e92686", - "sha256:e92a513161077b53447160b9bd8f522edfbed4bd9759e4c18ab05d7ef7e49408", - "sha256:ecb1fa0db7bf4cff9dac752abb19505a233c7f16684c5826d1f11ebd9472b871", - "sha256:efda5fc8cc1c61e4f639b8067d118e742b812c930f708e6667a5ce0d13499e29", - "sha256:f0a1dbdb5ecbef0d34eb77e56fcb3e95bbd7e50835d9782a45df81cc46949750", - "sha256:f0a71d85ecdd570ded8ac3d1c0f480842f49a40beb423bb8014539a9f32a5897", - "sha256:f4f270de01dd3e129a72efad823da90cc4d6aafb64c410c9033aba70db9f1ff0", - "sha256:f6cb459eea32a4e2cf18ba5fcece2dbdf496384413bc1bae15583f19e567f3b2", - "sha256:f8ad8285b01b0d4695102546b342b493b3ccc6781fc28c8c6a1bb63e95d22f09", - "sha256:f9f39e2f049db33a908319cf46624a569b36983c7c78318e9726a4cb8923b26c" - ], - "markers": "python_version >= '3.7'", - "version": "==1.4.4" - }, - "matplotlib": { - "hashes": [ - "sha256:01b7f521a9a73c383825813af255f8c4485d1706e4f3e2ed5ae771e4403a40ab", - "sha256:11011c97d62c1db7bc20509572557842dbb8c2a2ddd3dd7f20501aa1cde3e54e", - "sha256:1183877d008c752d7d535396096c910f4663e4b74a18313adee1213328388e1e", - "sha256:12f999661589981e74d793ee2f41b924b3b87d65fd929f6153bf0f30675c59b1", - "sha256:1c235bf9be052347373f589e018988cad177abb3f997ab1a2e2210c41562cc0c", - "sha256:1f4d69707b1677560cd952544ee4962f68ff07952fb9069ff8c12b56353cb8c9", - "sha256:1fcc4cad498533d3c393a160975acc9b36ffa224d15a6b90ae579eacee5d8579", - "sha256:2787a16df07370dcba385fe20cdd0cc3cfaabd3c873ddabca78c10514c799721", - "sha256:29f17b7f2e068dc346687cbdf80b430580bab42346625821c2d3abf3a1ec5417", - "sha256:38d38cb1ea1d80ee0f6351b65c6f76cad6060bbbead015720ba001348ae90f0c", - "sha256:3f56a7252eee8f3438447f75f5e1148a1896a2756a92285fe5d73bed6deebff4", - "sha256:5223affa21050fb6118353c1380c15e23aedfb436bf3e162c26dc950617a7519", - "sha256:57ad1aee29043163374bfa8990e1a2a10ff72c9a1bfaa92e9c46f6ea59269121", - "sha256:59400cc9451094b7f08cc3f321972e6e1db4cd37a978d4e8a12824bf7fd2f03b", - 
"sha256:68d94a436f62b8a861bf3ace82067a71bafb724b4e4f9133521e4d8012420dd7", - "sha256:6adc441b5b2098a4b904bbf9d9e92fb816fef50c55aa2ea6a823fc89b94bb838", - "sha256:6d81b11ede69e3a751424b98dc869c96c10256b2206bfdf41f9c720eee86844c", - "sha256:73b93af33634ed919e72811c9703e1105185cd3fb46d76f30b7f4cfbbd063f89", - "sha256:77b384cee7ab8cf75ffccbfea351a09b97564fc62d149827a5e864bec81526e5", - "sha256:79e501eb847f4a489eb7065bb8d3187117f65a4c02d12ea3a19d6c5bef173bcc", - "sha256:809119d1cba3ece3c9742eb01827fe7a0e781ea3c5d89534655a75e07979344f", - "sha256:80c166a0e28512e26755f69040e6bf2f946a02ffdb7c00bf6158cca3d2b146e6", - "sha256:81b409b2790cf8d7c1ef35920f01676d2ae7afa8241844e7aa5484fdf493a9a0", - "sha256:994637e2995b0342699b396a320698b07cd148bbcf2dd2fa2daba73f34dd19f2", - "sha256:9ceebaf73f1a3444fa11014f38b9da37ff7ea328d6efa1652241fe3777bfdab9", - "sha256:9fb8fb19d03abf3c5dab89a8677e62c4023632f919a62b6dd1d6d2dbf42cd9f5", - "sha256:acc3b1a4bddbf56fe461e36fb9ef94c2cb607fc90d24ccc650040bfcc7610de4", - "sha256:bbddfeb1495484351fb5b30cf5bdf06b3de0bc4626a707d29e43dfd61af2a780", - "sha256:bbf269e1d24bc25247095d71c7a969813f7080e2a7c6fa28931a603f747ab012", - "sha256:bebcff4c3ed02c6399d47329f3554193abd824d3d53b5ca02cf583bcd94470e2", - "sha256:c3f08df2ac4636249b8bc7a85b8b82c983bef1441595936f62c2918370ca7e1d", - "sha256:ca94f0362f6b6f424b555b956971dcb94b12d0368a6c3e07dc7a40d32d6d873d", - "sha256:d00c248ab6b92bea3f8148714837937053a083ff03b4c5e30ed37e28fc0e7e56", - "sha256:d2cfaa7fd62294d945b8843ea24228a27c8e7c5b48fa634f3c168153b825a21b", - "sha256:d5f18430f5cfa5571ab8f4c72c89af52aa0618e864c60028f11a857d62200cba", - "sha256:debeab8e2ab07e5e3dac33e12456da79c7e104270d2b2d1df92b9e40347cca75", - "sha256:dfba7057609ca9567b9704626756f0142e97ec8c5ba2c70c6e7bd1c25ef99f06", - "sha256:e0a64d7cc336b52e90f59e6d638ae847b966f68582a7af041e063d568e814740", - "sha256:eb9421c403ffd387fbe729de6d9a03005bf42faba5e8432f4e51e703215b49fc", - "sha256:faff486b36530a836a6b4395850322e74211cd81fc17f28b4904e1bd53668e3e", 
- "sha256:ff2aa84e74f80891e6bcf292ebb1dd57714ffbe13177642d65fee25384a30894" - ], - "index": "pypi", - "version": "==3.6.3" - }, - "numpy": { - "hashes": [ - "sha256:003a9f530e880cb2cd177cba1af7220b9aa42def9c4afc2a2fc3ee6be7eb2b22", - "sha256:150947adbdfeceec4e5926d956a06865c1c690f2fd902efede4ca6fe2e657c3f", - "sha256:2620e8592136e073bd12ee4536149380695fbe9ebeae845b81237f986479ffc9", - "sha256:2eabd64ddb96a1239791da78fa5f4e1693ae2dadc82a76bc76a14cbb2b966e96", - "sha256:4173bde9fa2a005c2c6e2ea8ac1618e2ed2c1c6ec8a7657237854d42094123a0", - "sha256:4199e7cfc307a778f72d293372736223e39ec9ac096ff0a2e64853b866a8e18a", - "sha256:4cecaed30dc14123020f77b03601559fff3e6cd0c048f8b5289f4eeabb0eb281", - "sha256:557d42778a6869c2162deb40ad82612645e21d79e11c1dc62c6e82a2220ffb04", - "sha256:63e45511ee4d9d976637d11e6c9864eae50e12dc9598f531c035265991910468", - "sha256:6524630f71631be2dabe0c541e7675db82651eb998496bbe16bc4f77f0772253", - "sha256:76807b4063f0002c8532cfeac47a3068a69561e9c8715efdad3c642eb27c0756", - "sha256:7de8fdde0003f4294655aa5d5f0a89c26b9f22c0a58790c38fae1ed392d44a5a", - "sha256:889b2cc88b837d86eda1b17008ebeb679d82875022200c6e8e4ce6cf549b7acb", - "sha256:92011118955724465fb6853def593cf397b4a1367495e0b59a7e69d40c4eb71d", - "sha256:97cf27e51fa078078c649a51d7ade3c92d9e709ba2bfb97493007103c741f1d0", - "sha256:9a23f8440561a633204a67fb44617ce2a299beecf3295f0d13c495518908e910", - "sha256:a51725a815a6188c662fb66fb32077709a9ca38053f0274640293a14fdd22978", - "sha256:a77d3e1163a7770164404607b7ba3967fb49b24782a6ef85d9b5f54126cc39e5", - "sha256:adbdce121896fd3a17a77ab0b0b5eedf05a9834a18699db6829a64e1dfccca7f", - "sha256:c29e6bd0ec49a44d7690ecb623a8eac5ab8a923bce0bea6293953992edf3a76a", - "sha256:c72a6b2f4af1adfe193f7beb91ddf708ff867a3f977ef2ec53c0ffb8283ab9f5", - "sha256:d0a2db9d20117bf523dde15858398e7c0858aadca7c0f088ac0d6edd360e9ad2", - "sha256:e3ab5d32784e843fc0dd3ab6dcafc67ef806e6b6828dc6af2f689be0eb4d781d", - 
"sha256:e428c4fbfa085f947b536706a2fc349245d7baa8334f0c5723c56a10595f9b95", - "sha256:e8d2859428712785e8a8b7d2b3ef0a1d1565892367b32f915c4a4df44d0e64f5", - "sha256:eef70b4fc1e872ebddc38cddacc87c19a3709c0e3e5d20bf3954c147b1dd941d", - "sha256:f64bb98ac59b3ea3bf74b02f13836eb2e24e48e0ab0145bbda646295769bd780", - "sha256:f9006288bcf4895917d02583cf3411f98631275bc67cce355a7f39f8c14338fa" - ], - "markers": "python_version >= '3.8'", - "version": "==1.24.2" - }, - "packaging": { - "hashes": [ - "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2", - "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97" - ], - "markers": "python_version >= '3.7'", - "version": "==23.0" - }, - "pandas": { - "hashes": [ - "sha256:14e45300521902689a81f3f41386dc86f19b8ba8dd5ac5a3c7010ef8d2932813", - "sha256:26d9c71772c7afb9d5046e6e9cf42d83dd147b5cf5bcb9d97252077118543792", - "sha256:3749077d86e3a2f0ed51367f30bf5b82e131cc0f14260c4d3e499186fccc4406", - "sha256:41179ce559943d83a9b4bbacb736b04c928b095b5f25dd2b7389eda08f46f373", - "sha256:478ff646ca42b20376e4ed3fa2e8d7341e8a63105586efe54fa2508ee087f328", - "sha256:50869a35cbb0f2e0cd5ec04b191e7b12ed688874bd05dd777c19b28cbea90996", - "sha256:565fa34a5434d38e9d250af3c12ff931abaf88050551d9fbcdfafca50d62babf", - "sha256:5f2b952406a1588ad4cad5b3f55f520e82e902388a6d5a4a91baa8d38d23c7f6", - "sha256:5fbcb19d6fceb9e946b3e23258757c7b225ba450990d9ed63ccceeb8cae609f7", - "sha256:6973549c01ca91ec96199e940495219c887ea815b2083722821f1d7abfa2b4dc", - "sha256:74a3fd7e5a7ec052f183273dc7b0acd3a863edf7520f5d3a1765c04ffdb3b0b1", - "sha256:7a0a56cef15fd1586726dace5616db75ebcfec9179a3a55e78f72c5639fa2a23", - "sha256:7cec0bee9f294e5de5bbfc14d0573f65526071029d036b753ee6507d2a21480a", - "sha256:87bd9c03da1ac870a6d2c8902a0e1fd4267ca00f13bc494c9e5a9020920e1d51", - "sha256:972d8a45395f2a2d26733eb8d0f629b2f90bebe8e8eddbb8829b180c09639572", - "sha256:9842b6f4b8479e41968eced654487258ed81df7d1c9b7b870ceea24ed9459b31", - 
"sha256:9f69c4029613de47816b1bb30ff5ac778686688751a5e9c99ad8c7031f6508e5", - "sha256:a50d9a4336a9621cab7b8eb3fb11adb82de58f9b91d84c2cd526576b881a0c5a", - "sha256:bc4c368f42b551bf72fac35c5128963a171b40dce866fb066540eeaf46faa003", - "sha256:c39a8da13cede5adcd3be1182883aea1c925476f4e84b2807a46e2775306305d", - "sha256:c3ac844a0fe00bfaeb2c9b51ab1424e5c8744f89860b138434a363b1f620f354", - "sha256:c4c00e0b0597c8e4f59e8d461f797e5d70b4d025880516a8261b2817c47759ee", - "sha256:c74a62747864ed568f5a82a49a23a8d7fe171d0c69038b38cedf0976831296fa", - "sha256:dd05f7783b3274aa206a1af06f0ceed3f9b412cf665b7247eacd83be41cf7bf0", - "sha256:dfd681c5dc216037e0b0a2c821f5ed99ba9f03ebcf119c7dac0e9a7b960b9ec9", - "sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae", - "sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc" - ], - "markers": "python_version >= '3.8'", - "version": "==1.5.3" - }, - "pillow": { - "hashes": [ - "sha256:013016af6b3a12a2f40b704677f8b51f72cb007dac785a9933d5c86a72a7fe33", - "sha256:0845adc64fe9886db00f5ab68c4a8cd933ab749a87747555cec1c95acea64b0b", - "sha256:0884ba7b515163a1a05440a138adeb722b8a6ae2c2b33aea93ea3118dd3a899e", - "sha256:09b89ddc95c248ee788328528e6a2996e09eaccddeeb82a5356e92645733be35", - "sha256:0dd4c681b82214b36273c18ca7ee87065a50e013112eea7d78c7a1b89a739153", - "sha256:0e51f608da093e5d9038c592b5b575cadc12fd748af1479b5e858045fff955a9", - "sha256:0f3269304c1a7ce82f1759c12ce731ef9b6e95b6df829dccd9fe42912cc48569", - "sha256:16a8df99701f9095bea8a6c4b3197da105df6f74e6176c5b410bc2df2fd29a57", - "sha256:19005a8e58b7c1796bc0167862b1f54a64d3b44ee5d48152b06bb861458bc0f8", - "sha256:1b4b4e9dda4f4e4c4e6896f93e84a8f0bcca3b059de9ddf67dac3c334b1195e1", - "sha256:28676836c7796805914b76b1837a40f76827ee0d5398f72f7dcc634bae7c6264", - "sha256:2968c58feca624bb6c8502f9564dd187d0e1389964898f5e9e1fbc8533169157", - "sha256:3f4cc516e0b264c8d4ccd6b6cbc69a07c6d582d8337df79be1e15a5056b258c9", - 
"sha256:3fa1284762aacca6dc97474ee9c16f83990b8eeb6697f2ba17140d54b453e133", - "sha256:43521ce2c4b865d385e78579a082b6ad1166ebed2b1a2293c3be1d68dd7ca3b9", - "sha256:451f10ef963918e65b8869e17d67db5e2f4ab40e716ee6ce7129b0cde2876eab", - "sha256:46c259e87199041583658457372a183636ae8cd56dbf3f0755e0f376a7f9d0e6", - "sha256:46f39cab8bbf4a384ba7cb0bc8bae7b7062b6a11cfac1ca4bc144dea90d4a9f5", - "sha256:519e14e2c49fcf7616d6d2cfc5c70adae95682ae20f0395e9280db85e8d6c4df", - "sha256:53dcb50fbdc3fb2c55431a9b30caeb2f7027fcd2aeb501459464f0214200a503", - "sha256:54614444887e0d3043557d9dbc697dbb16cfb5a35d672b7a0fcc1ed0cf1c600b", - "sha256:575d8912dca808edd9acd6f7795199332696d3469665ef26163cd090fa1f8bfa", - "sha256:5dd5a9c3091a0f414a963d427f920368e2b6a4c2f7527fdd82cde8ef0bc7a327", - "sha256:5f532a2ad4d174eb73494e7397988e22bf427f91acc8e6ebf5bb10597b49c493", - "sha256:60e7da3a3ad1812c128750fc1bc14a7ceeb8d29f77e0a2356a8fb2aa8925287d", - "sha256:653d7fb2df65efefbcbf81ef5fe5e5be931f1ee4332c2893ca638c9b11a409c4", - "sha256:6663977496d616b618b6cfa43ec86e479ee62b942e1da76a2c3daa1c75933ef4", - "sha256:6abfb51a82e919e3933eb137e17c4ae9c0475a25508ea88993bb59faf82f3b35", - "sha256:6c6b1389ed66cdd174d040105123a5a1bc91d0aa7059c7261d20e583b6d8cbd2", - "sha256:6d9dfb9959a3b0039ee06c1a1a90dc23bac3b430842dcb97908ddde05870601c", - "sha256:765cb54c0b8724a7c12c55146ae4647e0274a839fb6de7bcba841e04298e1011", - "sha256:7a21222644ab69ddd9967cfe6f2bb420b460dae4289c9d40ff9a4896e7c35c9a", - "sha256:7ac7594397698f77bce84382929747130765f66406dc2cd8b4ab4da68ade4c6e", - "sha256:7cfc287da09f9d2a7ec146ee4d72d6ea1342e770d975e49a8621bf54eaa8f30f", - "sha256:83125753a60cfc8c412de5896d10a0a405e0bd88d0470ad82e0869ddf0cb3848", - "sha256:847b114580c5cc9ebaf216dd8c8dbc6b00a3b7ab0131e173d7120e6deade1f57", - "sha256:87708d78a14d56a990fbf4f9cb350b7d89ee8988705e58e39bdf4d82c149210f", - "sha256:8a2b5874d17e72dfb80d917213abd55d7e1ed2479f38f001f264f7ce7bae757c", - "sha256:8f127e7b028900421cad64f51f75c051b628db17fb00e099eb148761eed598c9", 
- "sha256:94cdff45173b1919350601f82d61365e792895e3c3a3443cf99819e6fbf717a5", - "sha256:99d92d148dd03fd19d16175b6d355cc1b01faf80dae93c6c3eb4163709edc0a9", - "sha256:9a3049a10261d7f2b6514d35bbb7a4dfc3ece4c4de14ef5876c4b7a23a0e566d", - "sha256:9d9a62576b68cd90f7075876f4e8444487db5eeea0e4df3ba298ee38a8d067b0", - "sha256:9e5f94742033898bfe84c93c831a6f552bb629448d4072dd312306bab3bd96f1", - "sha256:a1c2d7780448eb93fbcc3789bf3916aa5720d942e37945f4056680317f1cd23e", - "sha256:a2e0f87144fcbbe54297cae708c5e7f9da21a4646523456b00cc956bd4c65815", - "sha256:a4dfdae195335abb4e89cc9762b2edc524f3c6e80d647a9a81bf81e17e3fb6f0", - "sha256:a96e6e23f2b79433390273eaf8cc94fec9c6370842e577ab10dabdcc7ea0a66b", - "sha256:aabdab8ec1e7ca7f1434d042bf8b1e92056245fb179790dc97ed040361f16bfd", - "sha256:b222090c455d6d1a64e6b7bb5f4035c4dff479e22455c9eaa1bdd4c75b52c80c", - "sha256:b52ff4f4e002f828ea6483faf4c4e8deea8d743cf801b74910243c58acc6eda3", - "sha256:b70756ec9417c34e097f987b4d8c510975216ad26ba6e57ccb53bc758f490dab", - "sha256:b8c2f6eb0df979ee99433d8b3f6d193d9590f735cf12274c108bd954e30ca858", - "sha256:b9b752ab91e78234941e44abdecc07f1f0d8f51fb62941d32995b8161f68cfe5", - "sha256:ba6612b6548220ff5e9df85261bddc811a057b0b465a1226b39bfb8550616aee", - "sha256:bd752c5ff1b4a870b7661234694f24b1d2b9076b8bf337321a814c612665f343", - "sha256:c3c4ed2ff6760e98d262e0cc9c9a7f7b8a9f61aa4d47c58835cdaf7b0b8811bb", - "sha256:c5c1362c14aee73f50143d74389b2c158707b4abce2cb055b7ad37ce60738d47", - "sha256:cb362e3b0976dc994857391b776ddaa8c13c28a16f80ac6522c23d5257156bed", - "sha256:d197df5489004db87d90b918033edbeee0bd6df3848a204bca3ff0a903bef837", - "sha256:d3b56206244dc8711f7e8b7d6cad4663917cd5b2d950799425076681e8766286", - "sha256:d5b2f8a31bd43e0f18172d8ac82347c8f37ef3e0b414431157718aa234991b28", - "sha256:d7081c084ceb58278dd3cf81f836bc818978c0ccc770cbbb202125ddabec6628", - "sha256:db74f5562c09953b2c5f8ec4b7dfd3f5421f31811e97d1dbc0a7c93d6e3a24df", - 
"sha256:df41112ccce5d47770a0c13651479fbcd8793f34232a2dd9faeccb75eb5d0d0d", - "sha256:e1339790c083c5a4de48f688b4841f18df839eb3c9584a770cbd818b33e26d5d", - "sha256:e621b0246192d3b9cb1dc62c78cfa4c6f6d2ddc0ec207d43c0dedecb914f152a", - "sha256:e8c5cf126889a4de385c02a2c3d3aba4b00f70234bfddae82a5eaa3ee6d5e3e6", - "sha256:e9d7747847c53a16a729b6ee5e737cf170f7a16611c143d95aa60a109a59c336", - "sha256:eaef5d2de3c7e9b21f1e762f289d17b726c2239a42b11e25446abf82b26ac132", - "sha256:ed3e4b4e1e6de75fdc16d3259098de7c6571b1a6cc863b1a49e7d3d53e036070", - "sha256:ef21af928e807f10bf4141cad4746eee692a0dd3ff56cfb25fce076ec3cc8abe", - "sha256:f09598b416ba39a8f489c124447b007fe865f786a89dbfa48bb5cf395693132a", - "sha256:f0caf4a5dcf610d96c3bd32932bfac8aee61c96e60481c2a0ea58da435e25acd", - "sha256:f6e78171be3fb7941f9910ea15b4b14ec27725865a73c15277bc39f5ca4f8391", - "sha256:f715c32e774a60a337b2bb8ad9839b4abf75b267a0f18806f6f4f5f1688c4b5a", - "sha256:fb5c1ad6bad98c57482236a21bf985ab0ef42bd51f7ad4e4538e89a997624e12" - ], - "markers": "python_version >= '3.7'", - "version": "==9.4.0" - }, - "pyparsing": { - "hashes": [ - "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb", - "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc" - ], - "markers": "python_full_version >= '3.6.8'", - "version": "==3.0.9" - }, - "python-dateutil": { - "hashes": [ - "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", - "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", - "version": "==2.8.2" - }, - "pytz": { - "hashes": [ - "sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0", - "sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a" - ], - "version": "==2022.7.1" - }, - "seaborn": { - "hashes": [ - "sha256:374645f36509d0dcab895cba5b47daf0586f77bfe3b36c97c607db7da5be0139", - 
"sha256:ebf15355a4dba46037dfd65b7350f014ceb1f13c05e814eda2c9f5fd731afc08" - ], - "index": "pypi", - "version": "==0.12.2" - }, - "six": { - "hashes": [ - "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", - "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", - "version": "==1.16.0" - } - }, - "develop": {} -} diff --git a/README.md b/README.md index 2cbd3ad..b0e3f25 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,12 @@ # TikTok hashtag analysis toolset -> IMPORTANT NOTE: this tool relies on [drawrowfly/tiktok-scraper](https://github.com/drawrowfly/tiktok-scraper) which seems to be broken at time of writing and without updates for some time with several open issues ([796](https://github.com/drawrowfly/tiktok-scraper/issues/796) [#799](https://github.com/drawrowfly/tiktok-scraper/issues/799)) that need to be fixed before this library can work smoothly :/ - -The tool helps to download posts and videos from TikTok for a given set of hashtags over a period of time. Users can create a growing database of posts for specific hashtags which can then be used for further hashtag analysis. It uses the [tiktok-scraper](https://github.com/drawrowfly/tiktok-scraper) Node package to download the posts and videos. +The tool helps to download posts and videos from TikTok for a given set of hashtags over a period of time. Users can create a growing database of posts for specific hashtags which can then be used for further hashtag analysis. It uses the [TikTokApi](https://github.com/davidteather/TikTok-Api) Python package to download the posts and uses [yt-dlp](https://github.com/yt-dlp/yt-dlp) to download the videos. [![PyPI version](https://badge.fury.io/py/tiktok-hashtag-analysis.svg)](https://badge.fury.io/py/tiktok-hashtag-analysis) ## Pre-requisites -1. Make sure you have Python 3.6 or a later version installed -2. 
And, you need to have node version 16. On Mac, do `brew install node` followed by `npm install -g n` and then `n 16` -4. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper -5. Install the tool with pip: `pip install tiktok-hashtag-analysis` +1. Make sure you have Python 3.9 or a later version installed +2. Install the tool with pip: `pip install tiktok-hashtag-analysis` 1. or directly from the repo version: `pip install git+https://github.com/bellingcat/tiktok-hashtag-analysis` You should now be ready to start using it. @@ -19,27 +15,23 @@ You should now be ready to start using it. ## About the tool ### Command-line arguments ``` -tiktok-hashtag-analysis --help -usage: tiktok-hashtag-analysis [-h] [-t [T ...]] [-f F] [-p] [-v] [-ht HASHTAG] [-n NUMBER] [-plt] [-d] {download,frequencies} +usage: tiktok-hashtag-analysis [-h] [--file FILE] [-d] [--number NUMBER] [-p] [-t] [--output-dir OUTPUT_DIR] [--log LOG] [hashtags ...] Analyze hashtags within posts scraped from TikTok. positional arguments: - {download,frequencies} - command to initialize + hashtags List of hashtags to scrape -options: +optional arguments: -h, --help show this help message and exit - -t [T ...] 
List of hashtags to scrape (module: run_downloader) - -f F File name containing list of hashtags to scrape (module: run_downloader) - -p Download post data (module: run_downloader) - -v Download video files (module: run_downloader) - -ht HASHTAG, --hashtag HASHTAG - The hashtag of scraped posts to analyze (module: hashtag_frequencies) - -n NUMBER, --number NUMBER - The number of top n occurrences (module: hashtag_frequencies) - -plt, --plot Plot the occurrences (module: hashtag_frequencies) - -d, --print List top n hashtags (module: hashtag_frequencies) + --file FILE File name containing list of hashtags to scrape + -d, --download Download video files corresponding to scraped posts + --number NUMBER The number of co-occurring hashtags to analyze + -p, --plot Plot the most common co-occurring hashtags + -t, --table Print a table of the most common co-occurring hashtags + --output-dir OUTPUT_DIR + Directory to save scraped data and visualizations to + --log LOG File to write logs to ``` ### Structure of output data @@ -67,9 +59,9 @@ The `data` folder contains all the downloaded data as shown in the tree diagram ## How to use ### Post downloading -Running the `tiktok-hashtag-analysis download` command with the following options will scrape posts containing the hashtags `#london`, `#paris`, or `#newyork`: +Running the `tiktok-hashtag-analysis` command with the following options will scrape posts containing the hashtags `#london`, `#paris`, or `#newyork`: - tiktok-hashtag-analysis download -t london paris newyork -p + tiktok-hashtag-analysis london paris newyork and will produce an output similar to the following log: @@ -100,7 +92,7 @@ Assume we want to analyze the 20 most frequently occurring hashtags in the downl - The results can be plotted and saved as a PNG file by executing the following command: - `tiktok-hashtag-analysis frequencies london 20 -p` + `tiktok-hashtag-analysis frequencies --hashtag london --number 20 --plot` which will produce a figure similar to 
that shown below:

@@ -111,7 +103,7 @@ Assume we want to analyze the 20 most frequently occurring hashtags in the downl - The results can be displayed in tabular form by executing the following command: - `tiktok-hashtag-analysis frequencies london 20 -d` + `tiktok-hashtag-analysis london --number 20 --table` which will produce a terminal output similar to the following: ``` diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 9a8d369..0000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -matplotlib -seaborn \ No newline at end of file diff --git a/scripts/release.sh b/scripts/release.sh index 6789652..c96718c 100644 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -3,7 +3,7 @@ set -e -TAG=$(python -c 'from tiktok_hashtag_analysis.version import __version__; print("v" + __version__)') +TAG=$(python -c 'from tiktok_hashtag_analysis import __version__; print("v" + __version__)') read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt diff --git a/setup.py b/setup.py index 8a347af..bd6119e 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ -from setuptools import setup, find_packages -from tiktok_hashtag_analysis.version import __version__ +from setuptools import setup +from tiktok_hashtag_analysis import __version__ with open("README.md", "r", encoding="utf-8") as file: long_description = file.read() @@ -10,23 +10,18 @@ setup( author="Bellingcat", author_email="tech@bellingcat.com", packages=["tiktok_hashtag_analysis"], - package_data={ - "tiktok_hashtag_analysis": [ - "logging.config", - ] - }, description="Analyze hashtags within posts scraped from TikTok", long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/bellingcat/tiktok-hashtag-analysis", license="MIT License", - install_requires=["seaborn", "matplotlib"], + install_requires=["seaborn", "matplotlib", "TikTokApi", "requests", "yt-dlp"], classifiers=[ - 'Development Status :: 5 - 
Production/Stable', - 'Intended Audience :: Information Technology', - 'License :: OSI Approved :: MIT License', - 'Natural Language :: English', - 'Programming Language :: Python :: 3' + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Information Technology", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Programming Language :: Python :: 3", ], entry_points={ "console_scripts": [ diff --git a/tiktok_hashtag_analysis/__init__.py b/tiktok_hashtag_analysis/__init__.py index e69de29..8c0d5d5 100644 --- a/tiktok_hashtag_analysis/__init__.py +++ b/tiktok_hashtag_analysis/__init__.py @@ -0,0 +1 @@ +__version__ = "2.0.0" diff --git a/tiktok_hashtag_analysis/__main__.py b/tiktok_hashtag_analysis/__main__.py index c5b1525..8e7dce2 100644 --- a/tiktok_hashtag_analysis/__main__.py +++ b/tiktok_hashtag_analysis/__main__.py @@ -1,76 +1,91 @@ -import logging, argparse -from .file_methods import log_writer -from .run_downloader import * # Import everything from run_downloader.py -from .hashtag_frequencies import * # Import everything from hashtag_frequencies.py +import logging +import argparse +from pathlib import Path +import sys -logger = logging.getLogger() +from .base import TikTokDownloader, load_hashtags_from_file -def create_parser() -> argparse.ArgumentParser: - """Create the parser and the arguments for the user input.""" - parser = argparse.ArgumentParser(description="Analyze hashtags within posts scraped from TikTok.") - parser.add_argument("command", help="command to initialize", choices=['download', 'frequencies']) - parser.add_argument("-t", type=str, nargs="*", help="List of hashtags to scrape (module: run_downloader)") - parser.add_argument("-f", type=str, help="File name containing list of hashtags to scrape (module: run_downloader)") - parser.add_argument("-p", action="store_true", help="Download post data (module: run_downloader)") - parser.add_argument("-v", action="store_true", help="Download video 
files (module: run_downloader)") - parser.add_argument("-ht", "--hashtag", type=str, - help="The hashtag of scraped posts to analyze (module: hashtag_frequencies)", ) - parser.add_argument("-n", "--number", type=int, help="The number of top n occurrences (module: hashtag_frequencies)") - parser.add_argument("-plt", "--plot", help="Plot the occurrences (module: hashtag_frequencies)", action="store_true") - parser.add_argument("-d", "--print", help="List top n hashtags (module: hashtag_frequencies)", action="store_true") +def create_parser(): + parser = argparse.ArgumentParser( + description="Analyze hashtags within posts scraped from TikTok." + ) + + parser.add_argument( + "hashtags", + type=str, + nargs="*", + help="List of hashtags to scrape", + ) + parser.add_argument( + "--file", + type=str, + help="File name containing list of hashtags to scrape", + ) + parser.add_argument( + "-d", + "--download", + action="store_true", + help="Download video files corresponding to scraped posts", + ) + parser.add_argument( + "--number", + type=int, + help="The number of co-occurring hashtags to analyze", + default=20, + ) + parser.add_argument( + "-p", + "--plot", + help="Plot the most common co-occurring hashtags", + action="store_true", + ) + parser.add_argument( + "-t", + "--table", + help="Print a table of the most common co-occurring hashtags", + action="store_true", + ) + parser.add_argument( + "--output-dir", + type=str, + help="Directory to save scraped data and visualizations to", + default=Path(".").resolve().parent / "data", + ) + parser.add_argument("--log", type=str, help="File to write logs to", default=None) + return parser def main(): parser = create_parser() args = parser.parse_args() - if args.command == "download": - if not (args.t or args.f): - parser.error( - "No hashtags were given, please use either the `-t` flag or the `-f` flag to specify one or more hashtags.") - if not (args.p or args.v): + logging.basicConfig( + level=logging.INFO, + 
filename=args.log, + format="%(asctime)s %(levelname)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + if len(args.hashtags) == 0: + if not args.file: parser.error( - "No argument given, please specify either the `-p` flag to download post data or the `-v` flag to download video files, or both." + "No hashtags were specified, please specify one or more hashtags " + "to scrape or use the `--file` flag to specify a text file containing " + "hashtags." ) - - if args.t: - hashtags = args.t - elif args.f: - file_name = args.f - hashtags = get_hashtag_list(file_name) - - logger.info(f"Hashtags to scrape: {hashtags}") - if not hashtags: - raise ValueError( - "No hashtags were specified: please use either the `-t` flag to specify a sspace-separated list of one or more hashtags as a command-line argument, or use the `-f` flag to specify a text file of newline-separated hashtags.") - - download_data_type = {"posts": args.p, "videos": args.v} - - scraped_summary_list = get_data(hashtags, download_data_type) - if scraped_summary_list: - log_writer(scraped_summary_list) - elif args.command == "frequencies": - img_folder = IMAGES - check_file(img_folder, "dir") - if args.n < 1: - raise ValueError( - f"Specified argument `n` (the number of hashtags to analyze) must be greater than zero, not: {args.n}.") - input_file = data_file = os.path.join( - FILES["data"], args.hashtag, FILES["posts"], FILES["data_file"] - ) - if not check_existence(input_file, "file"): - raise FileNotFoundError( - f"File ({input_file}) for specified argument `hashtag` ({args.hashtag}) does not exist.") - - # base = os.path.splitext(input_file)[0] - # path = f"./{base}_sorted_hashtags.csv" - occs = get_occurrences(input_file, args.n) - if args.plot: - plot(occs, img_folder) else: - print_occurrences(occs) + hashtags = load_hashtags_from_file(file=args.file) + else: + hashtags = args.hashtags -if __name__=="__main__": - main() \ No newline at end of file + downloader = 
TikTokDownloader(hashtags=hashtags, data_dir=args.output_dir) + + downloader.run( + download=args.download, plot=args.plot, table=args.table, number=args.number + ) + + +if __name__ == "__main__": + main() diff --git a/tiktok_hashtag_analysis/base.py b/tiktok_hashtag_analysis/base.py new file mode 100644 index 0000000..63224ef --- /dev/null +++ b/tiktok_hashtag_analysis/base.py @@ -0,0 +1,259 @@ +import os +import json +from pathlib import Path +from collections import Counter +from datetime import datetime +import warnings +import asyncio +import logging +import re +from typing import List, Dict + +import yt_dlp +import requests +import matplotlib.pyplot as plt +import matplotlib.ticker as mtick +import seaborn as sns + +from TikTokApi import TikTokApi + +warnings.filterwarnings("ignore", message="Glyph (.*) missing from current font") +sns.set_theme(style="darkgrid") + + +def process_hashtag_list(hashtags: List[str]) -> List[str]: + """Convert a list of hashtags to a standard form (remove whitespace, make + lowercase, etc.).""" + return list( + filter(None, (hashtag.strip().strip("#").lower() for hashtag in hashtags)) + ) + + +def load_hashtags_from_file(file: str) -> List[str]: + """Read and process hashtags specified in a text file.""" + if not os.path.isfile(file): + raise OSError(f"{file} does not exist") + with open(file, "r", encoding="utf-8") as f: + hashtags = re.split(r"\n|,", f.read()) + return process_hashtag_list(hashtags=hashtags) + + +async def _fetch_hashtag_data(hashtag: str) -> List[Dict]: + """Fetch data for videos containing a specified hashtag, asynchronously.""" + data = [] + async with TikTokApi() as api: + await api.create_sessions( + ms_tokens=[os.environ["MS_TOKEN"]], num_sessions=1, sleep_after=3 + ) + async for video in api.hashtag(name=hashtag).videos(count=1000): + data.append(video.as_dict) + return data + + +def json_load(file_path: Path) -> List: + """Read a JSON file and return the read data.""" + with open(file_path, "r", 
encoding="utf-8") as f: + data = json.load(fp=f) + return data + + +def json_dump(file_path: Path, data: List): + """Write data to a JSON file.""" + with open(file_path, "w", encoding="utf-8") as f: + json.dump(obj=data, fp=f) + + +def download_gallery(video_data: Dict, video_dir: Path): + """yt-dlp doesn't seem to support downloading images from an image gallery, + so this is a quick fix that likely will fail on edge cases.""" + + video_id = video_data["id"] + if play_url := video_data["music"]["playUrl"]: + r = requests.get(play_url) + with open(video_dir / f"{video_id}.mp3", "wb") as f: + f.write(r.content) + + for i, image in enumerate(video_data["imagePost"]["images"]): + image_url = image["imageURL"]["urlList"][0] + r = requests.get(image_url) + ext = r.headers["Content-Type"].split("/")[-1] + with open(video_dir / f"{video_id}_{i:02d}.{ext}", "wb") as f: + f.write(r.content) + + +def aggregate_cooccurring_hashtags(hashtag_file: Path) -> Counter: + """Aggregate how frequently hashtags are used, from a file containing a + list of raw TikTok post API responses.""" + videos = json_load(file_path=hashtag_file) + + all_hashtags: List[set] = [] + for video in videos: + video_hashtags = set( + hashtag["hashtagName"] + for hashtag in video.get("textExtra", []) + if hashtag.get("hashtagName") + ) + all_hashtags.extend(video_hashtags) + + return Counter(all_hashtags) + + +class TikTokDownloader: + """Main class for scraping data from TikTok.""" + + def __init__(self, hashtags: List[str], data_dir: str): + self.hashtags = process_hashtag_list(hashtags) + logging.info(f"Hashtags to scrape: {hashtags}") + + self.data_dir = Path(data_dir) + os.makedirs(self.data_dir, exist_ok=True) + + def get_hashtag_posts(self, hashtag: str): + """Fetch data about posts that used a specified hashtag and merge with + existing data, if it exists.""" + + # Define file to store hashtags in and create parent directory + hashtag_file = self.data_dir / hashtag / "posts.json" + 
hashtag_file.parent.mkdir(exist_ok=True, parents=True) + + # If there are previously scraped posts, load them + if hashtag_file.is_file(): + already_fetched_data = json_load(file_path=hashtag_file) + already_fetched_ids = set(video["id"] for video in already_fetched_data) + else: + already_fetched_ids = set() + already_fetched_data = [] + + # Scrape posts that use the specified hashtag + fetched_data = asyncio.run(_fetch_hashtag_data(hashtag=hashtag)) + if len(fetched_data) == 0: + logging.warning(f"No posts were found for the hashtag: {hashtag}") + + # Determine which newly scraped posts haven't been scraped before + new_fetched_data = [ + video for video in fetched_data if video["id"] not in already_fetched_ids + ] + if len(new_fetched_data) == 0: + logging.warning(f"No new posts were found for the hashtag: {hashtag}") + + # Merge new and old data and write to file + all_fetched_data = already_fetched_data + new_fetched_data + json_dump(file_path=hashtag_file, data=all_fetched_data) + logging.info( + f"Scraped {len(new_fetched_data)} new posts containing the hashtag " + f"'{hashtag}', with {len(already_fetched_data)} posts previously scraped" + ) + + def get_hashtag_videos(self, hashtag: str): + """Download videos and other media corresponding to posts that used a + specified hashtag,""" + + # Define file containing post data and directory to save videos to + hashtag_file = self.data_dir / hashtag / "posts.json" + video_dir = self.data_dir / hashtag / "videos" + video_dir.mkdir(exist_ok=True) + + # Get list of post IDs that have previously had their media downloaded + already_downloaded_ids = set( + file.split(".")[0].split("_")[0] for file in os.listdir(video_dir) + ) + # Get list of posts that have been scraped but not had their media downloaded + video_list = json_load(file_path=hashtag_file) + new_video_list = [ + video for video in video_list if video["id"] not in already_downloaded_ids + ] + if len(new_video_list) == 0: + logging.warning( + f"No new videos 
to be downloaded for the hashtag: {hashtag}" + ) + + # Populate list of URLs to download using yt-dlp, and list of image + # galleries to download using the `download_gallery` function + urls_to_download = [] + galleries_to_download = [] + for video in new_video_list: + if video.get("imagePost") is None: + url = f"https://www.tiktok.com/@{video['author']['uniqueId']}/video/{video['id']}" + urls_to_download.append(url) + else: + galleries_to_download.append(video) + + # Download audio and image files for all image gallery posts + if len(galleries_to_download) > 0: + logging.info(f"Downloading image galleries for hashtag {hashtag}") + for video in galleries_to_download: + download_gallery(video_data=video, video_dir=video_dir) + + # Download video files for all video posts + if len(urls_to_download) > 0: + logging.info(f"Downloading videos for hashtag {hashtag}") + ydl_opts = {"outtmpl": os.path.join(video_dir, "%(id)s.%(ext)s")} + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + ydl.download(urls_to_download) + + def frequency_table(self, hashtag: str, number: int): + """Print `number`-most commonly co-occurring hashtags for a specified + source hashtag, in tabular form.""" + + # Load video data file and extract co-occurring hashtag frequency information + hashtag_file = self.data_dir / hashtag / "posts.json" + frequencies = aggregate_cooccurring_hashtags(hashtag_file=hashtag_file) + + # Print table that displays most commonly co-occurring hashtags + total_posts = max(frequencies.values()) + print(f"\nCo-occurring hashtags for #{hashtag} posts") + print(f"{'Rank':<8} {'Hashtag':<30} {'Occurrences':<15} {'Frequency':<15}") + for row, (hashtag, frequency) in enumerate(frequencies.most_common(number)): + ratio = frequency / total_posts + print(f"{row:<8} {hashtag:<30} {frequency:<15} {ratio:.4f}") + print(f"Total posts: {total_posts}\n\n") + + def plot(self, hashtag: str, number: int): + """Create plot of `number`-most commonly co-occurring hashtags for a + specified source 
hashtag.""" + + # Load video data file and extract co-occurring hashtag frequency information + hashtag_file = self.data_dir / hashtag / "posts.json" + frequencies = aggregate_cooccurring_hashtags(hashtag_file=hashtag_file) + + # Define labels and other fields used in plot + total_posts = max(frequencies.values()) + sorted_frequencices = frequencies.most_common(number) + labels = [label for label, _ in sorted_frequencices[1:]] + ratios = [freq / total_posts * 100 for _, freq in sorted_frequencices[1:]] + y_pos = list(reversed(range(len(sorted_frequencices) - 1))) + + # Visualize data in bar chart + fig, ax = plt.subplots(figsize=(5, 6.66)) + ax.barh(y_pos, ratios) + ax.set_yticks(y_pos) + ax.set_yticklabels(labels) + ax.grid(axis="y") + ax.set_xlabel("Percent of posts with co-occurring hashtag") + ax.set_ylim(min(y_pos) - 1, max(y_pos) + 1) + ax.set_title(f"Co-occurring hashtags for #{hashtag} posts") + ax.xaxis.set_major_formatter(mtick.PercentFormatter(decimals=0)) + + # Write image of plot to file + current_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + plot_file = self.data_dir / hashtag / "plots" / f"{hashtag}__{current_time}.png" + plot_file.parent.mkdir(exist_ok=True, parents=True) + plt.savefig(plot_file, bbox_inches="tight", facecolor="white", dpi=300) + logging.info(f"Plot saved to file: {plot_file}") + + def run(self, download: bool, plot: bool, table: bool, number: int): + """Execute the specified operations on all specified hashtags.""" + + # Scrape all specified hashtags and perform analyses, depending on if + # `--table` and `--plot` flags are used in the command + for hashtag in self.hashtags: + self.get_hashtag_posts(hashtag=hashtag) + if plot: + self.plot(hashtag=hashtag, number=number) + if table: + self.frequency_table(hashtag=hashtag, number=number) + + # Download media for all hashtags if `--download` flag is used in the command + for hashtag in self.hashtags: + if download: + self.get_hashtag_videos(hashtag=hashtag) diff --git 
a/tiktok_hashtag_analysis/data_methods.py b/tiktok_hashtag_analysis/data_methods.py deleted file mode 100644 index 24078af..0000000 --- a/tiktok_hashtag_analysis/data_methods.py +++ /dev/null @@ -1,161 +0,0 @@ -"""Utility functions that perform data processing related tasks. -""" - -from typing import NamedTuple, List, Tuple, Set, Optional, Dict, Any -import logging - -from . import file_methods - -logger = logging.getLogger() - - -class Diff(NamedTuple): - """Keep track of scraped post IDs and whether previously-scraped posts have been filtered.""" - - ids: Set[str] - filter_posts: bool - - -class Total(NamedTuple): - """Keep track of number of total and number of unique scraped posts.""" - - total: int - unique: int - - -def get_difference(tag: str, file_name: str, ids: List[str]) -> Optional[Diff]: - """Find TikTok post IDs that haven't previously been scraped. - - Filter out the new posts for the hashtag `tag` by comparing the list of - post IDs contained in `filename` to the list of newly downloaded IDs - contained in `ids`. - """ - filter_posts = False - current_id_data = file_methods.get_data(file_name) - if tag in current_id_data: - current_ids = current_id_data[tag] - set_current_ids = set(current_ids) - total_current_ids = len(set_current_ids) - set_ids = set(ids) - new_ids = set_ids.difference(set_current_ids) - if not new_ids: - return None - else: - total_new_ids = len(new_ids) - if total_new_ids == total_current_ids: - new_data = Diff(new_ids, filter_posts) - else: - new_data = Diff(new_ids, filter_posts) - return new_data - else: - filter_posts = True - new_data = Diff(set(ids), filter_posts) - return new_data - - -def extract_posts( - settings: Dict[Any, Any], file_name: str, tag: str -) -> Optional[Tuple[List[str], List[Dict]]]: - """Find TikTok posts that haven't previously been scraped. - - Compares the file downloaded by tiktok-scraper to the list of - previously-scraped posts (from the file ids/post_ids.json). 
- """ - ids = [] - posts = [] - - posts = file_methods.get_data(file_name) - for post in posts: - ids.append(post["id"]) - - if not ids: - logger.warn(f"No posts were found for the hashtag: {tag}") - return None - - status = file_methods.check_existence(settings["post_ids"], "file") - if not status: - new_data = (ids, posts) - return new_data - else: - new_ids = get_difference(tag, settings["post_ids"], ids) - if not new_ids: - logger.warn(f"No new posts were found for the hashtag: {tag}") - return None - elif new_ids.filter_posts: - new_posts = [post for post in posts if post["id"] in new_ids.ids] - return (list(new_ids.ids), new_posts) - else: - return (list(new_ids.ids), posts) - - -def extract_videos(settings: dict, tag: str, download_list: List[str]) -> List[str]: - """Find TikTok videos that haven't previously been scraped. - - Compares the file downloaded by tiktok-scraper to the list of - previously-scraped videos (from the file ids/video_ids.json). - """ - status = file_methods.check_existence(settings["video_ids"], "file") - if not status: - new_data = download_list - return new_data - else: - new_videos = get_difference(tag, settings["video_ids"], download_list) - if not new_videos: - logger.warn( - f"No new videos were found for the {tag} in the downloaded folder." - ) - return [] - else: - return list(new_videos.ids) - - -def update_posts( - file_path: str, file_type: str, new_data: List[Any], tag: str = None -) -> Optional[Tuple[str, int]]: - """Update the file containing scraped post IDs (`ids/post_ids.json`) with - the IDs of the recently scraped posts. 
- """ - status = file_methods.check_existence(file_path, file_type) - if not tag: - file_methods.post_writer(file_path, new_data, status) - return None - else: - scraped_data = file_methods.id_writer(file_path, new_data, tag, status) - return scraped_data - - -def update_videos( - settings: Dict[str, Any], new_data: List[str], tag: str -) -> Tuple[str, int]: - """Update the file containing video IDs (`ids/video_ids.json`) with the IDs - of the recently scraped videos. - """ - file_path = settings["video_ids"] - file_methods.check_file(file_path, "file") - number_scraped = file_methods.id_writer(file_path, new_data, tag, True) - file_methods.clean_video_files(settings, tag, new_data) - return number_scraped - - -def get_total_posts(file_path: str, tag: str) -> Total: - """Count number of total scraped posts and number of unique scraped posts.""" - status = file_methods.check_existence(file_path, "file") - if not status: - raise OSError(f"{file_path} not found!") - else: - data = file_methods.get_data(file_path) - total_posts = len(data[tag]) - unique = len(set(data[tag])) - t = Total(total_posts, unique) - return t - - -def print_total(file_path: str, tag: str, data_type: str): - """Print number of total and unique scraped posts, warn if any non-unique posts.""" - total = get_total_posts(file_path, tag) - if total.total == total.unique: - logger.info(f"Scraped {total.total} {data_type} containing the hashtag '{tag}'") - else: - logger.warn( - f"Out of total {data_type} for the hashtag {tag} {total.total}, only {total.unique} are unique. Something is going wrong..." - ) diff --git a/tiktok_hashtag_analysis/file_methods.py b/tiktok_hashtag_analysis/file_methods.py deleted file mode 100644 index 024eadc..0000000 --- a/tiktok_hashtag_analysis/file_methods.py +++ /dev/null @@ -1,216 +0,0 @@ -"""Utility functions that operate on files, such as writing to reading from a file. 
-""" - -import os -import json -import subprocess -from os import path -from datetime import datetime -import shutil -from typing import Tuple, List, Optional, Dict, Any - -import logging, logging.config - -logging.config.fileConfig(path.join(path.dirname(path.abspath(__file__)), 'logging.config')) -logger = logging.getLogger("Logger") - - -def create_file(name: str, file_type: str): - """Create a file or directory.""" - if file_type == "dir": - os.makedirs(name, mode=0o777) - elif file_type == "file": - with open(name, "w"): - pass - else: - raise ValueError(f"{file_type} has to be either 'dir' or 'file'") - - -def check_existence(file_path: str, file_type: str): - """Check if a file or a directory exists.""" - if file_type == "file": - return os.path.isfile(file_path) - elif file_type == "dir": - return os.path.isdir(file_path) - else: - raise ValueError(f"{file_type} has to be either 'dir' or 'file'") - - -def check_file(file_path: str, file_type: str): - """If path does not exist, creates a file or directory.""" - status = check_existence(file_path, file_type) - if not status: - create_file(file_path, file_type) - - -def download_posts(settings: Dict, tag: str, output_dir: Any): - """Run the tiktok-scraper command to download posts for a given hashtag. - - Returns the path to the downloaded file of posts. If no file was downloaded, - prints the error and returns nothing in order to move on. - - os.chdir is used to execute shell commands in the correct folder and then - reused to return to the original folder of execution of run_downloader script. 
- """ - path = os.path.join(settings["data"], tag, settings["posts"]) - os.makedirs(path, exist_ok=True) - tiktok_command = f"tiktok-scraper hashtag {tag} -t 'json' --filepath {output_dir}" - output = subprocess.check_output(tiktok_command, shell=True, encoding="utf-8") - new_file = output.split()[-1] - if "json" in new_file: - return new_file - else: - logger.warn( - f"Something's wrong with what is returned by tiktok-scraper for the hashtag {tag} - *{new_file}* is not a json file.\n\ntiktok-scraper returned {output}" - ) - - -def download_videos(settings: Dict, tag: str): - """Run the tiktok-scraper command to download videos for a given hashtag. - - Note that all the videos are downloaded that are returned by the TikTok API, - making this a time- and data-intensive process. - The list of downloaded video IDs is constucted and returned if the - downloaded folder contains at least 1 video. - - os.chdir is used to execute shell commands in the correct folder and then - reused to return to the original folder of execution of run_downloader script. 
- """ - path = os.path.join(settings["data"], tag, settings["videos"]) - os.makedirs(path, exist_ok=True) - tiktok_command = f"tiktok-scraper hashtag {tag} -d --filepath {path}" - result = subprocess.check_output(tiktok_command, shell=True) - downloaded_list_tmp = os.listdir(os.path.join(path, f"#{tag}")) - if downloaded_list_tmp: - downloaded_list = [] - for file in downloaded_list_tmp: - file = file.split(".")[0] - downloaded_list.append(file) - - return downloaded_list - else: - logger.warn(f"No video files were downloaded for the hashtag {tag}.") - shutil.rmtree(settings["videos_delete"]) - - -def get_data(file_path: str) -> Any: - """Read a JSON file and return the read data.""" - with open(file_path, "r", encoding="utf-8") as f: - data = json.load(f) - return data - - -def dump_data(file_path: str, data: Any): - """Write data to a JSON file.""" - with open(file_path, "w", encoding="utf-8") as f: - json.dump(data, f) - - -def log_writer(log_data: List[Tuple[str, Tuple[str, int]]]): - """Create the dictionary of total downloads (posts and videos) per hashtag. - - Example : { - timetamp : { - hashtag : { - videos : number_of_new_videos , - posts : number_of_new_posts - } - } - } - - Writes the dictionary to the log file (`logs/log.json`). 
- """ - - total = 0 - scraped_summary_dict = {} # type: Dict[str, Dict[str, int]] - for hashtag, (data_type, count) in log_data: - if hashtag in scraped_summary_dict: - if data_type in scraped_summary_dict[hashtag]: - scraped_summary_dict[hashtag][data_type] += count - else: - scraped_summary_dict[hashtag][data_type] = count - total += count - else: - scraped_summary_dict[hashtag] = {data_type: count} - total += count - - now_str = datetime.now().strftime("%d-%m-%Y %H:%M:%S") - data = {now_str: scraped_summary_dict} - - logger.debug(f"Logged post data: {data}") - logger.info(f"Successfully scraped {total} total entries") - - -def id_writer( - file_path: str, new_data: List[str], tag: str, status: bool -) -> Tuple[str, int]: - """Write the list of new ids to the post_ids or video_ids file.""" - - total = len(new_data) - if status: - try: - data = get_data(file_path) - if tag in data: - data[tag] += new_data - else: - data[tag] = new_data - dump_data(file_path, data) - except json.decoder.JSONDecodeError: - data = {tag: new_data} - dump_data(file_path, data) - else: - data = {tag: new_data} - dump_data(file_path, data) - logger.debug(f"SUCCESS - {total} entries added to {file_path}") - number_scraped = (tag, total) - return number_scraped - - -def post_writer(file_path: str, new_data: List[Dict], status: bool): - """Write the new posts in the post file of the given hashtag - (`/data/{hashtag}/posts/data.json`). 
- """ - total = len(new_data) - if status: - try: - data = get_data(file_path) - data += new_data - dump_data(file_path, data) - except json.decoder.JSONDecodeError: - data = new_data - dump_data(file_path, data) - else: - data = new_data - dump_data(file_path, data) - logger.debug(f"SUCCESS - {total} entries added to {file_path}") - - -def delete_file(file_path: str, file_type: str): - """Delete a directory or file.""" - if not check_existence(file_path, file_type): - raise OSError(f"Attempt to delete file failed: {file_path} does not exist") - elif file_type == "file": - os.remove(file_path) - logger.debug(f"Successfully deleted {file_path}") - elif file_type == "dir": - os.rmdir(file_path) - logger.debug(f"Successfully deleted {file_path}") - else: - raise OSError("{file_type} needs to be either 'file' or 'dir'") - - -def clean_video_files(settings: dict, tag: str, new_data: Optional[List[str]] = None): - """Move the new videos from the tiktok-scraper video folder to `/data/{hashtag}/videos/`. - Deletes the residual tiktok-scraper video folder. - """ - if new_data: - for file in new_data: - settings["videos_from"] = ( - settings["data"] + f"/{tag}/videos/#{tag}/{file}.mp4" - ) - shutil.move(settings["videos_from"], settings["videos_to"]) - - shutil.rmtree(settings["videos_delete"]) - logger.debug( - f"Successfully deleted the folder {settings['videos_delete']} folder of videos." - ) diff --git a/tiktok_hashtag_analysis/global_data.py b/tiktok_hashtag_analysis/global_data.py deleted file mode 100644 index ed8c317..0000000 --- a/tiktok_hashtag_analysis/global_data.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Specify global constants including file paths and scraping options. 
-""" - - -# Directories -DATA = "../data" -IDS = "ids" -POSTS = "posts" -VIDEOS = "videos" -IMAGES = f"{DATA}/img" - -# Files -POST_IDS = "post_ids.json" -VIDEO_IDS = "video_ids.json" -DATA_FILE = "data.json" - -FILES = { - "data": DATA, - "ids": IDS, - "posts": POSTS, - "videos": VIDEOS, - "images": IMAGES, - "post_ids": f"{DATA}/{IDS}/{POST_IDS}", - "video_ids": f"{DATA}/{IDS}/{VIDEO_IDS}", - "data_file": f"{DATA_FILE}", - "downloads": [], -} - -PARAMETERS = { - "scraper_attempts": 3, - "sleep": 8, -} diff --git a/tiktok_hashtag_analysis/hashtag_frequencies.py b/tiktok_hashtag_analysis/hashtag_frequencies.py deleted file mode 100644 index 204e6ee..0000000 --- a/tiktok_hashtag_analysis/hashtag_frequencies.py +++ /dev/null @@ -1,99 +0,0 @@ -"""Analyze the frequency of hashtags appearing in the set of given posts. - -- The "hashtag" positional argument specifies the hashtag of scraped posts to analyze -- The "n" positional argument specifies how many hashtags does the user wants to analyze -- Specifying the "-d" flag prints the hashtag frequencies on the shell -- Specifying the "-p" flag plots the hashtag frequencies and saves as a png file -""" -import json -from datetime import datetime -import warnings -import logging -from typing import List, Tuple, Dict, Any -import matplotlib.pyplot as plt -import matplotlib.ticker as mtick -import seaborn as sns - -warnings.filterwarnings("ignore", message="Glyph (.*) missing from current font") -sns.set_theme(style="darkgrid") - - -def get_hashtags(obj: Dict) -> List[Tuple[str, int]]: - if not obj: - raise ValueError(f"Empty item, no hashtags could be extracted.") - else: - hashtags = {} - tags = [set([tag["name"] for tag in ele["hashtags"]]) for ele in obj] - { - tag: ( - 1 - if tag not in hashtags and not hashtags.update({tag: 1}) - else hashtags[tag] + 1 and not hashtags.update({tag: hashtags[tag] + 1}) - ) - for ele in tags - for tag in ele - } - - return sorted(hashtags.items(), key=lambda e: e[1], reverse=True) - - 
-def get_occurrences(filename: str, n: int = 1) -> Dict[str, Any]: - """Aggregate hashtag frequency information for a specified JSON file. - - Example: { - "total": total posts in the file, - top_n: [[top n hashtags ], [frequencies of corresponding hashtags]] - } - """ - with open(filename) as f: - obj = json.load(f) - l = len(obj) - tags = get_hashtags(obj) - occs = {"total": l, "top_n": []} - occs["top_n"] = [[ele[i] for ele in tags[0 : min(l, n)]] for i in range(2)] - return occs - - -def plot(occs: dict, img_folder: str): - """Save plot of common hashtags as bar chart to file.""" - y_pos = list(reversed(range(len(occs["top_n"][0]) - 1))) - max_count = occs["top_n"][1][0] - freqs = [count / max_count * 100 for count in occs["top_n"][1][1:]] - labels = occs["top_n"][0][1:] - hashtag = occs["top_n"][0][0] - - fig, ax = plt.subplots(figsize=(5, 6.66)) - ax.barh(y_pos, freqs) - ax.set_yticks(y_pos) - ax.set_yticklabels(labels) - ax.grid(axis="y") - ax.set_xlabel("Percent of posts with common hashtag") - ax.set_ylim(min(y_pos) - 1, max(y_pos) + 1) - ax.set_title(f"Common hashtags for #{hashtag} posts") - ax.xaxis.set_major_formatter(mtick.PercentFormatter(decimals=0)) - save_plot(img_folder, hashtag) - - -def save_plot(img_folder, hashtag): - """Save the plot as a png file in the folder ../data/imgs/""" - now = datetime.now() - current_time = now.strftime("%Y_%m_%d_%H_%M_%S") - filename = f"{img_folder}/{hashtag}_{current_time}.png" - logging.info(f"Plot saved to file: {filename}") - plt.savefig(filename, bbox_inches="tight", facecolor="white", dpi=300) - - -def print_occurrences(occs): - """Print information about the top n hashtags and their frequencies.""" - row_number = 0 - total_posts = occs["total"] - print( - "{:<8} {:<30} {:<15} {:<15}".format( - "Rank", "Hashtag", "Occurrences", "Frequency" - ) - ) - for key, value in zip(occs["top_n"][0], occs["top_n"][1]): - ratio = value / total_posts - print("{:<8} {:<30} {:<15} {:.4f}".format(row_number, key, value, 
ratio)) - row_number += 1 - print(f"Total posts: {total_posts}") diff --git a/tiktok_hashtag_analysis/hashtag_list.txt b/tiktok_hashtag_analysis/hashtag_list.txt deleted file mode 100644 index d2303f9..0000000 --- a/tiktok_hashtag_analysis/hashtag_list.txt +++ /dev/null @@ -1,5 +0,0 @@ -# Enter a hashtag per line. Each line should contain only one word. -london -paris -tokyo -newyork diff --git a/tiktok_hashtag_analysis/logging.config b/tiktok_hashtag_analysis/logging.config deleted file mode 100644 index faac2d3..0000000 --- a/tiktok_hashtag_analysis/logging.config +++ /dev/null @@ -1,36 +0,0 @@ -[loggers] -keys=root,Logger - -[handlers] -keys=consoleHandler,fileHandler - -[formatters] -keys=consoleFormatter,fileFormatter - -[logger_root] -level=DEBUG -handlers=consoleHandler - -[logger_Logger] -level=DEBUG -handlers=consoleHandler,fileHandler -qualname=Logger -propagate=0 - -[handler_consoleHandler] -class=StreamHandler -level=INFO -formatter=consoleFormatter -args=(sys.stdout,) - -[handler_fileHandler] -class=FileHandler -level=DEBUG -formatter=fileFormatter -args=("../logfile.log",) - -[formatter_consoleFormatter] -format=%(message)s - -[formatter_fileFormatter] -format=%(asctime)s - %(name)s - %(levelname)s - %(message)s diff --git a/tiktok_hashtag_analysis/run_downloader.py b/tiktok_hashtag_analysis/run_downloader.py deleted file mode 100644 index a74825b..0000000 --- a/tiktok_hashtag_analysis/run_downloader.py +++ /dev/null @@ -1,150 +0,0 @@ -"""Download post data or videos from TikToks containing one or more specified hashtags. 
- -- The "-p" flag specifies that only data from posts is downloaded, no video files -- The "-v" flag specifies that only video files are downloaded, no post data -- Specifying both "-p" and "-v" flags downloads both post data and video files -- The "-t" flag allows the user to specify a list of space-separated hashtags as an argument -- The "-f" flag allows the user to specify the filename of a text file containing a list of newline-separated hashtags as an argument -""" - -import os -import time -from typing import List, Tuple, Dict, Any, Optional -from tempfile import TemporaryDirectory -from tiktok_hashtag_analysis import global_data -import tiktok_hashtag_analysis.file_methods as file_methods -from tiktok_hashtag_analysis import data_methods - - -def get_hashtag_list(file_name: str) -> List[str]: - """Extract list of newline-separated hashtags from text file.""" - if not file_methods.check_existence(file_name, "file"): - raise OSError(f"{file_name} does not exist") - with open(file_name) as f: - tags = list( - filter(None, [line.strip() for line in f if not line.startswith("#")]) - ) - return tags - - -def set_download_settings(download_data_type: Dict[str, bool]) -> Dict[str, Any]: - """Load the constants from global_data module into the `settings` dict.""" - settings = { - "data": global_data.FILES["data"], - "ids": global_data.FILES["ids"], - "sleep": global_data.PARAMETERS["sleep"], - "scraper": global_data.PARAMETERS["scraper_attempts"], - } - file_methods.check_file(f"{settings['data']}/{settings['ids']}", "dir") - if download_data_type["posts"]: - settings["posts"] = global_data.FILES["posts"] - settings["post_ids"] = global_data.FILES["post_ids"] - settings["data_file"] = global_data.FILES["data_file"] - - if download_data_type["videos"]: - settings["videos"] = global_data.FILES["videos"] - settings["video_ids"] = global_data.FILES["video_ids"] - - return settings - - -def get_posts(settings: dict, tag: str) -> Optional[Tuple[str, int]]: - """Scrape 
trending TikTok post data for the specified hashtag. - - 1. Calls `file_methods.download_posts` to scrape the post data for a given hashtag - 2. Calls `data_methods.extract_posts` to determine which if any posts - haven't previously been downloaded. - 3. Calls `data_methods.update_posts` to update the ID list with the IDs of - newly downloaded posts. - """ - with TemporaryDirectory() as temp_dir: - file_path = file_methods.download_posts(settings, tag, temp_dir) - number_scraped = None - if file_path: - new_data = data_methods.extract_posts(settings, file_path, tag) - if new_data: - data_file = os.path.join( - settings["data"], tag, settings["posts"], settings["data_file"] - ) - data_methods.update_posts(data_file, "file", new_data[1]) - number_scraped = data_methods.update_posts( - settings["post_ids"], "file", new_data[0], tag - ) - - return number_scraped - - -def get_videos(settings: dict, tag: str) -> Optional[Tuple[str, int]]: - """Scrape trending TikTok video files for the specified hashtag. - - 1. Calls `file_methods.download_videos` to download the video files for a given hashtag - 2. Calls `data_methods.extract_videos` to determine which if any videos - haven't previouly been downloaded. - 3. Calls `data_methods.update_videos` to update the ID list with the IDs of - newly downloaded videos. - 4. Calls `clean_video_files` function to delete the residual video folder - after the data processing. 
- """ - number_scraped = None - download_list = file_methods.download_videos(settings, tag) - if download_list: - new_data = data_methods.extract_videos(settings, tag, download_list) - if new_data: - number_scraped = data_methods.update_videos(settings, new_data, tag) - else: - file_methods.clean_video_files(settings, tag) - - return number_scraped - - -def get_data( - hashtags: list, download_data_type: Dict[str, bool] -) -> List[Tuple[str, Tuple[str, int]]]: - """Check command-line arguments and scrape posts/videos for specified hashtags.""" - counter = 0 - total_hashtags = len(hashtags) - total_hashtags_offset = total_hashtags - 1 - scraped_summary_list = [] - - if download_data_type["posts"]: - settings = set_download_settings(download_data_type) - while counter < total_hashtags: - tag = hashtags[counter] - file_methods.check_file( - os.path.join(settings["data"], tag, settings["posts"]), "dir" - ) - file_methods.check_file( - os.path.join( - settings["data"], tag, settings["posts"], settings["data_file"] - ), - "file", - ) - res = get_posts(settings, tag) - if res: - number_scraped = (res[0], ("posts", res[1])) - scraped_summary_list.append(number_scraped) - data_methods.print_total(settings["post_ids"], tag, "posts") - - counter += 1 - if counter < total_hashtags_offset: - time.sleep(settings["sleep"]) - - if download_data_type["videos"]: - settings = set_download_settings(download_data_type) - while counter < total_hashtags: - tag = hashtags[counter] - file_methods.check_file( - os.path.join(settings["data"], tag, settings["videos"]), "dir" - ) - settings["videos_delete"] = settings["data"] + f"/{tag}/videos/#{tag}" - settings["videos_to"] = settings["data"] + f"/{tag}/videos" - _res = get_videos(settings, tag) - if _res: - scraped_summary_list.append((_res[0], ("videos", _res[1]))) - data_methods.print_total(settings["video_ids"], tag, "videos") - - counter += 1 - if counter < total_hashtags_offset: - time.sleep(settings["sleep"]) - - return 
scraped_summary_list diff --git a/tiktok_hashtag_analysis/version.py b/tiktok_hashtag_analysis/version.py deleted file mode 100644 index 0899cd6..0000000 --- a/tiktok_hashtag_analysis/version.py +++ /dev/null @@ -1,12 +0,0 @@ - -_MAJOR = "1" -_MINOR = "0" -# On main and in a nightly release the patch should be one ahead of the last -# released build. -_PATCH = "4" -# This is mainly for nightly builds which have the suffix ".dev$DATE". See -# https://semver.org/#is-v123-a-semantic-version for the semantics. -_SUFFIX = "" - -VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR) -__version__ = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX) \ No newline at end of file From cf575e6cf60420587913a1cd64897dd76405f347 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Fri, 1 Sep 2023 18:33:32 -0500 Subject: [PATCH 2/6] updated README and added authorization --- .gitignore | 2 + README.md | 71 ++++++++++++++--------------- tiktok_hashtag_analysis/__main__.py | 1 - tiktok_hashtag_analysis/auth.py | 67 +++++++++++++++++++++++++++ tiktok_hashtag_analysis/base.py | 6 +-- 5 files changed, 107 insertions(+), 40 deletions(-) create mode 100644 tiktok_hashtag_analysis/auth.py diff --git a/.gitignore b/.gitignore index eca42b1..525e540 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ # Data directory data/ +build/ +*.egg-info/ # Miscellaneous files **/.DS_Store diff --git a/README.md b/README.md index b0e3f25..75e5e26 100644 --- a/README.md +++ b/README.md @@ -59,40 +59,38 @@ The `data` folder contains all the downloaded data as shown in the tree diagram ## How to use ### Post downloading -Running the `tiktok-hashtag-analysis` command with the following options will scrape posts containing the hashtags `#london`, `#paris`, or `#newyork`: +Running the `tiktok-hashtag-analysis` command with the following options will scrape posts that contain the hashtags `#london`, `#paris`, or `#newyork`: tiktok-hashtag-analysis london paris newyork and will produce an output similar to 
the following log: - $ tiktok-hashtag-analysis download -t london paris newyork -p + $ tiktok-hashtag-analysis download london paris newyork Hashtags to scrape: ['london', 'paris', 'newyork'] Scraped 963 posts containing the hashtag 'london' Scraped 961 posts containing the hashtag 'paris' Scraped 940 posts containing the hashtag 'newyork' Successfully scraped 2864 total entries -- The `-t` flag allows a space-separated list of hashtags to be specified as a command line argument -- The `-p` flag specifies that posts, not videos, will be downloaded +- The list of hashtags to scrape is specified as a positional argument ### Video downloading -Running the `tiktok-hashtag-analysis download` script with the following options will scrape trending videos containing the hashtag `#london`: -`tiktok-hashtag-analysis download -t london -v` +Running the `tiktok-hashtag-analysis` script with the following options will scrape trending videos containing the hashtag `#london`: +`tiktok-hashtag-analysis download london --download` -- The `-t` flag allows a space-separated list of hashtags to be specified as a command line argument -- The `-v` flag specifies that videos, not posts, will be downloaded +- The `--download` flag specifies that video files for scraped posts should be downloaded -Note that video downloading is a time and data rate consuming task, as a result we recommend using one hashtag at a time when using the `-v` flag to avoid complications. +Note that video downloading is a time and data rate consuming task, as a result we recommend using one hashtag at a time when using the `--download` flag to avoid complications. ## Analyzing results -### Top n hashtag occurrences -The script `tiktok-hashtag-analysis frequencies` analyzes the frequencies of top occurring hashtags in a given set of posts. 
+### Most common co-occurring hashtags +In addition to scraping data and downloading videos, the `tiktok-hashtag-analysis` script can also analyze the frequencies of the most common co-occurring hashtags in a given set of posts. -Assume we want to analyze the 20 most frequently occurring hashtags in the downloaded posts of the `#london` hashtag. +Assume we want to analyze the 20 most frequently co-occurring hashtags in the downloaded posts of the `#london` hashtag. - The results can be plotted and saved as a PNG file by executing the following command: - `tiktok-hashtag-analysis frequencies --hashtag london --number 20 --plot` + `tiktok-hashtag-analysis london --number 20 --plot` which will produce a figure similar to that shown below:

@@ -103,32 +101,33 @@ Assume we want to analyze the 20 most frequently occurring hashtags in the downl - The results can be displayed in tabular form by executing the following command: - `tiktok-hashtag-analysis frequencies --hashtag london --number 20 --print` + `tiktok-hashtag-analysis london --number 20 --table` which will produce a terminal output similar to the following: ``` - Rank Hashtag Occurrences Frequency - 0 london 960 1.0000 - 1 fyp 494 0.5146 - 2 uk 238 0.2479 - 3 foryou 221 0.2302 - 4 foryoupage 184 0.1917 - 5 viral 179 0.1865 - 6 fypシ 84 0.0875 - 7 funny 56 0.0583 - 8 xyzbca 51 0.0531 - 9 british 45 0.0469 - 10 england 44 0.0458 - 11 trending 40 0.0417 - 12 fy 33 0.0344 - 13 comedy 32 0.0333 - 14 roadman 28 0.0292 - 15 4u 27 0.0281 - 16 usa 26 0.0271 - 17 tiktok 26 0.0271 - 18 travel 21 0.0219 - 19 america 20 0.0208 - Total posts: 960 + Co-occurring hashtags for #london posts + Rank Hashtag Occurrences Frequency + 0 london 881 1.0000 + 1 fyp 399 0.4529 + 2 uk 174 0.1975 + 3 foryou 168 0.1907 + 4 viral 152 0.1725 + 5 foryoupage 137 0.1555 + 6 fypシ 73 0.0829 + 7 funny 54 0.0613 + 8 tiktok 43 0.0488 + 9 trending 43 0.0488 + 10 british 41 0.0465 + 11 england 38 0.0431 + 12 xyzbca 34 0.0386 + 13 fy 33 0.0375 + 14 usa 33 0.0375 + 15 love 29 0.0329 + 16 comedy 25 0.0284 + 17 royalfamily 23 0.0261 + 18 queen 23 0.0261 + 19 queenelizabeth 22 0.0250 + Total posts: 881 ``` The `Frequency` column shows the ratio of the occurrence to the total number of downloaded posts. 
diff --git a/tiktok_hashtag_analysis/__main__.py b/tiktok_hashtag_analysis/__main__.py index 8e7dce2..8a9e5ee 100644 --- a/tiktok_hashtag_analysis/__main__.py +++ b/tiktok_hashtag_analysis/__main__.py @@ -1,7 +1,6 @@ import logging import argparse from pathlib import Path -import sys from .base import TikTokDownloader, load_hashtags_from_file diff --git a/tiktok_hashtag_analysis/auth.py b/tiktok_hashtag_analysis/auth.py new file mode 100644 index 0000000..17b8f3c --- /dev/null +++ b/tiktok_hashtag_analysis/auth.py @@ -0,0 +1,67 @@ +import os +import configparser +from pathlib import Path +import logging + + +class Authorization: + """Handle authorization for TikTok, using the `msToken`.""" + + def __init__(self): + self.config_file = Path.home() / ".tiktok" + self.section = "TikTok" + self.ms_token = None + + def get_token(self): + """Load the "msToken" cookie taken from TikTok, which the scraper requires.""" + + # Step 1: check if MS_TOKEN is defined as environment variable + if ms_token := os.environ.get("MS_TOKEN"): + self.ms_token = ms_token + logging.info("Loaded token from environment variable") + + # Step 2: check if MS_TOKEN is defined in config file + elif self.config_file.is_file(): + if ms_token := self.load_token(): + self.ms_token = ms_token + logging.info(f"Loaded token from config file: {self.config_file}") + + # Step 3: have user enter MS_TOKEN via terminal + else: + ms_token = self.input_token() + self.dump_token(ms_token=ms_token) + self.ms_token = ms_token + logging.info( + f"Loaded token from user input and saved to config file: {self.config_file}" + ) + + return self.ms_token + + def load_token(self): + """Parse a config file and extract the token.""" + + config = configparser.ConfigParser() + config.read(self.config_file) + return config.get(section=self.section, option="MS_TOKEN", fallback=None) + + def dump_token(self, ms_token): + """Write the token to a config file.""" + + config = configparser.ConfigParser() + 
config.read(self.config_file) + config.add_section(self.section) + config.set(section=self.section, option="MS_TOKEN", value=ms_token) + + with open(self.config_file, "w") as f: + config.write(f) + + def input_token(self): + """Allow user to manually enter the token in the terminal.""" + + print( + "\nPlease copy and paste your `msToken` cookie taken from your web browser when visiting the TikTok website. See [THIS VIDEO] for more information.\n" + ) + + ms_token = input("msToken: ") + + return ms_token diff --git a/tiktok_hashtag_analysis/base.py b/tiktok_hashtag_analysis/base.py index 63224ef..74df81b 100644 --- a/tiktok_hashtag_analysis/base.py +++ b/tiktok_hashtag_analysis/base.py @@ -167,7 +167,7 @@ class TikTokDownloader: f"No new videos to be downloaded for the hashtag: {hashtag}" ) - # Populate list of URLs to download using yt-dlp, and list of image + # Populate list of URLs to download using yt-dlp, and list of image # galleries to download using the `download_gallery` function urls_to_download = [] galleries_to_download = [] @@ -233,7 +233,7 @@ class TikTokDownloader: ax.set_ylim(min(y_pos) - 1, max(y_pos) + 1) ax.set_title(f"Co-occurring hashtags for #{hashtag} posts") ax.xaxis.set_major_formatter(mtick.PercentFormatter(decimals=0)) - + # Write image of plot to file current_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") plot_file = self.data_dir / hashtag / "plots" / f"{hashtag}__{current_time}.png" @@ -244,7 +244,7 @@ class TikTokDownloader: def run(self, download: bool, plot: bool, table: bool, number: int): """Execute the specified operations on all specified hashtags.""" - # Scrape all specified hashtags and perform analyses, depending on if + # Scrape all specified hashtags and perform analyses, depending on if # `--table` and `--plot` flags are used in the command for hashtag in self.hashtags: self.get_hashtag_posts(hashtag=hashtag) From 0f8e865bf3c958db567f80a056ee207f81094a15 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Mon, 4 Sep 2023 
10:40:30 -0500 Subject: [PATCH 3/6] added type hints for auth, incorporated auth into base module --- tiktok_hashtag_analysis/__main__.py | 14 +++++++++++++- tiktok_hashtag_analysis/auth.py | 23 ++++++++++++++--------- tiktok_hashtag_analysis/base.py | 17 +++++++++++------ 3 files changed, 38 insertions(+), 16 deletions(-) diff --git a/tiktok_hashtag_analysis/__main__.py b/tiktok_hashtag_analysis/__main__.py index 8a9e5ee..3c3bbfd 100644 --- a/tiktok_hashtag_analysis/__main__.py +++ b/tiktok_hashtag_analysis/__main__.py @@ -6,6 +6,8 @@ from .base import TikTokDownloader, load_hashtags_from_file def create_parser(): + """Create parser to parse input command-line arguments.""" + parser = argparse.ArgumentParser( description="Analyze hashtags within posts scraped from TikTok." ) @@ -51,12 +53,20 @@ def create_parser(): help="Directory to save scraped data and visualizations to", default=Path(".").resolve().parent / "data", ) + parser.add_argument( + "--config", + type=str, + help="File name of configuration file to store TikTok credentials to", + default=None, + ) parser.add_argument("--log", type=str, help="File to write logs to", default=None) return parser def main(): + """Parse and process command-line arguments, scrape specified hashtags, and perform specified analyses.""" + parser = create_parser() args = parser.parse_args() @@ -79,7 +89,9 @@ def main(): else: hashtags = args.hashtags - downloader = TikTokDownloader(hashtags=hashtags, data_dir=args.output_dir) + downloader = TikTokDownloader( + hashtags=hashtags, data_dir=args.output_dir, config_file=args.config + ) downloader.run( download=args.download, plot=args.plot, table=args.table, number=args.number diff --git a/tiktok_hashtag_analysis/auth.py b/tiktok_hashtag_analysis/auth.py index 17b8f3c..5d5ac16 100644 --- a/tiktok_hashtag_analysis/auth.py +++ b/tiktok_hashtag_analysis/auth.py @@ -2,17 +2,22 @@ import os import configparser from pathlib import Path import logging +from typing import Optional class
Authorization: """Handle authorization for TikTok, using the `msToken`.""" - def __init__(self): - self.config_file = Path.home() / ".tiktok" - self.section = "TikTok" - self.ms_token = None + def __init__(self, config_file: Optional[str] = None): + if config_file: + self.config_file = Path(config_file) + else: + self.config_file = Path.home() / ".tiktok" - def get_token(self): + self.section = "TikTok" + self.get_token() + + def get_token(self) -> str: """Load the "msToken" cookie taken from TikTok, which the scraper requires.""" # Step 1: check if MS_TOKEN is defined as environment variable @@ -37,14 +42,14 @@ class Authorization: return self.ms_token - def load_token(self): + def load_token(self) -> Optional[str]: """Parse a config file and extract the token.""" config = configparser.ConfigParser() config.read(self.config_file) return config.get(section=self.section, option="MS_TOKEN", fallback=None) - def dump_token(self, ms_token): + def dump_token(self, ms_token: str): """Write the token to a config file.""" config = configparser.ConfigParser() @@ -52,10 +57,10 @@ class Authorization: config.add_section(self.section) config.set(section=self.section, option="MS_TOKEN", value=ms_token) - with open(self.config_file, "w") as f: + with open(self.config_file, "w", encoding="utf-8") as f: config.write(f) - def input_token(self): + def input_token(self) -> str: """Allow user to manually enter the token in the terminal.""" print( diff --git a/tiktok_hashtag_analysis/base.py b/tiktok_hashtag_analysis/base.py index 74df81b..77623a3 100644 --- a/tiktok_hashtag_analysis/base.py +++ b/tiktok_hashtag_analysis/base.py @@ -17,6 +17,8 @@ import seaborn as sns from TikTokApi import TikTokApi +from .auth import Authorization + warnings.filterwarnings("ignore", message="Glyph (.*) missing from current font") sns.set_theme(style="darkgrid") @@ -38,13 +40,11 @@ def load_hashtags_from_file(file: str) -> List[str]: return process_hashtag_list(hashtags=hashtags) -async def 
_fetch_hashtag_data(hashtag: str) -> List[Dict]: +async def _fetch_hashtag_data(hashtag: str, ms_token: str) -> List[Dict]: """Fetch data for videos containing a specified hashtag, asynchronously.""" data = [] async with TikTokApi() as api: - await api.create_sessions( - ms_tokens=[os.environ["MS_TOKEN"]], num_sessions=1, sleep_after=3 - ) + await api.create_sessions(ms_tokens=[ms_token], num_sessions=1, sleep_after=3) async for video in api.hashtag(name=hashtag).videos(count=1000): data.append(video.as_dict) return data @@ -101,13 +101,16 @@ def aggregate_cooccurring_hashtags(hashtag_file: Path) -> Counter: class TikTokDownloader: """Main class for scraping data from TikTok.""" - def __init__(self, hashtags: List[str], data_dir: str): + def __init__(self, hashtags: List[str], data_dir: str, config_file: str = None): self.hashtags = process_hashtag_list(hashtags) logging.info(f"Hashtags to scrape: {hashtags}") self.data_dir = Path(data_dir) os.makedirs(self.data_dir, exist_ok=True) + self.auth = Authorization(config_file=config_file) + self.ms_token = self.auth.ms_token + def get_hashtag_posts(self, hashtag: str): """Fetch data about posts that used a specified hashtag and merge with existing data, if it exists.""" @@ -125,7 +128,9 @@ class TikTokDownloader: already_fetched_data = [] # Scrape posts that use the specified hashtag - fetched_data = asyncio.run(_fetch_hashtag_data(hashtag=hashtag)) + fetched_data = asyncio.run( + _fetch_hashtag_data(hashtag=hashtag, ms_token=self.ms_token) + ) if len(fetched_data) == 0: logging.warning(f"No posts were found for the hashtag: {hashtag}") From 5ae962496826bed58b80336e91dda3c19d2db12d Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Mon, 4 Sep 2023 13:26:38 -0500 Subject: [PATCH 4/6] added tests, changed __main__ to cli --- pytest.ini | 15 +++++++++ setup.py | 3 +- tests/__init__.py | 0 tests/auth.py | 24 ++++++++++++++ tests/base.py | 15 +++++++++ tests/cli.py | 31 +++++++++++++++++++ tests/conftest.py | 11 +++++++ 
tiktok_hashtag_analysis/__init__.py | 2 ++ tiktok_hashtag_analysis/auth.py | 1 - tiktok_hashtag_analysis/base.py | 2 +- .../{__main__.py => cli.py} | 0 11 files changed, 101 insertions(+), 3 deletions(-) create mode 100644 pytest.ini create mode 100644 tests/__init__.py create mode 100644 tests/auth.py create mode 100644 tests/base.py create mode 100644 tests/cli.py create mode 100644 tests/conftest.py rename tiktok_hashtag_analysis/{__main__.py => cli.py} (100%) diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..4004079 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,15 @@ +[pytest] +minversion = + 7.0.0 +testpaths = + tests/ +python_files = + *.py +addopts = + -vvv + --cov='tiktok_hashtag_analysis' + --cov-report html:reports/coverage + --html='reports/tests.html' + --self-contained-html +filterwarnings = + ignore:Glyph (.*) missing from current font \ No newline at end of file diff --git a/setup.py b/setup.py index bd6119e..f5d5377 100644 --- a/setup.py +++ b/setup.py @@ -16,6 +16,7 @@ setup( url="https://github.com/bellingcat/tiktok-hashtag-analysis", license="MIT License", install_requires=["seaborn", "matplotlib", "TikTokApi", "requests", "yt-dlp"], + extras_require={"test": ["pytest", "pytest-cov", "pytest-html", "pytest-metadata"]}, classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Information Technology", @@ -25,7 +26,7 @@ setup( ], entry_points={ "console_scripts": [ - "tiktok-hashtag-analysis=tiktok_hashtag_analysis.__main__:main", + "tiktok-hashtag-analysis=tiktok_hashtag_analysis.cli:main", ] }, ) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/auth.py b/tests/auth.py new file mode 100644 index 0000000..6d0c078 --- /dev/null +++ b/tests/auth.py @@ -0,0 +1,24 @@ +import pytest + +from tiktok_hashtag_analysis.auth import Authorization + +MS_TOKEN = "thisisafakemstokenfortiktok" + + +def test_auth_input(tmp_path, monkeypatch): + config_file = 
tmp_path / ".tiktok" + monkeypatch.setattr("builtins.input", lambda _: MS_TOKEN) + auth = Authorization(config_file=config_file) + auth.get_token() + + assert auth.ms_token == MS_TOKEN + + +def test_auth(tmp_path): + config_file = tmp_path / ".tiktok" + auth = Authorization(config_file=config_file) + + auth.dump_token(ms_token=MS_TOKEN) + auth.get_token() + + assert auth.ms_token == MS_TOKEN diff --git a/tests/base.py b/tests/base.py new file mode 100644 index 0000000..c0d2a07 --- /dev/null +++ b/tests/base.py @@ -0,0 +1,15 @@ +from tiktok_hashtag_analysis.base import TikTokDownloader, load_hashtags_from_file + + +def test_scrape(tmp_path, hashtags): + downloader = TikTokDownloader(hashtags=hashtags[:1], data_dir=tmp_path) + downloader.run(download=True, plot=True, table=True, number=20) + + +def test_load_hashtags_from_file(tmp_path, hashtags): + file = tmp_path / "hashtags.txt" + with open(file, "w", encoding="utf-8") as f: + f.write("\n".join(hashtags)) + + loaded_hashtags = load_hashtags_from_file(file=file) + assert loaded_hashtags == hashtags diff --git a/tests/cli.py b/tests/cli.py new file mode 100644 index 0000000..dd58f5e --- /dev/null +++ b/tests/cli.py @@ -0,0 +1,31 @@ +import pytest + +from tiktok_hashtag_analysis.cli import create_parser + +ARGUMENTS = [ + ("file", "hashtags.txt", "--file"), + ("download", True, "--download"), + ("download", True, "-d"), + ("number", 20, "--number"), + ("plot", True, "--plot"), + ("plot", True, "-p"), + ("table", True, "--table"), + ("table", True, "-t"), + ("output_dir", "/tmp/tiktok_download", "--output-dir"), + ("config", "~/.tiktok", "--config"), + ("log", "../logfile.log", "--log"), +] + + +@pytest.mark.parametrize("attribute,value,flag", ARGUMENTS) +def test_parser(hashtags, attribute, value, flag): + argument_list = [*hashtags, flag] + + if not isinstance(value, bool): + argument_list.append(str(value)) + + parser = create_parser() + args = vars(parser.parse_args(argument_list)) + + assert args.get(attribute) 
== value + assert args.get("hashtags") == hashtags diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..b5c096d --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,11 @@ +import os +import tempfile + +import pytest + +TEST_HASHTAGS = ["embraceeuropa", "francisparkeryockey"] + + +@pytest.fixture(scope="package") +def hashtags(): + return TEST_HASHTAGS diff --git a/tiktok_hashtag_analysis/__init__.py b/tiktok_hashtag_analysis/__init__.py index 8c0d5d5..7a97c27 100644 --- a/tiktok_hashtag_analysis/__init__.py +++ b/tiktok_hashtag_analysis/__init__.py @@ -1 +1,3 @@ __version__ = "2.0.0" + +from .base import TikTokDownloader diff --git a/tiktok_hashtag_analysis/auth.py b/tiktok_hashtag_analysis/auth.py index 5d5ac16..25c2222 100644 --- a/tiktok_hashtag_analysis/auth.py +++ b/tiktok_hashtag_analysis/auth.py @@ -15,7 +15,6 @@ class Authorization: self.config_file = Path.home() / ".tiktok" self.section = "TikTok" - self.get_token() def get_token(self) -> str: """Load the "msToken" cookie taken from TikTok, which the scraper requires.""" diff --git a/tiktok_hashtag_analysis/base.py b/tiktok_hashtag_analysis/base.py index 77623a3..e059dbb 100644 --- a/tiktok_hashtag_analysis/base.py +++ b/tiktok_hashtag_analysis/base.py @@ -109,7 +109,7 @@ class TikTokDownloader: os.makedirs(self.data_dir, exist_ok=True) self.auth = Authorization(config_file=config_file) - self.ms_token = self.auth.ms_token + self.ms_token = self.auth.get_token() def get_hashtag_posts(self, hashtag: str): """Fetch data about posts that used a specified hashtag and merge with diff --git a/tiktok_hashtag_analysis/__main__.py b/tiktok_hashtag_analysis/cli.py similarity index 100% rename from tiktok_hashtag_analysis/__main__.py rename to tiktok_hashtag_analysis/cli.py From 8c32a3cf1642998aaa4d9916ba1ec06a588dcaac Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Mon, 4 Sep 2023 13:51:28 -0500 Subject: [PATCH 5/6] updated README, made yt-dlp downloading more robust against errors, 
changed name of videos folder to media (since images and audio files are also downloaded now) --- README.md | 29 +++++++++++++++-------------- tiktok_hashtag_analysis/base.py | 6 +++--- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 75e5e26..2c51e2e 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ You should now be ready to start using it. ## About the tool ### Command-line arguments ``` -usage: tiktok-hashtag-analysis [-h] [--file FILE] [-d] [--number NUMBER] [-p] [-t] [--output-dir OUTPUT_DIR] [--log LOG] [hashtags ...] +usage: tiktok-hashtag-analysis [-h] [--file FILE] [-d] [--number NUMBER] [-p] [-t] [--output-dir OUTPUT_DIR] [--config CONFIG] [--log LOG] [hashtags ...] Analyze hashtags within posts scraped from TikTok. @@ -31,6 +31,7 @@ optional arguments: -t, --table Print a table of the most common co-occurring hashtags --output-dir OUTPUT_DIR Directory to save scraped data and visualizations to + --config CONFIG File name of configuration file to store TikTok credentials to --log LOG File to write logs to ``` @@ -38,23 +39,23 @@ optional arguments: ``` $ tree ../data ../data -├── ids -│ └── post_ids.json ├── london -│ └── posts -│ └── data.json +│ ├── plots +│ ├── posts.json +│ └── media ├── newyork -│ └── posts -│ └── data.json +│ ├── plots +│ ├── posts.json +│ └── media └── paris - └── posts - └── data.json +│ ├── plots +│ ├── posts.json +│ └── media ``` The `data` folder contains all the downloaded data as shown in the tree diagram above. -- The `ids` folder contains two files `post_ids.json` and `video_ids.json` that record the ids of the downloaded posts and videos for each hashtag. -- Each hashtag has a folder with two subfolders `posts` and `videos` that store posts and videos respectively. The posts are stored in the `data.json` file in the `posts` folder, and videos are stored as the `.mp4` files in the `videos` folder. 
+- Each hashtag has a folder with two subfolders `plots` and `media` that store plots of the most common co-occurring hashtags, and media downloaded from the posts. The posts are stored in the `posts.json` file, and downloaded media is stored as `.mp4` files (for videos) or audio and image files (for image galleries) in the `media` folder. ## How to use @@ -75,8 +76,8 @@ and will produce an output similar to the following log: - The list of hashtags to scrape is specified as a positional argument ### Video downloading -Running the `tiktok-hashtag-analysis` script with the following options will scrape trending videos containing the hashtag `#london`: -`tiktok-hashtag-analysis download london --download` +Running the `tiktok-hashtag-analysis` script with the following options will scrape trending posts containing the hashtag `#london`: +`tiktok-hashtag-analysis london --download` - The `--download` flag specifies that video files for scraped posts should be downloaded @@ -84,7 +85,7 @@ Note that video downloading is a time and data rate consuming task, as a result ## Analyzing results ### Most common co-occurring hashtags -In addition to scraping data and downloading videos, the `tiktok-hashtag-analysis` script can also analyze the frequencies of the most common co-occurring hashtags in a given set of posts. +In addition to scraping data and downloading media, the `tiktok-hashtag-analysis` script can also analyze the frequencies of the most common co-occurring hashtags in a given set of posts. Assume we want to analyze the 20 most frequently co-occurring hashtags in the downloaded posts of the `#london` hashtag. 
diff --git a/tiktok_hashtag_analysis/base.py b/tiktok_hashtag_analysis/base.py index e059dbb..c6aed7e 100644 --- a/tiktok_hashtag_analysis/base.py +++ b/tiktok_hashtag_analysis/base.py @@ -155,7 +155,7 @@ class TikTokDownloader: # Define file containing post data and directory to save videos to hashtag_file = self.data_dir / hashtag / "posts.json" - video_dir = self.data_dir / hashtag / "videos" + video_dir = self.data_dir / hashtag / "media" video_dir.mkdir(exist_ok=True) # Get list of post IDs that have previously had their media downloaded @@ -191,8 +191,8 @@ class TikTokDownloader: # Download video files for all video posts if len(urls_to_download) > 0: - logging.info(f"Downloading videos for hashtag {hashtag}") - ydl_opts = {"outtmpl": os.path.join(video_dir, "%(id)s.%(ext)s")} + logging.info(f"Downloading media for hashtag {hashtag}") + ydl_opts = {"outtmpl": os.path.join(video_dir, "%(id)s.%(ext)s"), "ignore_errors": True} with yt_dlp.YoutubeDL(ydl_opts) as ydl: ydl.download(urls_to_download) From 10821e30f2f9dff488d3eedca99880fab7f8bb18 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Wed, 6 Sep 2023 09:51:31 -0500 Subject: [PATCH 6/6] preparing for publishing (removed pipenv commands from workflow, added Contributing section on README, added functionality to pin dependency versions with requirements.txt) --- .github/workflows/python-publish.yaml | 7 ++--- .gitignore | 1 + README.md | 15 +++++++++++ requirements.txt | 5 ++++ setup.py | 38 ++++++++++++++++++++++++--- tiktok_hashtag_analysis/__init__.py | 2 -- tiktok_hashtag_analysis/auth.py | 1 + tiktok_hashtag_analysis/base.py | 5 +++- tiktok_hashtag_analysis/version.py | 11 ++++++++ 9 files changed, 74 insertions(+), 11 deletions(-) create mode 100644 requirements.txt create mode 100644 tiktok_hashtag_analysis/version.py diff --git a/.github/workflows/python-publish.yaml b/.github/workflows/python-publish.yaml index 5ce8e63..83d16e0 100644 --- a/.github/workflows/python-publish.yaml +++ 
b/.github/workflows/python-publish.yaml @@ -33,15 +33,12 @@ jobs: - name: Install dependencies run: | - python -m pip install --upgrade --upgrade-strategy=eager pip setuptools wheel twine pipenv + python -m pip install --upgrade --upgrade-strategy=eager pip setuptools wheel twine python -m pip install -e . --upgrade - python -m pipenv install --dev --python 3.10 - env: - PIPENV_DEFAULT_PYTHON_VERSION: "3.10" - name: Build wheels run: | - python -m pipenv run python setup.py sdist bdist_wheel + python setup.py sdist bdist_wheel - name: Publish a Python distribution to PyPI uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.gitignore b/.gitignore index 525e540..d5095d6 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ data/ build/ *.egg-info/ +dist/ # Miscellaneous files **/.DS_Store diff --git a/README.md b/README.md index 2c51e2e..5f891ac 100644 --- a/README.md +++ b/README.md @@ -132,3 +132,18 @@ Assume we want to analyze the 20 most frequently co-occurring hashtags in the do ``` The `Frequency` column shows the ratio of the occurrence to the total number of downloaded posts. + +### Contributing +To run the built-in tests in the `tests/` directory, first install the test dependency packages: + +``` +pip install .[test] +``` + +and then run the tests using the following command: + +``` +pytest +``` + +This repo uses [black](https://github.com/psf/black) to format source code; please run the `black` command before submitting a PR. 
\ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e4144ef --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +seaborn==0.12.2 +matplotlib==3.7.2 +yt-dlp==2023.7.6 +TikTokApi==6.1.1 +requests==2.31.0 \ No newline at end of file diff --git a/setup.py b/setup.py index f5d5377..5760f41 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,42 @@ from setuptools import setup -from tiktok_hashtag_analysis import __version__ + + +def read_requirements(filename: str): + with open(filename) as requirements_file: + import re + + def fix_url_dependencies(req: str) -> str: + """Pip and setuptools disagree about how URL dependencies should be handled.""" + m = re.match( + r"^(git\+)?(https|ssh)://(git@)?github\.com/([\w-]+)/(?P<name>[\w-]+)\.git", + req, + ) + if m is None: + return req + else: + return f"{m.group('name')} @ {req}" + + requirements = [] + for line in requirements_file: + line = line.strip() + if line.startswith("#") or len(line) <= 0: + continue + requirements.append(fix_url_dependencies(line)) + return requirements + with open("README.md", "r", encoding="utf-8") as file: long_description = file.read() +# version.py defines the VERSION and VERSION_SHORT variables. +# We use exec here so we don't import tiktok_hashtag_analysis whilst setting up. 
+VERSION = {} # type: ignore +with open("tiktok_hashtag_analysis/version.py", "r") as version_file: + exec(version_file.read(), VERSION) + setup( name="tiktok-hashtag-analysis", - version=__version__, + version=VERSION["VERSION"], author="Bellingcat", author_email="tech@bellingcat.com", packages=["tiktok_hashtag_analysis"], @@ -15,7 +45,9 @@ setup( long_description_content_type="text/markdown", url="https://github.com/bellingcat/tiktok-hashtag-analysis", license="MIT License", - install_requires=["seaborn", "matplotlib", "TikTokApi", "requests", "yt-dlp"], + # install_requires=read_requirements("requirements.txt"), + # extras_require={"dev": read_requirements("dev-requirements.txt")}, + install_requires=["seaborn", "matplotlib", "TikTokApi", "requests", "yt_dlp"], extras_require={"test": ["pytest", "pytest-cov", "pytest-html", "pytest-metadata"]}, classifiers=[ "Development Status :: 5 - Production/Stable", diff --git a/tiktok_hashtag_analysis/__init__.py b/tiktok_hashtag_analysis/__init__.py index 7a97c27..eea2898 100644 --- a/tiktok_hashtag_analysis/__init__.py +++ b/tiktok_hashtag_analysis/__init__.py @@ -1,3 +1 @@ -__version__ = "2.0.0" - from .base import TikTokDownloader diff --git a/tiktok_hashtag_analysis/auth.py b/tiktok_hashtag_analysis/auth.py index 25c2222..545e2ce 100644 --- a/tiktok_hashtag_analysis/auth.py +++ b/tiktok_hashtag_analysis/auth.py @@ -15,6 +15,7 @@ class Authorization: self.config_file = Path.home() / ".tiktok" self.section = "TikTok" + self.ms_token = None def get_token(self) -> str: """Load the "msToken" cookie taken from TikTok, which the scraper requires.""" diff --git a/tiktok_hashtag_analysis/base.py b/tiktok_hashtag_analysis/base.py index c6aed7e..d7a9e9e 100644 --- a/tiktok_hashtag_analysis/base.py +++ b/tiktok_hashtag_analysis/base.py @@ -192,7 +192,10 @@ class TikTokDownloader: # Download video files for all video posts if len(urls_to_download) > 0: logging.info(f"Downloading media for hashtag {hashtag}") - ydl_opts = 
{"outtmpl": os.path.join(video_dir, "%(id)s.%(ext)s"), "ignore_errors": True} + ydl_opts = { + "outtmpl": os.path.join(video_dir, "%(id)s.%(ext)s"), + "ignore_errors": True, + } with yt_dlp.YoutubeDL(ydl_opts) as ydl: ydl.download(urls_to_download) diff --git a/tiktok_hashtag_analysis/version.py b/tiktok_hashtag_analysis/version.py new file mode 100644 index 0000000..aba80f2 --- /dev/null +++ b/tiktok_hashtag_analysis/version.py @@ -0,0 +1,11 @@ +_MAJOR = "2" +_MINOR = "0" +# On main and in a nightly release the patch should be one ahead of the last +# released build. +_PATCH = "0" +# This is mainly for nightly builds which have the suffix ".dev$DATE". See +# https://semver.org/#is-v123-a-semantic-version for the semantics. +_SUFFIX = "" + +VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR) +VERSION = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX)