Simplified the downloading logic (the methods for keeping track of files are less necessary now that scraping can be done in Python), added functionality to download videos with yt-dlp, and added functionality to download TikTok image galleries

This commit is contained in:
Tristan Lee
2023-09-01 17:05:13 -05:00
parent 06b4a74c7d
commit a7bd023c21
17 changed files with 364 additions and 1244 deletions

13
Pipfile
View File

@@ -1,13 +0,0 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
matplotlib = "*"
seaborn = "*"
[dev-packages]
[requires]
python_version = "3.10"

416
Pipfile.lock generated
View File

@@ -1,416 +0,0 @@
{
"_meta": {
"hash": {
"sha256": "97c5ef0126b17f586b5fa1d518cf359b7e984e48f8fc2310e9aa79bd384c2374"
},
"pipfile-spec": 6,
"requires": {
"python_version": "3.10"
},
"sources": [
{
"name": "pypi",
"url": "https://pypi.org/simple",
"verify_ssl": true
}
]
},
"default": {
"contourpy": {
"hashes": [
"sha256:031154ed61f7328ad7f97662e48660a150ef84ee1bc8876b6472af88bf5a9b98",
"sha256:0f9d350b639db6c2c233d92c7f213d94d2e444d8e8fc5ca44c9706cf72193772",
"sha256:130230b7e49825c98edf0b428b7aa1125503d91732735ef897786fe5452b1ec2",
"sha256:152fd8f730c31fd67fe0ffebe1df38ab6a669403da93df218801a893645c6ccc",
"sha256:1c71fdd8f1c0f84ffd58fca37d00ca4ebaa9e502fb49825484da075ac0b0b803",
"sha256:24847601071f740837aefb730e01bd169fbcaa610209779a78db7ebb6e6a7051",
"sha256:2e9ebb4425fc1b658e13bace354c48a933b842d53c458f02c86f371cecbedecc",
"sha256:30676ca45084ee61e9c3da589042c24a57592e375d4b138bd84d8709893a1ba4",
"sha256:31a55dccc8426e71817e3fe09b37d6d48ae40aae4ecbc8c7ad59d6893569c436",
"sha256:366a0cf0fc079af5204801786ad7a1c007714ee3909e364dbac1729f5b0849e5",
"sha256:38e2e577f0f092b8e6774459317c05a69935a1755ecfb621c0a98f0e3c09c9a5",
"sha256:3c184ad2433635f216645fdf0493011a4667e8d46b34082f5a3de702b6ec42e3",
"sha256:3caea6365b13119626ee996711ab63e0c9d7496f65641f4459c60a009a1f3e80",
"sha256:3e927b3868bd1e12acee7cc8f3747d815b4ab3e445a28d2e5373a7f4a6e76ba1",
"sha256:4ee3ee247f795a69e53cd91d927146fb16c4e803c7ac86c84104940c7d2cabf0",
"sha256:54d43960d809c4c12508a60b66cb936e7ed57d51fb5e30b513934a4a23874fae",
"sha256:57119b0116e3f408acbdccf9eb6ef19d7fe7baf0d1e9aaa5381489bc1aa56556",
"sha256:58569c491e7f7e874f11519ef46737cea1d6eda1b514e4eb5ac7dab6aa864d02",
"sha256:5a011cf354107b47c58ea932d13b04d93c6d1d69b8b6dce885e642531f847566",
"sha256:5caeacc68642e5f19d707471890f037a13007feba8427eb7f2a60811a1fc1350",
"sha256:5dd34c1ae752515318224cba7fc62b53130c45ac6a1040c8b7c1a223c46e8967",
"sha256:60835badb5ed5f4e194a6f21c09283dd6e007664a86101431bf870d9e86266c4",
"sha256:62398c80ef57589bdbe1eb8537127321c1abcfdf8c5f14f479dbbe27d0322e66",
"sha256:6381fa66866b0ea35e15d197fc06ac3840a9b2643a6475c8fff267db8b9f1e69",
"sha256:64757f6460fc55d7e16ed4f1de193f362104285c667c112b50a804d482777edd",
"sha256:69f8ff4db108815addd900a74df665e135dbbd6547a8a69333a68e1f6e368ac2",
"sha256:6c180d89a28787e4b73b07e9b0e2dac7741261dbdca95f2b489c4f8f887dd810",
"sha256:71b0bf0c30d432278793d2141362ac853859e87de0a7dee24a1cea35231f0d50",
"sha256:769eef00437edf115e24d87f8926955f00f7704bede656ce605097584f9966dc",
"sha256:7f6979d20ee5693a1057ab53e043adffa1e7418d734c1532e2d9e915b08d8ec2",
"sha256:87f4d8941a9564cda3f7fa6a6cd9b32ec575830780677932abdec7bcb61717b0",
"sha256:89ba9bb365446a22411f0673abf6ee1fea3b2cf47b37533b970904880ceb72f3",
"sha256:8acf74b5d383414401926c1598ed77825cd530ac7b463ebc2e4f46638f56cce6",
"sha256:9056c5310eb1daa33fc234ef39ebfb8c8e2533f088bbf0bc7350f70a29bde1ac",
"sha256:95c3acddf921944f241b6773b767f1cbce71d03307270e2d769fd584d5d1092d",
"sha256:9e20e5a1908e18aaa60d9077a6d8753090e3f85ca25da6e25d30dc0a9e84c2c6",
"sha256:a1e97b86f73715e8670ef45292d7cc033548266f07d54e2183ecb3c87598888f",
"sha256:a877ada905f7d69b2a31796c4b66e31a8068b37aa9b78832d41c82fc3e056ddd",
"sha256:a9d7587d2fdc820cc9177139b56795c39fb8560f540bba9ceea215f1f66e1566",
"sha256:abf298af1e7ad44eeb93501e40eb5a67abbf93b5d90e468d01fc0c4451971afa",
"sha256:ae90d5a8590e5310c32a7630b4b8618cef7563cebf649011da80874d0aa8f414",
"sha256:b6d0f9e1d39dbfb3977f9dd79f156c86eb03e57a7face96f199e02b18e58d32a",
"sha256:b8d587cc39057d0afd4166083d289bdeff221ac6d3ee5046aef2d480dc4b503c",
"sha256:c5210e5d5117e9aec8c47d9156d1d3835570dd909a899171b9535cb4a3f32693",
"sha256:cc331c13902d0f50845099434cd936d49d7a2ca76cb654b39691974cb1e4812d",
"sha256:ce41676b3d0dd16dbcfabcc1dc46090aaf4688fd6e819ef343dbda5a57ef0161",
"sha256:d8165a088d31798b59e91117d1f5fc3df8168d8b48c4acc10fc0df0d0bdbcc5e",
"sha256:e7281244c99fd7c6f27c1c6bfafba878517b0b62925a09b586d88ce750a016d2",
"sha256:e96a08b62bb8de960d3a6afbc5ed8421bf1a2d9c85cc4ea73f4bc81b4910500f",
"sha256:ed33433fc3820263a6368e532f19ddb4c5990855e4886088ad84fd7c4e561c71",
"sha256:efb8f6d08ca7998cf59eaf50c9d60717f29a1a0a09caa46460d33b2924839dbd",
"sha256:efe99298ba37e37787f6a2ea868265465410822f7bea163edcc1bd3903354ea9",
"sha256:f99e9486bf1bb979d95d5cffed40689cb595abb2b841f2991fc894b3452290e8",
"sha256:fc1464c97579da9f3ab16763c32e5c5d5bb5fa1ec7ce509a4ca6108b61b84fab",
"sha256:fd7dc0e6812b799a34f6d12fcb1000539098c249c8da54f3566c6a6461d0dbad"
],
"markers": "python_version >= '3.8'",
"version": "==1.0.7"
},
"cycler": {
"hashes": [
"sha256:3a27e95f763a428a739d2add979fa7494c912a32c17c4c38c4d5f082cad165a3",
"sha256:9c87405839a19696e837b3b818fed3f5f69f16f1eec1a1ad77e043dcea9c772f"
],
"markers": "python_version >= '3.6'",
"version": "==0.11.0"
},
"fonttools": {
"hashes": [
"sha256:2bb244009f9bf3fa100fc3ead6aeb99febe5985fa20afbfbaa2f8946c2fbdaf1",
"sha256:820466f43c8be8c3009aef8b87e785014133508f0de64ec469e4efb643ae54fb"
],
"markers": "python_version >= '3.7'",
"version": "==4.38.0"
},
"kiwisolver": {
"hashes": [
"sha256:02f79693ec433cb4b5f51694e8477ae83b3205768a6fb48ffba60549080e295b",
"sha256:03baab2d6b4a54ddbb43bba1a3a2d1627e82d205c5cf8f4c924dc49284b87166",
"sha256:1041feb4cda8708ce73bb4dcb9ce1ccf49d553bf87c3954bdfa46f0c3f77252c",
"sha256:10ee06759482c78bdb864f4109886dff7b8a56529bc1609d4f1112b93fe6423c",
"sha256:1d1573129aa0fd901076e2bfb4275a35f5b7aa60fbfb984499d661ec950320b0",
"sha256:283dffbf061a4ec60391d51e6155e372a1f7a4f5b15d59c8505339454f8989e4",
"sha256:28bc5b299f48150b5f822ce68624e445040595a4ac3d59251703779836eceff9",
"sha256:2a66fdfb34e05b705620dd567f5a03f239a088d5a3f321e7b6ac3239d22aa286",
"sha256:2e307eb9bd99801f82789b44bb45e9f541961831c7311521b13a6c85afc09767",
"sha256:2e407cb4bd5a13984a6c2c0fe1845e4e41e96f183e5e5cd4d77a857d9693494c",
"sha256:2f5e60fabb7343a836360c4f0919b8cd0d6dbf08ad2ca6b9cf90bf0c76a3c4f6",
"sha256:36dafec3d6d6088d34e2de6b85f9d8e2324eb734162fba59d2ba9ed7a2043d5b",
"sha256:3fe20f63c9ecee44560d0e7f116b3a747a5d7203376abeea292ab3152334d004",
"sha256:41dae968a94b1ef1897cb322b39360a0812661dba7c682aa45098eb8e193dbdf",
"sha256:4bd472dbe5e136f96a4b18f295d159d7f26fd399136f5b17b08c4e5f498cd494",
"sha256:4ea39b0ccc4f5d803e3337dd46bcce60b702be4d86fd0b3d7531ef10fd99a1ac",
"sha256:5853eb494c71e267912275e5586fe281444eb5e722de4e131cddf9d442615626",
"sha256:5bce61af018b0cb2055e0e72e7d65290d822d3feee430b7b8203d8a855e78766",
"sha256:6295ecd49304dcf3bfbfa45d9a081c96509e95f4b9d0eb7ee4ec0530c4a96514",
"sha256:62ac9cc684da4cf1778d07a89bf5f81b35834cb96ca523d3a7fb32509380cbf6",
"sha256:70e7c2e7b750585569564e2e5ca9845acfaa5da56ac46df68414f29fea97be9f",
"sha256:7577c1987baa3adc4b3c62c33bd1118c3ef5c8ddef36f0f2c950ae0b199e100d",
"sha256:75facbe9606748f43428fc91a43edb46c7ff68889b91fa31f53b58894503a191",
"sha256:787518a6789009c159453da4d6b683f468ef7a65bbde796bcea803ccf191058d",
"sha256:78d6601aed50c74e0ef02f4204da1816147a6d3fbdc8b3872d263338a9052c51",
"sha256:7c43e1e1206cd421cd92e6b3280d4385d41d7166b3ed577ac20444b6995a445f",
"sha256:81e38381b782cc7e1e46c4e14cd997ee6040768101aefc8fa3c24a4cc58e98f8",
"sha256:841293b17ad704d70c578f1f0013c890e219952169ce8a24ebc063eecf775454",
"sha256:872b8ca05c40d309ed13eb2e582cab0c5a05e81e987ab9c521bf05ad1d5cf5cb",
"sha256:877272cf6b4b7e94c9614f9b10140e198d2186363728ed0f701c6eee1baec1da",
"sha256:8c808594c88a025d4e322d5bb549282c93c8e1ba71b790f539567932722d7bd8",
"sha256:8ed58b8acf29798b036d347791141767ccf65eee7f26bde03a71c944449e53de",
"sha256:91672bacaa030f92fc2f43b620d7b337fd9a5af28b0d6ed3f77afc43c4a64b5a",
"sha256:968f44fdbf6dd757d12920d63b566eeb4d5b395fd2d00d29d7ef00a00582aac9",
"sha256:9f85003f5dfa867e86d53fac6f7e6f30c045673fa27b603c397753bebadc3008",
"sha256:a553dadda40fef6bfa1456dc4be49b113aa92c2a9a9e8711e955618cd69622e3",
"sha256:a68b62a02953b9841730db7797422f983935aeefceb1679f0fc85cbfbd311c32",
"sha256:abbe9fa13da955feb8202e215c4018f4bb57469b1b78c7a4c5c7b93001699938",
"sha256:ad881edc7ccb9d65b0224f4e4d05a1e85cf62d73aab798943df6d48ab0cd79a1",
"sha256:b1792d939ec70abe76f5054d3f36ed5656021dcad1322d1cc996d4e54165cef9",
"sha256:b428ef021242344340460fa4c9185d0b1f66fbdbfecc6c63eff4b7c29fad429d",
"sha256:b533558eae785e33e8c148a8d9921692a9fe5aa516efbdff8606e7d87b9d5824",
"sha256:ba59c92039ec0a66103b1d5fe588fa546373587a7d68f5c96f743c3396afc04b",
"sha256:bc8d3bd6c72b2dd9decf16ce70e20abcb3274ba01b4e1c96031e0c4067d1e7cd",
"sha256:bc9db8a3efb3e403e4ecc6cd9489ea2bac94244f80c78e27c31dcc00d2790ac2",
"sha256:bf7d9fce9bcc4752ca4a1b80aabd38f6d19009ea5cbda0e0856983cf6d0023f5",
"sha256:c2dbb44c3f7e6c4d3487b31037b1bdbf424d97687c1747ce4ff2895795c9bf69",
"sha256:c79ebe8f3676a4c6630fd3f777f3cfecf9289666c84e775a67d1d358578dc2e3",
"sha256:c97528e64cb9ebeff9701e7938653a9951922f2a38bd847787d4a8e498cc83ae",
"sha256:d0611a0a2a518464c05ddd5a3a1a0e856ccc10e67079bb17f265ad19ab3c7597",
"sha256:d06adcfa62a4431d404c31216f0f8ac97397d799cd53800e9d3efc2fbb3cf14e",
"sha256:d41997519fcba4a1e46eb4a2fe31bc12f0ff957b2b81bac28db24744f333e955",
"sha256:d5b61785a9ce44e5a4b880272baa7cf6c8f48a5180c3e81c59553ba0cb0821ca",
"sha256:da152d8cdcab0e56e4f45eb08b9aea6455845ec83172092f09b0e077ece2cf7a",
"sha256:da7e547706e69e45d95e116e6939488d62174e033b763ab1496b4c29b76fabea",
"sha256:db5283d90da4174865d520e7366801a93777201e91e79bacbac6e6927cbceede",
"sha256:db608a6757adabb32f1cfe6066e39b3706d8c3aa69bbc353a5b61edad36a5cb4",
"sha256:e0ea21f66820452a3f5d1655f8704a60d66ba1191359b96541eaf457710a5fc6",
"sha256:e7da3fec7408813a7cebc9e4ec55afed2d0fd65c4754bc376bf03498d4e92686",
"sha256:e92a513161077b53447160b9bd8f522edfbed4bd9759e4c18ab05d7ef7e49408",
"sha256:ecb1fa0db7bf4cff9dac752abb19505a233c7f16684c5826d1f11ebd9472b871",
"sha256:efda5fc8cc1c61e4f639b8067d118e742b812c930f708e6667a5ce0d13499e29",
"sha256:f0a1dbdb5ecbef0d34eb77e56fcb3e95bbd7e50835d9782a45df81cc46949750",
"sha256:f0a71d85ecdd570ded8ac3d1c0f480842f49a40beb423bb8014539a9f32a5897",
"sha256:f4f270de01dd3e129a72efad823da90cc4d6aafb64c410c9033aba70db9f1ff0",
"sha256:f6cb459eea32a4e2cf18ba5fcece2dbdf496384413bc1bae15583f19e567f3b2",
"sha256:f8ad8285b01b0d4695102546b342b493b3ccc6781fc28c8c6a1bb63e95d22f09",
"sha256:f9f39e2f049db33a908319cf46624a569b36983c7c78318e9726a4cb8923b26c"
],
"markers": "python_version >= '3.7'",
"version": "==1.4.4"
},
"matplotlib": {
"hashes": [
"sha256:01b7f521a9a73c383825813af255f8c4485d1706e4f3e2ed5ae771e4403a40ab",
"sha256:11011c97d62c1db7bc20509572557842dbb8c2a2ddd3dd7f20501aa1cde3e54e",
"sha256:1183877d008c752d7d535396096c910f4663e4b74a18313adee1213328388e1e",
"sha256:12f999661589981e74d793ee2f41b924b3b87d65fd929f6153bf0f30675c59b1",
"sha256:1c235bf9be052347373f589e018988cad177abb3f997ab1a2e2210c41562cc0c",
"sha256:1f4d69707b1677560cd952544ee4962f68ff07952fb9069ff8c12b56353cb8c9",
"sha256:1fcc4cad498533d3c393a160975acc9b36ffa224d15a6b90ae579eacee5d8579",
"sha256:2787a16df07370dcba385fe20cdd0cc3cfaabd3c873ddabca78c10514c799721",
"sha256:29f17b7f2e068dc346687cbdf80b430580bab42346625821c2d3abf3a1ec5417",
"sha256:38d38cb1ea1d80ee0f6351b65c6f76cad6060bbbead015720ba001348ae90f0c",
"sha256:3f56a7252eee8f3438447f75f5e1148a1896a2756a92285fe5d73bed6deebff4",
"sha256:5223affa21050fb6118353c1380c15e23aedfb436bf3e162c26dc950617a7519",
"sha256:57ad1aee29043163374bfa8990e1a2a10ff72c9a1bfaa92e9c46f6ea59269121",
"sha256:59400cc9451094b7f08cc3f321972e6e1db4cd37a978d4e8a12824bf7fd2f03b",
"sha256:68d94a436f62b8a861bf3ace82067a71bafb724b4e4f9133521e4d8012420dd7",
"sha256:6adc441b5b2098a4b904bbf9d9e92fb816fef50c55aa2ea6a823fc89b94bb838",
"sha256:6d81b11ede69e3a751424b98dc869c96c10256b2206bfdf41f9c720eee86844c",
"sha256:73b93af33634ed919e72811c9703e1105185cd3fb46d76f30b7f4cfbbd063f89",
"sha256:77b384cee7ab8cf75ffccbfea351a09b97564fc62d149827a5e864bec81526e5",
"sha256:79e501eb847f4a489eb7065bb8d3187117f65a4c02d12ea3a19d6c5bef173bcc",
"sha256:809119d1cba3ece3c9742eb01827fe7a0e781ea3c5d89534655a75e07979344f",
"sha256:80c166a0e28512e26755f69040e6bf2f946a02ffdb7c00bf6158cca3d2b146e6",
"sha256:81b409b2790cf8d7c1ef35920f01676d2ae7afa8241844e7aa5484fdf493a9a0",
"sha256:994637e2995b0342699b396a320698b07cd148bbcf2dd2fa2daba73f34dd19f2",
"sha256:9ceebaf73f1a3444fa11014f38b9da37ff7ea328d6efa1652241fe3777bfdab9",
"sha256:9fb8fb19d03abf3c5dab89a8677e62c4023632f919a62b6dd1d6d2dbf42cd9f5",
"sha256:acc3b1a4bddbf56fe461e36fb9ef94c2cb607fc90d24ccc650040bfcc7610de4",
"sha256:bbddfeb1495484351fb5b30cf5bdf06b3de0bc4626a707d29e43dfd61af2a780",
"sha256:bbf269e1d24bc25247095d71c7a969813f7080e2a7c6fa28931a603f747ab012",
"sha256:bebcff4c3ed02c6399d47329f3554193abd824d3d53b5ca02cf583bcd94470e2",
"sha256:c3f08df2ac4636249b8bc7a85b8b82c983bef1441595936f62c2918370ca7e1d",
"sha256:ca94f0362f6b6f424b555b956971dcb94b12d0368a6c3e07dc7a40d32d6d873d",
"sha256:d00c248ab6b92bea3f8148714837937053a083ff03b4c5e30ed37e28fc0e7e56",
"sha256:d2cfaa7fd62294d945b8843ea24228a27c8e7c5b48fa634f3c168153b825a21b",
"sha256:d5f18430f5cfa5571ab8f4c72c89af52aa0618e864c60028f11a857d62200cba",
"sha256:debeab8e2ab07e5e3dac33e12456da79c7e104270d2b2d1df92b9e40347cca75",
"sha256:dfba7057609ca9567b9704626756f0142e97ec8c5ba2c70c6e7bd1c25ef99f06",
"sha256:e0a64d7cc336b52e90f59e6d638ae847b966f68582a7af041e063d568e814740",
"sha256:eb9421c403ffd387fbe729de6d9a03005bf42faba5e8432f4e51e703215b49fc",
"sha256:faff486b36530a836a6b4395850322e74211cd81fc17f28b4904e1bd53668e3e",
"sha256:ff2aa84e74f80891e6bcf292ebb1dd57714ffbe13177642d65fee25384a30894"
],
"index": "pypi",
"version": "==3.6.3"
},
"numpy": {
"hashes": [
"sha256:003a9f530e880cb2cd177cba1af7220b9aa42def9c4afc2a2fc3ee6be7eb2b22",
"sha256:150947adbdfeceec4e5926d956a06865c1c690f2fd902efede4ca6fe2e657c3f",
"sha256:2620e8592136e073bd12ee4536149380695fbe9ebeae845b81237f986479ffc9",
"sha256:2eabd64ddb96a1239791da78fa5f4e1693ae2dadc82a76bc76a14cbb2b966e96",
"sha256:4173bde9fa2a005c2c6e2ea8ac1618e2ed2c1c6ec8a7657237854d42094123a0",
"sha256:4199e7cfc307a778f72d293372736223e39ec9ac096ff0a2e64853b866a8e18a",
"sha256:4cecaed30dc14123020f77b03601559fff3e6cd0c048f8b5289f4eeabb0eb281",
"sha256:557d42778a6869c2162deb40ad82612645e21d79e11c1dc62c6e82a2220ffb04",
"sha256:63e45511ee4d9d976637d11e6c9864eae50e12dc9598f531c035265991910468",
"sha256:6524630f71631be2dabe0c541e7675db82651eb998496bbe16bc4f77f0772253",
"sha256:76807b4063f0002c8532cfeac47a3068a69561e9c8715efdad3c642eb27c0756",
"sha256:7de8fdde0003f4294655aa5d5f0a89c26b9f22c0a58790c38fae1ed392d44a5a",
"sha256:889b2cc88b837d86eda1b17008ebeb679d82875022200c6e8e4ce6cf549b7acb",
"sha256:92011118955724465fb6853def593cf397b4a1367495e0b59a7e69d40c4eb71d",
"sha256:97cf27e51fa078078c649a51d7ade3c92d9e709ba2bfb97493007103c741f1d0",
"sha256:9a23f8440561a633204a67fb44617ce2a299beecf3295f0d13c495518908e910",
"sha256:a51725a815a6188c662fb66fb32077709a9ca38053f0274640293a14fdd22978",
"sha256:a77d3e1163a7770164404607b7ba3967fb49b24782a6ef85d9b5f54126cc39e5",
"sha256:adbdce121896fd3a17a77ab0b0b5eedf05a9834a18699db6829a64e1dfccca7f",
"sha256:c29e6bd0ec49a44d7690ecb623a8eac5ab8a923bce0bea6293953992edf3a76a",
"sha256:c72a6b2f4af1adfe193f7beb91ddf708ff867a3f977ef2ec53c0ffb8283ab9f5",
"sha256:d0a2db9d20117bf523dde15858398e7c0858aadca7c0f088ac0d6edd360e9ad2",
"sha256:e3ab5d32784e843fc0dd3ab6dcafc67ef806e6b6828dc6af2f689be0eb4d781d",
"sha256:e428c4fbfa085f947b536706a2fc349245d7baa8334f0c5723c56a10595f9b95",
"sha256:e8d2859428712785e8a8b7d2b3ef0a1d1565892367b32f915c4a4df44d0e64f5",
"sha256:eef70b4fc1e872ebddc38cddacc87c19a3709c0e3e5d20bf3954c147b1dd941d",
"sha256:f64bb98ac59b3ea3bf74b02f13836eb2e24e48e0ab0145bbda646295769bd780",
"sha256:f9006288bcf4895917d02583cf3411f98631275bc67cce355a7f39f8c14338fa"
],
"markers": "python_version >= '3.8'",
"version": "==1.24.2"
},
"packaging": {
"hashes": [
"sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2",
"sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"
],
"markers": "python_version >= '3.7'",
"version": "==23.0"
},
"pandas": {
"hashes": [
"sha256:14e45300521902689a81f3f41386dc86f19b8ba8dd5ac5a3c7010ef8d2932813",
"sha256:26d9c71772c7afb9d5046e6e9cf42d83dd147b5cf5bcb9d97252077118543792",
"sha256:3749077d86e3a2f0ed51367f30bf5b82e131cc0f14260c4d3e499186fccc4406",
"sha256:41179ce559943d83a9b4bbacb736b04c928b095b5f25dd2b7389eda08f46f373",
"sha256:478ff646ca42b20376e4ed3fa2e8d7341e8a63105586efe54fa2508ee087f328",
"sha256:50869a35cbb0f2e0cd5ec04b191e7b12ed688874bd05dd777c19b28cbea90996",
"sha256:565fa34a5434d38e9d250af3c12ff931abaf88050551d9fbcdfafca50d62babf",
"sha256:5f2b952406a1588ad4cad5b3f55f520e82e902388a6d5a4a91baa8d38d23c7f6",
"sha256:5fbcb19d6fceb9e946b3e23258757c7b225ba450990d9ed63ccceeb8cae609f7",
"sha256:6973549c01ca91ec96199e940495219c887ea815b2083722821f1d7abfa2b4dc",
"sha256:74a3fd7e5a7ec052f183273dc7b0acd3a863edf7520f5d3a1765c04ffdb3b0b1",
"sha256:7a0a56cef15fd1586726dace5616db75ebcfec9179a3a55e78f72c5639fa2a23",
"sha256:7cec0bee9f294e5de5bbfc14d0573f65526071029d036b753ee6507d2a21480a",
"sha256:87bd9c03da1ac870a6d2c8902a0e1fd4267ca00f13bc494c9e5a9020920e1d51",
"sha256:972d8a45395f2a2d26733eb8d0f629b2f90bebe8e8eddbb8829b180c09639572",
"sha256:9842b6f4b8479e41968eced654487258ed81df7d1c9b7b870ceea24ed9459b31",
"sha256:9f69c4029613de47816b1bb30ff5ac778686688751a5e9c99ad8c7031f6508e5",
"sha256:a50d9a4336a9621cab7b8eb3fb11adb82de58f9b91d84c2cd526576b881a0c5a",
"sha256:bc4c368f42b551bf72fac35c5128963a171b40dce866fb066540eeaf46faa003",
"sha256:c39a8da13cede5adcd3be1182883aea1c925476f4e84b2807a46e2775306305d",
"sha256:c3ac844a0fe00bfaeb2c9b51ab1424e5c8744f89860b138434a363b1f620f354",
"sha256:c4c00e0b0597c8e4f59e8d461f797e5d70b4d025880516a8261b2817c47759ee",
"sha256:c74a62747864ed568f5a82a49a23a8d7fe171d0c69038b38cedf0976831296fa",
"sha256:dd05f7783b3274aa206a1af06f0ceed3f9b412cf665b7247eacd83be41cf7bf0",
"sha256:dfd681c5dc216037e0b0a2c821f5ed99ba9f03ebcf119c7dac0e9a7b960b9ec9",
"sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae",
"sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc"
],
"markers": "python_version >= '3.8'",
"version": "==1.5.3"
},
"pillow": {
"hashes": [
"sha256:013016af6b3a12a2f40b704677f8b51f72cb007dac785a9933d5c86a72a7fe33",
"sha256:0845adc64fe9886db00f5ab68c4a8cd933ab749a87747555cec1c95acea64b0b",
"sha256:0884ba7b515163a1a05440a138adeb722b8a6ae2c2b33aea93ea3118dd3a899e",
"sha256:09b89ddc95c248ee788328528e6a2996e09eaccddeeb82a5356e92645733be35",
"sha256:0dd4c681b82214b36273c18ca7ee87065a50e013112eea7d78c7a1b89a739153",
"sha256:0e51f608da093e5d9038c592b5b575cadc12fd748af1479b5e858045fff955a9",
"sha256:0f3269304c1a7ce82f1759c12ce731ef9b6e95b6df829dccd9fe42912cc48569",
"sha256:16a8df99701f9095bea8a6c4b3197da105df6f74e6176c5b410bc2df2fd29a57",
"sha256:19005a8e58b7c1796bc0167862b1f54a64d3b44ee5d48152b06bb861458bc0f8",
"sha256:1b4b4e9dda4f4e4c4e6896f93e84a8f0bcca3b059de9ddf67dac3c334b1195e1",
"sha256:28676836c7796805914b76b1837a40f76827ee0d5398f72f7dcc634bae7c6264",
"sha256:2968c58feca624bb6c8502f9564dd187d0e1389964898f5e9e1fbc8533169157",
"sha256:3f4cc516e0b264c8d4ccd6b6cbc69a07c6d582d8337df79be1e15a5056b258c9",
"sha256:3fa1284762aacca6dc97474ee9c16f83990b8eeb6697f2ba17140d54b453e133",
"sha256:43521ce2c4b865d385e78579a082b6ad1166ebed2b1a2293c3be1d68dd7ca3b9",
"sha256:451f10ef963918e65b8869e17d67db5e2f4ab40e716ee6ce7129b0cde2876eab",
"sha256:46c259e87199041583658457372a183636ae8cd56dbf3f0755e0f376a7f9d0e6",
"sha256:46f39cab8bbf4a384ba7cb0bc8bae7b7062b6a11cfac1ca4bc144dea90d4a9f5",
"sha256:519e14e2c49fcf7616d6d2cfc5c70adae95682ae20f0395e9280db85e8d6c4df",
"sha256:53dcb50fbdc3fb2c55431a9b30caeb2f7027fcd2aeb501459464f0214200a503",
"sha256:54614444887e0d3043557d9dbc697dbb16cfb5a35d672b7a0fcc1ed0cf1c600b",
"sha256:575d8912dca808edd9acd6f7795199332696d3469665ef26163cd090fa1f8bfa",
"sha256:5dd5a9c3091a0f414a963d427f920368e2b6a4c2f7527fdd82cde8ef0bc7a327",
"sha256:5f532a2ad4d174eb73494e7397988e22bf427f91acc8e6ebf5bb10597b49c493",
"sha256:60e7da3a3ad1812c128750fc1bc14a7ceeb8d29f77e0a2356a8fb2aa8925287d",
"sha256:653d7fb2df65efefbcbf81ef5fe5e5be931f1ee4332c2893ca638c9b11a409c4",
"sha256:6663977496d616b618b6cfa43ec86e479ee62b942e1da76a2c3daa1c75933ef4",
"sha256:6abfb51a82e919e3933eb137e17c4ae9c0475a25508ea88993bb59faf82f3b35",
"sha256:6c6b1389ed66cdd174d040105123a5a1bc91d0aa7059c7261d20e583b6d8cbd2",
"sha256:6d9dfb9959a3b0039ee06c1a1a90dc23bac3b430842dcb97908ddde05870601c",
"sha256:765cb54c0b8724a7c12c55146ae4647e0274a839fb6de7bcba841e04298e1011",
"sha256:7a21222644ab69ddd9967cfe6f2bb420b460dae4289c9d40ff9a4896e7c35c9a",
"sha256:7ac7594397698f77bce84382929747130765f66406dc2cd8b4ab4da68ade4c6e",
"sha256:7cfc287da09f9d2a7ec146ee4d72d6ea1342e770d975e49a8621bf54eaa8f30f",
"sha256:83125753a60cfc8c412de5896d10a0a405e0bd88d0470ad82e0869ddf0cb3848",
"sha256:847b114580c5cc9ebaf216dd8c8dbc6b00a3b7ab0131e173d7120e6deade1f57",
"sha256:87708d78a14d56a990fbf4f9cb350b7d89ee8988705e58e39bdf4d82c149210f",
"sha256:8a2b5874d17e72dfb80d917213abd55d7e1ed2479f38f001f264f7ce7bae757c",
"sha256:8f127e7b028900421cad64f51f75c051b628db17fb00e099eb148761eed598c9",
"sha256:94cdff45173b1919350601f82d61365e792895e3c3a3443cf99819e6fbf717a5",
"sha256:99d92d148dd03fd19d16175b6d355cc1b01faf80dae93c6c3eb4163709edc0a9",
"sha256:9a3049a10261d7f2b6514d35bbb7a4dfc3ece4c4de14ef5876c4b7a23a0e566d",
"sha256:9d9a62576b68cd90f7075876f4e8444487db5eeea0e4df3ba298ee38a8d067b0",
"sha256:9e5f94742033898bfe84c93c831a6f552bb629448d4072dd312306bab3bd96f1",
"sha256:a1c2d7780448eb93fbcc3789bf3916aa5720d942e37945f4056680317f1cd23e",
"sha256:a2e0f87144fcbbe54297cae708c5e7f9da21a4646523456b00cc956bd4c65815",
"sha256:a4dfdae195335abb4e89cc9762b2edc524f3c6e80d647a9a81bf81e17e3fb6f0",
"sha256:a96e6e23f2b79433390273eaf8cc94fec9c6370842e577ab10dabdcc7ea0a66b",
"sha256:aabdab8ec1e7ca7f1434d042bf8b1e92056245fb179790dc97ed040361f16bfd",
"sha256:b222090c455d6d1a64e6b7bb5f4035c4dff479e22455c9eaa1bdd4c75b52c80c",
"sha256:b52ff4f4e002f828ea6483faf4c4e8deea8d743cf801b74910243c58acc6eda3",
"sha256:b70756ec9417c34e097f987b4d8c510975216ad26ba6e57ccb53bc758f490dab",
"sha256:b8c2f6eb0df979ee99433d8b3f6d193d9590f735cf12274c108bd954e30ca858",
"sha256:b9b752ab91e78234941e44abdecc07f1f0d8f51fb62941d32995b8161f68cfe5",
"sha256:ba6612b6548220ff5e9df85261bddc811a057b0b465a1226b39bfb8550616aee",
"sha256:bd752c5ff1b4a870b7661234694f24b1d2b9076b8bf337321a814c612665f343",
"sha256:c3c4ed2ff6760e98d262e0cc9c9a7f7b8a9f61aa4d47c58835cdaf7b0b8811bb",
"sha256:c5c1362c14aee73f50143d74389b2c158707b4abce2cb055b7ad37ce60738d47",
"sha256:cb362e3b0976dc994857391b776ddaa8c13c28a16f80ac6522c23d5257156bed",
"sha256:d197df5489004db87d90b918033edbeee0bd6df3848a204bca3ff0a903bef837",
"sha256:d3b56206244dc8711f7e8b7d6cad4663917cd5b2d950799425076681e8766286",
"sha256:d5b2f8a31bd43e0f18172d8ac82347c8f37ef3e0b414431157718aa234991b28",
"sha256:d7081c084ceb58278dd3cf81f836bc818978c0ccc770cbbb202125ddabec6628",
"sha256:db74f5562c09953b2c5f8ec4b7dfd3f5421f31811e97d1dbc0a7c93d6e3a24df",
"sha256:df41112ccce5d47770a0c13651479fbcd8793f34232a2dd9faeccb75eb5d0d0d",
"sha256:e1339790c083c5a4de48f688b4841f18df839eb3c9584a770cbd818b33e26d5d",
"sha256:e621b0246192d3b9cb1dc62c78cfa4c6f6d2ddc0ec207d43c0dedecb914f152a",
"sha256:e8c5cf126889a4de385c02a2c3d3aba4b00f70234bfddae82a5eaa3ee6d5e3e6",
"sha256:e9d7747847c53a16a729b6ee5e737cf170f7a16611c143d95aa60a109a59c336",
"sha256:eaef5d2de3c7e9b21f1e762f289d17b726c2239a42b11e25446abf82b26ac132",
"sha256:ed3e4b4e1e6de75fdc16d3259098de7c6571b1a6cc863b1a49e7d3d53e036070",
"sha256:ef21af928e807f10bf4141cad4746eee692a0dd3ff56cfb25fce076ec3cc8abe",
"sha256:f09598b416ba39a8f489c124447b007fe865f786a89dbfa48bb5cf395693132a",
"sha256:f0caf4a5dcf610d96c3bd32932bfac8aee61c96e60481c2a0ea58da435e25acd",
"sha256:f6e78171be3fb7941f9910ea15b4b14ec27725865a73c15277bc39f5ca4f8391",
"sha256:f715c32e774a60a337b2bb8ad9839b4abf75b267a0f18806f6f4f5f1688c4b5a",
"sha256:fb5c1ad6bad98c57482236a21bf985ab0ef42bd51f7ad4e4538e89a997624e12"
],
"markers": "python_version >= '3.7'",
"version": "==9.4.0"
},
"pyparsing": {
"hashes": [
"sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb",
"sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"
],
"markers": "python_full_version >= '3.6.8'",
"version": "==3.0.9"
},
"python-dateutil": {
"hashes": [
"sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
"sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
"version": "==2.8.2"
},
"pytz": {
"hashes": [
"sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0",
"sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a"
],
"version": "==2022.7.1"
},
"seaborn": {
"hashes": [
"sha256:374645f36509d0dcab895cba5b47daf0586f77bfe3b36c97c607db7da5be0139",
"sha256:ebf15355a4dba46037dfd65b7350f014ceb1f13c05e814eda2c9f5fd731afc08"
],
"index": "pypi",
"version": "==0.12.2"
},
"six": {
"hashes": [
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
"version": "==1.16.0"
}
},
"develop": {}
}

View File

@@ -1,16 +1,12 @@
# TikTok hashtag analysis toolset
> IMPORTANT NOTE: this tool relies on [drawrowfly/tiktok-scraper](https://github.com/drawrowfly/tiktok-scraper), which appears to be broken at the time of writing and has not been updated for some time. Several open issues ([#796](https://github.com/drawrowfly/tiktok-scraper/issues/796), [#799](https://github.com/drawrowfly/tiktok-scraper/issues/799)) need to be fixed before this library can work smoothly :/
The tool helps to download posts and videos from TikTok for a given set of hashtags over a period of time. Users can create a growing database of posts for specific hashtags which can then be used for further hashtag analysis. It uses the [tiktok-scraper](https://github.com/drawrowfly/tiktok-scraper) Node package to download the posts and videos.
The tool helps to download posts and videos from TikTok for a given set of hashtags over a period of time. Users can create a growing database of posts for specific hashtags which can then be used for further hashtag analysis. It uses the [TikTokApi](https://github.com/davidteather/TikTok-Api) Python package to download the posts and uses [yt-dlp](https://github.com/yt-dlp/yt-dlp) to download the videos.
[![PyPI version](https://badge.fury.io/py/tiktok-hashtag-analysis.svg)](https://badge.fury.io/py/tiktok-hashtag-analysis)
## Pre-requisites
1. Make sure you have Python 3.6 or a later version installed
2. And, you need to have node version 16. On Mac, do `brew install node` followed by `npm install -g n` and then `n 16`
4. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper
5. Install the tool with pip: `pip install tiktok-hashtag-analysis`
1. Make sure you have Python 3.9 or a later version installed
2. Install the tool with pip: `pip install tiktok-hashtag-analysis`
1. or directly from the repo version: `pip install git+https://github.com/bellingcat/tiktok-hashtag-analysis`
You should now be ready to start using it.
@@ -19,27 +15,23 @@ You should now be ready to start using it.
## About the tool
### Command-line arguments
```
tiktok-hashtag-analysis --help
usage: tiktok-hashtag-analysis [-h] [-t [T ...]] [-f F] [-p] [-v] [-ht HASHTAG] [-n NUMBER] [-plt] [-d] {download,frequencies}
usage: tiktok-hashtag-analysis [-h] [--file FILE] [-d] [--number NUMBER] [-p] [-t] [--output-dir OUTPUT_DIR] [--log LOG] [hashtags ...]
Analyze hashtags within posts scraped from TikTok.
positional arguments:
{download,frequencies}
command to initialize
hashtags List of hashtags to scrape
options:
optional arguments:
-h, --help show this help message and exit
-t [T ...] List of hashtags to scrape (module: run_downloader)
-f F File name containing list of hashtags to scrape (module: run_downloader)
-p Download post data (module: run_downloader)
-v Download video files (module: run_downloader)
-ht HASHTAG, --hashtag HASHTAG
The hashtag of scraped posts to analyze (module: hashtag_frequencies)
-n NUMBER, --number NUMBER
The number of top n occurrences (module: hashtag_frequencies)
-plt, --plot Plot the occurrences (module: hashtag_frequencies)
-d, --print List top n hashtags (module: hashtag_frequencies)
--file FILE File name containing list of hashtags to scrape
-d, --download Download video files corresponding to scraped posts
--number NUMBER The number of co-occurring hashtags to analyze
-p, --plot Plot the most common co-occurring hashtags
-t, --table Print a table of the most common co-occurring hashtags
--output-dir OUTPUT_DIR
Directory to save scraped data and visualizations to
--log LOG File to write logs to
```
### Structure of output data
@@ -67,9 +59,9 @@ The `data` folder contains all the downloaded data as shown in the tree diagram
## How to use
### Post downloading
Running the `tiktok-hashtag-analysis download` command with the following options will scrape posts containing the hashtags `#london`, `#paris`, or `#newyork`:
Running the `tiktok-hashtag-analysis` command with the following options will scrape posts containing the hashtags `#london`, `#paris`, or `#newyork`:
tiktok-hashtag-analysis download -t london paris newyork -p
tiktok-hashtag-analysis london paris newyork
and will produce an output similar to the following log:
@@ -100,7 +92,7 @@ Assume we want to analyze the 20 most frequently occurring hashtags in the downl
- The results can be plotted and saved as a PNG file by executing the following command:
`tiktok-hashtag-analysis frequencies london 20 -p`
`tiktok-hashtag-analysis london --number 20 --plot`
which will produce a figure similar to that shown below:
<p align="center">
@@ -111,7 +103,7 @@ Assume we want to analyze the 20 most frequently occurring hashtags in the downl
- The results can be displayed in tabular form by executing the following command:
`tiktok-hashtag-analysis frequencies london 20 -d`
`tiktok-hashtag-analysis london --number 20 --table`
which will produce a terminal output similar to the following:
```

View File

@@ -1,2 +0,0 @@
matplotlib
seaborn

View File

@@ -3,7 +3,7 @@
set -e
TAG=$(python -c 'from tiktok_hashtag_analysis.version import __version__; print("v" + __version__)')
TAG=$(python -c 'from tiktok_hashtag_analysis import __version__; print("v" + __version__)')
read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt

View File

@@ -1,5 +1,5 @@
from setuptools import setup, find_packages
from tiktok_hashtag_analysis.version import __version__
from setuptools import setup
from tiktok_hashtag_analysis import __version__
with open("README.md", "r", encoding="utf-8") as file:
long_description = file.read()
@@ -10,23 +10,18 @@ setup(
author="Bellingcat",
author_email="tech@bellingcat.com",
packages=["tiktok_hashtag_analysis"],
package_data={
"tiktok_hashtag_analysis": [
"logging.config",
]
},
description="Analyze hashtags within posts scraped from TikTok",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/bellingcat/tiktok-hashtag-analysis",
license="MIT License",
install_requires=["seaborn", "matplotlib"],
install_requires=["seaborn", "matplotlib", "TikTokApi", "requests", "yt-dlp"],
classifiers=[
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Information Technology',
'License :: OSI Approved :: MIT License',
'Natural Language :: English',
'Programming Language :: Python :: 3'
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Information Technology",
"License :: OSI Approved :: MIT License",
"Natural Language :: English",
"Programming Language :: Python :: 3",
],
entry_points={
"console_scripts": [

View File

@@ -0,0 +1 @@
__version__ = "2.0.0"

View File

@@ -1,76 +1,91 @@
import logging, argparse
from .file_methods import log_writer
from .run_downloader import * # Import everything from run_downloader.py
from .hashtag_frequencies import * # Import everything from hashtag_frequencies.py
import logging
import argparse
from pathlib import Path
import sys
logger = logging.getLogger()
from .base import TikTokDownloader, load_hashtags_from_file
def create_parser() -> argparse.ArgumentParser:
"""Create the parser and the arguments for the user input."""
parser = argparse.ArgumentParser(description="Analyze hashtags within posts scraped from TikTok.")
parser.add_argument("command", help="command to initialize", choices=['download', 'frequencies'])
parser.add_argument("-t", type=str, nargs="*", help="List of hashtags to scrape (module: run_downloader)")
parser.add_argument("-f", type=str, help="File name containing list of hashtags to scrape (module: run_downloader)")
parser.add_argument("-p", action="store_true", help="Download post data (module: run_downloader)")
parser.add_argument("-v", action="store_true", help="Download video files (module: run_downloader)")
parser.add_argument("-ht", "--hashtag", type=str,
help="The hashtag of scraped posts to analyze (module: hashtag_frequencies)", )
parser.add_argument("-n", "--number", type=int, help="The number of top n occurrences (module: hashtag_frequencies)")
parser.add_argument("-plt", "--plot", help="Plot the occurrences (module: hashtag_frequencies)", action="store_true")
parser.add_argument("-d", "--print", help="List top n hashtags (module: hashtag_frequencies)", action="store_true")
def create_parser():
    """Build the command-line argument parser.

    Returns:
        argparse.ArgumentParser: parser exposing the positional ``hashtags``
        list and the ``--file``, ``--download``, ``--number``, ``--plot``,
        ``--table``, ``--output-dir`` and ``--log`` options.
    """

    def positive_int(value: str) -> int:
        """Parse ``value`` as an integer and reject anything below 1."""
        try:
            number = int(value)
        except ValueError:
            raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
        if number < 1:
            raise argparse.ArgumentTypeError(
                f"must be greater than zero, not: {number}"
            )
        return number

    parser = argparse.ArgumentParser(
        description="Analyze hashtags within posts scraped from TikTok."
    )
    parser.add_argument(
        "hashtags",
        type=str,
        nargs="*",
        help="List of hashtags to scrape",
    )
    parser.add_argument(
        "--file",
        type=str,
        help="File name containing list of hashtags to scrape",
    )
    parser.add_argument(
        "-d",
        "--download",
        action="store_true",
        help="Download video files corresponding to scraped posts",
    )
    parser.add_argument(
        "--number",
        # Validate at parse time: zero or a negative count would otherwise
        # reach the table/plot code and yield an empty set of hashtags.
        type=positive_int,
        help="The number of co-occurring hashtags to analyze",
        default=20,
    )
    parser.add_argument(
        "-p",
        "--plot",
        help="Plot the most common co-occurring hashtags",
        action="store_true",
    )
    parser.add_argument(
        "-t",
        "--table",
        help="Print a table of the most common co-occurring hashtags",
        action="store_true",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        help="Directory to save scraped data and visualizations to",
        # Defaults to a `data` directory next to the current working directory
        default=Path(".").resolve().parent / "data",
    )
    parser.add_argument("--log", type=str, help="File to write logs to", default=None)
    return parser
def main():
parser = create_parser()
args = parser.parse_args()
if args.command == "download":
if not (args.t or args.f):
parser.error(
"No hashtags were given, please use either the `-t` flag or the `-f` flag to specify one or more hashtags.")
if not (args.p or args.v):
logging.basicConfig(
level=logging.INFO,
filename=args.log,
format="%(asctime)s %(levelname)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
if len(args.hashtags) == 0:
if not args.file:
parser.error(
"No argument given, please specify either the `-p` flag to download post data or the `-v` flag to download video files, or both."
"No hashtags were specified, please specify one or more hashtags "
"to scrape or use the `--file` flag to specify a text file containing "
"hashtags."
)
if args.t:
hashtags = args.t
elif args.f:
file_name = args.f
hashtags = get_hashtag_list(file_name)
logger.info(f"Hashtags to scrape: {hashtags}")
if not hashtags:
raise ValueError(
"No hashtags were specified: please use either the `-t` flag to specify a sspace-separated list of one or more hashtags as a command-line argument, or use the `-f` flag to specify a text file of newline-separated hashtags.")
download_data_type = {"posts": args.p, "videos": args.v}
scraped_summary_list = get_data(hashtags, download_data_type)
if scraped_summary_list:
log_writer(scraped_summary_list)
elif args.command == "frequencies":
img_folder = IMAGES
check_file(img_folder, "dir")
if args.n < 1:
raise ValueError(
f"Specified argument `n` (the number of hashtags to analyze) must be greater than zero, not: {args.n}.")
input_file = data_file = os.path.join(
FILES["data"], args.hashtag, FILES["posts"], FILES["data_file"]
)
if not check_existence(input_file, "file"):
raise FileNotFoundError(
f"File ({input_file}) for specified argument `hashtag` ({args.hashtag}) does not exist.")
# base = os.path.splitext(input_file)[0]
# path = f"./{base}_sorted_hashtags.csv"
occs = get_occurrences(input_file, args.n)
if args.plot:
plot(occs, img_folder)
else:
print_occurrences(occs)
hashtags = load_hashtags_from_file(file=args.file)
else:
hashtags = args.hashtags
if __name__=="__main__":
main()
downloader = TikTokDownloader(hashtags=hashtags, data_dir=args.output_dir)
downloader.run(
download=args.download, plot=args.plot, table=args.table, number=args.number
)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,259 @@
import os
import json
from pathlib import Path
from collections import Counter
from datetime import datetime
import warnings
import asyncio
import logging
import re
from typing import List, Dict
import yt_dlp
import requests
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
from TikTokApi import TikTokApi
warnings.filterwarnings("ignore", message="Glyph (.*) missing from current font")
sns.set_theme(style="darkgrid")
def process_hashtag_list(hashtags: List[str]) -> List[str]:
    """Convert a list of hashtags to a standard form.

    Each entry has surrounding whitespace and `#` characters removed and is
    lowercased; entries that end up empty are dropped.

    Args:
        hashtags: raw hashtag strings, e.g. from the command line or a file.

    Returns:
        The normalized, non-empty hashtags in their original order.
    """
    # The second `strip()` removes whitespace that was hidden behind the `#`
    # (e.g. "# paris"), which would otherwise survive into directory names.
    cleaned = (hashtag.strip().strip("#").strip().lower() for hashtag in hashtags)
    return [hashtag for hashtag in cleaned if hashtag]
def load_hashtags_from_file(file: str) -> List[str]:
    """Load hashtags from a text file and normalize them.

    The file content is split on newlines and commas, and the pieces are
    passed through `process_hashtag_list`.

    Raises:
        OSError: if `file` is not an existing regular file.
    """
    if not os.path.isfile(file):
        raise OSError(f"{file} does not exist")

    with open(file, "r", encoding="utf-8") as handle:
        contents = handle.read()

    raw_hashtags = re.split(r"\n|,", contents)
    return process_hashtag_list(hashtags=raw_hashtags)
async def _fetch_hashtag_data(hashtag: str) -> List[Dict]:
    """Asynchronously collect post data for videos using `hashtag`.

    A single TikTokApi session is created with the token read from the
    `MS_TOKEN` environment variable (a `KeyError` is raised if it is unset),
    and videos are requested with `count=1000`.

    Returns:
        One dictionary (`video.as_dict`) per video yielded by the API.
    """
    async with TikTokApi() as api:
        ms_token = os.environ["MS_TOKEN"]
        await api.create_sessions(ms_tokens=[ms_token], num_sessions=1, sleep_after=3)
        videos = api.hashtag(name=hashtag).videos(count=1000)
        return [video.as_dict async for video in videos]
def json_load(file_path: Path) -> List:
    """Return the data parsed from the UTF-8 encoded JSON file at `file_path`."""
    with open(file_path, "r", encoding="utf-8") as handle:
        return json.load(fp=handle)
def json_dump(file_path: Path, data: List):
    """Write `data` to `file_path` as UTF-8 encoded JSON.

    Raises:
        TypeError: if `data` contains an object that JSON cannot represent;
            in that case the file is left untouched.
    """
    # Serialize before opening the file: opening with "w" truncates it, so a
    # serialization error raised afterwards would wipe out existing data.
    serialized = json.dumps(data)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(serialized)
def download_gallery(video_data: Dict, video_dir: Path):
    """Download the audio track and images of an image-gallery post.

    yt-dlp doesn't seem to support downloading images from an image gallery,
    so this is a quick fix that likely will fail on edge cases.

    Files are written to `video_dir` as `<id>.mp3` (audio, when the post has a
    `music.playUrl`) and `<id>_<NN>.<ext>` (images, numbered from 00). A file
    whose request fails is skipped with a warning instead of being written.

    Args:
        video_data: raw post data; must contain `id` and `imagePost.images`.
        video_dir: existing directory to write the media files into.
    """
    video_id = video_data["id"]

    def fetch(url: str):
        """Return the response for `url`, or None if the request failed."""
        try:
            # A timeout keeps one stalled connection from hanging the run
            response = requests.get(url, timeout=30)
            # Don't save an HTTP error page as if it were media
            response.raise_for_status()
        except requests.RequestException as error:
            logging.warning(f"Failed to download {url} for post {video_id}: {error}")
            return None
        return response

    play_url = (video_data.get("music") or {}).get("playUrl")
    if play_url:
        response = fetch(play_url)
        if response is not None:
            with open(video_dir / f"{video_id}.mp3", "wb") as f:
                f.write(response.content)

    for i, image in enumerate(video_data["imagePost"]["images"]):
        image_url = image["imageURL"]["urlList"][0]
        response = fetch(image_url)
        if response is None:
            continue
        # Keep only the subtype, dropping parameters such as "; charset=..."
        content_type = response.headers.get("Content-Type", "")
        ext = content_type.split(";")[0].strip().split("/")[-1] or "bin"
        with open(video_dir / f"{video_id}_{i:02d}.{ext}", "wb") as f:
            f.write(response.content)
def aggregate_cooccurring_hashtags(hashtag_file: Path) -> Counter:
    """Count how many posts use each hashtag.

    Args:
        hashtag_file: JSON file containing a list of raw TikTok post API
            responses.

    Returns:
        A Counter mapping hashtag name to the number of posts it appears in.
        A hashtag repeated within one post is counted once for that post.
    """
    videos = json_load(file_path=hashtag_file)
    frequencies: Counter = Counter()
    for video in videos:
        # `or []` also covers a `textExtra` key that is present but null
        video_hashtags = {
            entry["hashtagName"]
            for entry in video.get("textExtra") or []
            if entry.get("hashtagName")
        }
        frequencies.update(video_hashtags)
    return frequencies
class TikTokDownloader:
    """Main class for scraping data from TikTok.

    Everything for a hashtag is stored under `<data_dir>/<hashtag>/`: post
    data in `posts.json`, media files in `videos/` and charts in `plots/`.
    """

    def __init__(self, hashtags: List[str], data_dir: str):
        """Normalize `hashtags` and make sure `data_dir` exists."""
        self.hashtags = process_hashtag_list(hashtags)
        # Log the normalized hashtags, since those are what gets scraped
        logging.info(f"Hashtags to scrape: {self.hashtags}")

        self.data_dir = Path(data_dir)
        os.makedirs(self.data_dir, exist_ok=True)

    def get_hashtag_posts(self, hashtag: str):
        """Fetch data about posts that used a specified hashtag and merge with
        existing data, if it exists."""

        # Define file to store hashtags in and create parent directory
        hashtag_file = self.data_dir / hashtag / "posts.json"
        hashtag_file.parent.mkdir(exist_ok=True, parents=True)

        # If there are previously scraped posts, load them
        if hashtag_file.is_file():
            already_fetched_data = json_load(file_path=hashtag_file)
        else:
            already_fetched_data = []
        seen_ids = {video["id"] for video in already_fetched_data}

        # Scrape posts that use the specified hashtag
        fetched_data = asyncio.run(_fetch_hashtag_data(hashtag=hashtag))

        # Keep only posts that haven't been stored before; IDs are added to
        # `seen_ids` as we go in case the same post is returned twice
        new_fetched_data = []
        for video in fetched_data:
            if video["id"] not in seen_ids:
                seen_ids.add(video["id"])
                new_fetched_data.append(video)

        if not fetched_data:
            logging.warning(f"No posts were found for the hashtag: {hashtag}")
        elif not new_fetched_data:
            logging.warning(f"No new posts were found for the hashtag: {hashtag}")

        # Merge new and old data and write to file
        all_fetched_data = already_fetched_data + new_fetched_data
        json_dump(file_path=hashtag_file, data=all_fetched_data)
        logging.info(
            f"Scraped {len(new_fetched_data)} new posts containing the hashtag "
            f"'{hashtag}', with {len(already_fetched_data)} posts previously scraped"
        )

    def get_hashtag_videos(self, hashtag: str):
        """Download videos and other media corresponding to posts that used a
        specified hashtag.

        Requires `posts.json` for the hashtag to exist already.
        """

        # Define file containing post data and directory to save videos to
        hashtag_file = self.data_dir / hashtag / "posts.json"
        video_dir = self.data_dir / hashtag / "videos"
        video_dir.mkdir(exist_ok=True, parents=True)

        # Get post IDs whose media has already been downloaded. File names
        # start with the post ID (`<id>.<ext>` or `<id>_<NN>.<ext>`). Partial
        # downloads left by yt-dlp (`*.part`, `*.ytdl`) don't count as done.
        already_downloaded_ids = {
            file.split(".")[0].split("_")[0]
            for file in os.listdir(video_dir)
            if not file.endswith((".part", ".ytdl"))
        }

        # Get list of posts that have been scraped but not had their media downloaded
        video_list = json_load(file_path=hashtag_file)
        new_video_list = [
            video for video in video_list if video["id"] not in already_downloaded_ids
        ]
        if not new_video_list:
            logging.warning(
                f"No new videos to be downloaded for the hashtag: {hashtag}"
            )
            return

        # Populate list of URLs to download using yt-dlp, and list of image
        # galleries to download using the `download_gallery` function
        urls_to_download = []
        galleries_to_download = []
        for video in new_video_list:
            if video.get("imagePost") is None:
                url = f"https://www.tiktok.com/@{video['author']['uniqueId']}/video/{video['id']}"
                urls_to_download.append(url)
            else:
                galleries_to_download.append(video)

        # Download audio and image files for all image gallery posts
        if galleries_to_download:
            logging.info(f"Downloading image galleries for hashtag {hashtag}")
            for video in galleries_to_download:
                download_gallery(video_data=video, video_dir=video_dir)

        # Download video files for all video posts
        if urls_to_download:
            logging.info(f"Downloading videos for hashtag {hashtag}")
            ydl_opts = {"outtmpl": os.path.join(video_dir, "%(id)s.%(ext)s")}
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download(urls_to_download)

    def frequency_table(self, hashtag: str, number: int):
        """Print `number`-most commonly co-occurring hashtags for a specified
        source hashtag, in tabular form."""

        # Load video data file and extract co-occurring hashtag frequency information
        hashtag_file = self.data_dir / hashtag / "posts.json"
        frequencies = aggregate_cooccurring_hashtags(hashtag_file=hashtag_file)
        if not frequencies:
            # `max()` below would raise on an empty Counter
            logging.warning(f"No hashtags to tabulate for the hashtag: {hashtag}")
            return

        # The count of the most frequent hashtag is used as the post total
        # (presumably the source hashtag itself, present in every post)
        total_posts = max(frequencies.values())

        # Print table that displays most commonly co-occurring hashtags
        print(f"\nCo-occurring hashtags for #{hashtag} posts")
        print(f"{'Rank':<8} {'Hashtag':<30} {'Occurrences':<15} {'Frequency':<15}")
        for row, (name, frequency) in enumerate(frequencies.most_common(number)):
            ratio = frequency / total_posts
            print(f"{row:<8} {name:<30} {frequency:<15} {ratio:.4f}")
        print(f"Total posts: {total_posts}\n\n")

    def plot(self, hashtag: str, number: int):
        """Create plot of `number`-most commonly co-occurring hashtags for a
        specified source hashtag.

        The image is saved to `<data_dir>/<hashtag>/plots/`.
        """

        # Load video data file and extract co-occurring hashtag frequency information
        hashtag_file = self.data_dir / hashtag / "posts.json"
        frequencies = aggregate_cooccurring_hashtags(hashtag_file=hashtag_file)

        # The top entry is left out of the chart, so at least two hashtags are
        # needed; with fewer, `max()`/`min()` below would raise
        sorted_frequencies = frequencies.most_common(number)
        if len(sorted_frequencies) < 2:
            logging.warning(
                f"Not enough co-occurring hashtags to plot for the hashtag: {hashtag}"
            )
            return

        # Define labels and other fields used in plot
        total_posts = max(frequencies.values())
        labels = [label for label, _ in sorted_frequencies[1:]]
        ratios = [freq / total_posts * 100 for _, freq in sorted_frequencies[1:]]
        y_pos = list(reversed(range(len(sorted_frequencies) - 1)))

        # Visualize data in bar chart
        fig, ax = plt.subplots(figsize=(5, 6.66))
        ax.barh(y_pos, ratios)
        ax.set_yticks(y_pos)
        ax.set_yticklabels(labels)
        ax.grid(axis="y")
        ax.set_xlabel("Percent of posts with co-occurring hashtag")
        ax.set_ylim(min(y_pos) - 1, max(y_pos) + 1)
        ax.set_title(f"Co-occurring hashtags for #{hashtag} posts")
        ax.xaxis.set_major_formatter(mtick.PercentFormatter(decimals=0))

        # Write image of plot to file
        current_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        plot_file = self.data_dir / hashtag / "plots" / f"{hashtag}__{current_time}.png"
        plot_file.parent.mkdir(exist_ok=True, parents=True)
        try:
            fig.savefig(plot_file, bbox_inches="tight", facecolor="white", dpi=300)
        finally:
            # Release the figure so plotting many hashtags doesn't pile up
            # open figures
            plt.close(fig)
        logging.info(f"Plot saved to file: {plot_file}")

    def run(self, download: bool, plot: bool, table: bool, number: int):
        """Execute the specified operations on all specified hashtags.

        Args:
            download: download media for every hashtag after scraping.
            plot: save a bar chart of co-occurring hashtags per hashtag.
            table: print a table of co-occurring hashtags per hashtag.
            number: how many of the most common hashtags to analyze.
        """

        # Scrape all specified hashtags and perform analyses, depending on if
        # `--table` and `--plot` flags are used in the command
        for hashtag in self.hashtags:
            self.get_hashtag_posts(hashtag=hashtag)
            if plot:
                self.plot(hashtag=hashtag, number=number)
            if table:
                self.frequency_table(hashtag=hashtag, number=number)

        # Download media for all hashtags if `--download` flag is used in the command
        if download:
            for hashtag in self.hashtags:
                self.get_hashtag_videos(hashtag=hashtag)

View File

@@ -1,161 +0,0 @@
"""Utility functions that perform data processing related tasks.
"""
from typing import NamedTuple, List, Tuple, Set, Optional, Dict, Any
import logging
from . import file_methods
logger = logging.getLogger()
class Diff(NamedTuple):
    """Result of comparing newly scraped IDs with the stored ones."""

    # IDs selected by the comparison
    ids: Set[str]
    # Flag read by `extract_posts` to decide whether to filter the post list
    filter_posts: bool
class Total(NamedTuple):
    """Counts of scraped entries for one hashtag."""

    # Number of stored entries, duplicates included
    total: int
    # Number of distinct stored entries
    unique: int
def get_difference(tag: str, file_name: str, ids: List[str]) -> Optional[Diff]:
    """Find TikTok post IDs that haven't previously been scraped.

    Compares the IDs stored for the hashtag `tag` in the JSON file `file_name`
    with the newly downloaded IDs in `ids`.

    Returns:
        None if `tag` already has stored IDs and none of `ids` is new.
        Otherwise a `Diff` holding the new IDs; `filter_posts` is True when
        some of `ids` may already be stored, so the caller has to narrow its
        post list down to `Diff.ids`, and False when `tag` has no stored IDs
        and every scraped post is therefore new.
    """
    current_id_data = file_methods.get_data(file_name)

    if tag not in current_id_data:
        # Nothing recorded for this hashtag yet: every scraped ID is new
        return Diff(set(ids), False)

    new_ids = set(ids) - set(current_id_data[tag])
    if not new_ids:
        return None

    # The flag used to be False here, which made `extract_posts` hand back
    # the full post list (old posts included) alongside the new IDs.
    return Diff(new_ids, True)
def extract_posts(
    settings: Dict[Any, Any], file_name: str, tag: str
) -> Optional[Tuple[List[str], List[Dict]]]:
    """Find TikTok posts that haven't previously been scraped.

    Compares the posts in the downloaded file `file_name` with the IDs stored
    in the file at `settings["post_ids"]`.

    Returns:
        A `(new_ids, new_posts)` tuple, or None when the download holds no
        posts or none of them is new.
    """
    posts = file_methods.get_data(file_name)
    ids = [post["id"] for post in posts]

    if not ids:
        logger.warning(f"No posts were found for the hashtag: {tag}")
        return None

    # Without an ID file nothing has been scraped before: everything is new
    if not file_methods.check_existence(settings["post_ids"], "file"):
        return (ids, posts)

    new_ids = get_difference(tag, settings["post_ids"], ids)
    if not new_ids:
        logger.warning(f"No new posts were found for the hashtag: {tag}")
        return None

    # Always narrow the posts down to the new IDs. Skipping this when
    # `filter_posts` was False returned already-stored posts as well, which
    # were then appended to the data file a second time.
    new_posts = [post for post in posts if post["id"] in new_ids.ids]
    return (list(new_ids.ids), new_posts)
def extract_videos(settings: dict, tag: str, download_list: List[str]) -> List[str]:
    """Find TikTok videos that haven't previously been scraped.

    Compares `download_list` with the IDs stored for `tag` in the file at
    `settings["video_ids"]`.

    Returns:
        The IDs from `download_list` that are not stored yet; all of them if
        the ID file does not exist, an empty list if none is new.
    """
    if not file_methods.check_existence(settings["video_ids"], "file"):
        return download_list

    new_videos = get_difference(tag, settings["video_ids"], download_list)
    if not new_videos:
        # `Logger.warn` is a deprecated alias of `Logger.warning`
        logger.warning(
            f"No new videos were found for the {tag} in the downloaded folder."
        )
        return []
    return list(new_videos.ids)
def update_posts(
    file_path: str, file_type: str, new_data: List[Any], tag: str = None
) -> Optional[Tuple[str, int]]:
    """Append newly scraped data to `file_path`.

    With a `tag`, `new_data` is written through `file_methods.id_writer` and
    its result is returned. Without one, `new_data` is written through
    `file_methods.post_writer` and None is returned.
    """
    exists = file_methods.check_existence(file_path, file_type)
    if tag:
        return file_methods.id_writer(file_path, new_data, tag, exists)
    file_methods.post_writer(file_path, new_data, exists)
    return None
def update_videos(
    settings: Dict[str, Any], new_data: List[str], tag: str
) -> Tuple[str, int]:
    """Record newly scraped video IDs and tidy up the downloaded files.

    The IDs in `new_data` are added under `tag` to the file at
    `settings["video_ids"]` (created first if missing), after which
    `file_methods.clean_video_files` is run for them.

    Returns:
        The value returned by `file_methods.id_writer`.
    """
    ids_file = settings["video_ids"]
    file_methods.check_file(ids_file, "file")
    scraped = file_methods.id_writer(ids_file, new_data, tag, True)
    file_methods.clean_video_files(settings, tag, new_data)
    return scraped
def get_total_posts(file_path: str, tag: str) -> Total:
    """Count the stored and the distinct entries for `tag` in `file_path`.

    Raises:
        OSError: if `file_path` is not an existing file.
    """
    if not file_methods.check_existence(file_path, "file"):
        raise OSError(f"{file_path} not found!")

    entries = file_methods.get_data(file_path)[tag]
    return Total(len(entries), len(set(entries)))
def print_total(file_path: str, tag: str, data_type: str):
    """Log how many entries are stored for `tag`, warning about duplicates.

    Args:
        file_path: JSON file mapping hashtags to lists of IDs.
        tag: hashtag to report on.
        data_type: label used in the message.
    """
    total = get_total_posts(file_path, tag)
    if total.total == total.unique:
        logger.info(f"Scraped {total.total} {data_type} containing the hashtag '{tag}'")
    else:
        # `Logger.warn` is a deprecated alias of `Logger.warning`
        logger.warning(
            f"Out of total {data_type} for the hashtag {tag} {total.total}, only {total.unique} are unique. Something is going wrong..."
        )

View File

@@ -1,216 +0,0 @@
"""Utility functions that operate on files, such as writing to or reading from a file.
"""
import os
import json
import subprocess
from os import path
from datetime import datetime
import shutil
from typing import Tuple, List, Optional, Dict, Any
import logging, logging.config
logging.config.fileConfig(path.join(path.dirname(path.abspath(__file__)), 'logging.config'))
logger = logging.getLogger("Logger")
def create_file(name: str, file_type: str):
    """Create an empty file or a directory tree at `name`.

    Raises:
        ValueError: if `file_type` is neither 'dir' nor 'file'.
    """
    if file_type == "dir":
        os.makedirs(name, mode=0o777)
        return
    if file_type == "file":
        # Opening for writing is enough to create the (empty) file
        with open(name, "w"):
            return
    raise ValueError(f"{file_type} has to be either 'dir' or 'file'")
def check_existence(file_path: str, file_type: str):
    """Return whether `file_path` exists as the given kind of entry.

    Raises:
        ValueError: if `file_type` is neither 'dir' nor 'file'.
    """
    if file_type == "dir":
        return os.path.isdir(file_path)
    if file_type == "file":
        return os.path.isfile(file_path)
    raise ValueError(f"{file_type} has to be either 'dir' or 'file'")
def check_file(file_path: str, file_type: str):
    """Make sure `file_path` exists, creating the file or directory if not."""
    if check_existence(file_path, file_type):
        return
    create_file(file_path, file_type)
def download_posts(settings: Dict, tag: str, output_dir: Any):
    """Run the tiktok-scraper command to download posts for a given hashtag.

    The post directory for `tag` is created if needed, then tiktok-scraper is
    run with JSON output written to `output_dir`.

    Returns:
        The last whitespace-separated word of the command output when it
        contains "json" (taken to be the path of the downloaded file).
        Otherwise a warning is logged and None is returned, so the caller can
        move on.

    Raises:
        subprocess.CalledProcessError: if tiktok-scraper exits with an error.
    """
    posts_dir = os.path.join(settings["data"], tag, settings["posts"])
    os.makedirs(posts_dir, exist_ok=True)

    # Pass the arguments as a list, without a shell, so that shell
    # metacharacters in a hashtag cannot be executed as commands
    tiktok_command = [
        "tiktok-scraper",
        "hashtag",
        tag,
        "-t",
        "json",
        "--filepath",
        str(output_dir),
    ]
    output = subprocess.check_output(tiktok_command, encoding="utf-8")

    # Empty output has no last word; treat it like any other unexpected output
    words = output.split()
    new_file = words[-1] if words else ""
    if "json" in new_file:
        return new_file

    logger.warning(
        f"Something's wrong with what is returned by tiktok-scraper for the hashtag {tag} - *{new_file}* is not a json file.\n\ntiktok-scraper returned {output}"
    )
    return None
def download_videos(settings: Dict, tag: str):
    """Run the tiktok-scraper command to download videos for a given hashtag.

    Note that all the videos are downloaded that are returned by the TikTok API,
    making this a time- and data-intensive process.

    Returns:
        The names (text before the first ".") of the files found in the
        `#<tag>` folder of the hashtag's video directory. If that folder is
        empty, a warning is logged, `settings["videos_delete"]` is removed and
        None is returned.

    Raises:
        subprocess.CalledProcessError: if tiktok-scraper exits with an error.
    """
    videos_dir = os.path.join(settings["data"], tag, settings["videos"])
    os.makedirs(videos_dir, exist_ok=True)

    # Pass the arguments as a list, without a shell, so that shell
    # metacharacters in a hashtag cannot be executed as commands
    tiktok_command = ["tiktok-scraper", "hashtag", tag, "-d", "--filepath", videos_dir]
    subprocess.check_output(tiktok_command)

    downloaded_files = os.listdir(os.path.join(videos_dir, f"#{tag}"))
    if downloaded_files:
        return [file_name.split(".")[0] for file_name in downloaded_files]

    logger.warning(f"No video files were downloaded for the hashtag {tag}.")
    # NOTE(review): "videos_delete" is not set by any code visible here;
    # presumably the caller adds it to `settings` - confirm
    shutil.rmtree(settings["videos_delete"])
    return None
def get_data(file_path: str) -> Any:
    """Return the data parsed from the UTF-8 encoded JSON file at `file_path`."""
    with open(file_path, "r", encoding="utf-8") as handle:
        return json.load(handle)
def dump_data(file_path: str, data: Any):
    """Write `data` to `file_path` as UTF-8 encoded JSON.

    Raises:
        TypeError: if `data` contains an object that JSON cannot represent;
            in that case the file is left untouched.
    """
    # Serialize before opening the file: opening with "w" truncates it, so a
    # serialization error raised afterwards would wipe out existing data.
    serialized = json.dumps(data)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(serialized)
def log_writer(log_data: List[Tuple[str, Tuple[str, int]]]):
    """Summarize the number of downloads (posts and videos) per hashtag.

    Example of the summary:  {
        timestamp : {
            hashtag : {
                videos : number_of_new_videos ,
                posts : number_of_new_posts
            }
        }
    }

    The summary is emitted through the module logger at debug level and the
    overall total at info level; this function does not write a file itself.

    Args:
        log_data: `(hashtag, (data_type, count))` entries to aggregate.
    """
    total = 0
    scraped_summary_dict = {}  # type: Dict[str, Dict[str, int]]

    for hashtag, (data_type, count) in log_data:
        per_hashtag = scraped_summary_dict.setdefault(hashtag, {})
        per_hashtag[data_type] = per_hashtag.get(data_type, 0) + count
        # Every entry contributes to the total, including ones merged into an
        # existing (hashtag, data_type) pair
        total += count

    now_str = datetime.now().strftime("%d-%m-%Y %H:%M:%S")
    data = {now_str: scraped_summary_dict}

    logger.debug(f"Logged post data: {data}")
    logger.info(f"Successfully scraped {total} total entries")
def id_writer(
    file_path: str, new_data: List[str], tag: str, status: bool
) -> Tuple[str, int]:
    """Store `new_data` under `tag` in the JSON ID file at `file_path`.

    When `status` is true the existing file content is read and extended; if
    it cannot be decoded as JSON, or `status` is false, the file is rewritten
    with only `{tag: new_data}`.

    Returns:
        `(tag, number of entries in new_data)`.
    """
    total = len(new_data)

    # Content used when there is nothing valid to merge with
    merged = {tag: new_data}
    if status:
        try:
            stored = get_data(file_path)
        except json.decoder.JSONDecodeError:
            pass
        else:
            if tag in stored:
                stored[tag] += new_data
            else:
                stored[tag] = new_data
            merged = stored
    dump_data(file_path, merged)

    logger.debug(f"SUCCESS - {total} entries added to {file_path}")
    return (tag, total)
def post_writer(file_path: str, new_data: List[Dict], status: bool):
    """Store the posts in `new_data` in the JSON post file at `file_path`.

    When `status` is true the existing file content is read and extended; if
    it cannot be decoded as JSON, or `status` is false, the file is rewritten
    with only `new_data`.
    """
    total = len(new_data)

    # Content used when there is nothing valid to merge with
    merged = new_data
    if status:
        try:
            stored = get_data(file_path)
        except json.decoder.JSONDecodeError:
            pass
        else:
            stored += new_data
            merged = stored
    dump_data(file_path, merged)

    logger.debug(f"SUCCESS - {total} entries added to {file_path}")
def delete_file(file_path: str, file_type: str):
    """Delete a file or an empty directory.

    Raises:
        OSError: if `file_path` does not exist as the given `file_type`, or if
            a directory is not empty (from `os.rmdir`).
        ValueError: if `file_type` is neither 'file' nor 'dir' (raised by
            `check_existence`).
    """
    if not check_existence(file_path, file_type):
        raise OSError(f"Attempt to delete file failed: {file_path} does not exist")
    elif file_type == "file":
        os.remove(file_path)
        logger.debug(f"Successfully deleted {file_path}")
    elif file_type == "dir":
        os.rmdir(file_path)
        logger.debug(f"Successfully deleted {file_path}")
    else:
        # The `f` prefix was missing, so the placeholder was printed literally
        raise OSError(f"{file_type} needs to be either 'file' or 'dir'")
def clean_video_files(settings: dict, tag: str, new_data: Optional[List[str]] = None):
    """Move newly downloaded videos and remove the leftover download folder.

    Each ID in `new_data` is moved from `<data>/<tag>/videos/#<tag>/<id>.mp4`
    to `settings["videos_to"]`; the source path of the file being moved is
    stored in `settings["videos_from"]`. Afterwards `settings["videos_delete"]`
    is removed, whether or not any file was moved.
    """
    for video_id in new_data or ():
        source = settings["data"] + f"/{tag}/videos/#{tag}/{video_id}.mp4"
        settings["videos_from"] = source
        shutil.move(settings["videos_from"], settings["videos_to"])

    shutil.rmtree(settings["videos_delete"])
    logger.debug(
        f"Successfully deleted the folder {settings['videos_delete']} folder of videos."
    )

View File

@@ -1,32 +0,0 @@
"""Specify global constants including file paths and scraping options.
"""
# Directory names
DATA = "../data"
IDS = "ids"
POSTS = "posts"
VIDEOS = "videos"
IMAGES = f"{DATA}/img"

# File names
POST_IDS = "post_ids.json"
VIDEO_IDS = "video_ids.json"
DATA_FILE = "data.json"

# Paths and names looked up by key elsewhere in the package
FILES = dict(
    data=DATA,
    ids=IDS,
    posts=POSTS,
    videos=VIDEOS,
    images=IMAGES,
    post_ids=f"{DATA}/{IDS}/{POST_IDS}",
    video_ids=f"{DATA}/{IDS}/{VIDEO_IDS}",
    data_file=DATA_FILE,
    downloads=[],
)

# Scraping options
PARAMETERS = dict(
    scraper_attempts=3,
    sleep=8,
)

View File

@@ -1,99 +0,0 @@
"""Analyze the frequency of hashtags appearing in the set of given posts.
- The "hashtag" positional argument specifies the hashtag of scraped posts to analyze
- The "n" positional argument specifies how many hashtags the user wants to analyze
- Specifying the "-d" flag prints the hashtag frequencies on the shell
- Specifying the "-p" flag plots the hashtag frequencies and saves as a png file
"""
import json
from datetime import datetime
import warnings
import logging
from typing import List, Tuple, Dict, Any
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
warnings.filterwarnings("ignore", message="Glyph (.*) missing from current font")
sns.set_theme(style="darkgrid")
def get_hashtags(obj: Dict) -> List[Tuple[str, int]]:
    """Count in how many posts each hashtag appears.

    Args:
        obj: posts, each with a "hashtags" list of entries carrying a "name".

    Returns:
        `(hashtag, count)` pairs sorted by descending count. A hashtag
        repeated within one post is counted once for that post.

    Raises:
        ValueError: if `obj` is empty.
    """
    if not obj:
        raise ValueError("Empty item, no hashtags could be extracted.")

    hashtags = {}  # type: Dict[str, int]
    for ele in obj:
        # A set, so a tag repeated within one post is only counted once
        for tag in {tag["name"] for tag in ele["hashtags"]}:
            hashtags[tag] = hashtags.get(tag, 0) + 1
    return sorted(hashtags.items(), key=lambda e: e[1], reverse=True)
def get_occurrences(filename: str, n: int = 1) -> Dict[str, Any]:
    """Aggregate hashtag frequency information for a specified JSON file.

    Example: {
                "total": total posts in the file,
                "top_n": [[top n hashtags], [frequencies of corresponding hashtags]]
             }

    Args:
        filename: JSON file containing a list of posts.
        n: number of most frequent hashtags to keep; capped at the number of
            posts in the file.
    """
    # Read explicitly as UTF-8 so non-ASCII hashtags don't depend on the
    # platform's default encoding
    with open(filename, encoding="utf-8") as f:
        obj = json.load(f)

    total = len(obj)
    top_tags = get_hashtags(obj)[0 : min(total, n)]
    return {
        "total": total,
        "top_n": [[tag for tag, _ in top_tags], [count for _, count in top_tags]],
    }
def plot(occs: dict, img_folder: str):
    """Draw a bar chart of common hashtags and save it through `save_plot`.

    The first entry of `occs["top_n"]` supplies the title hashtag and the
    count that the other entries are expressed as a percentage of; it is not
    drawn as a bar itself.
    """
    names = occs["top_n"][0]
    y_pos = list(reversed(range(len(names) - 1)))
    counts = occs["top_n"][1]
    max_count = counts[0]
    freqs = [count / max_count * 100 for count in counts[1:]]
    labels = names[1:]
    hashtag = names[0]

    fig, ax = plt.subplots(figsize=(5, 6.66))
    ax.barh(y_pos, freqs)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(labels)
    ax.grid(axis="y")
    ax.set_xlabel("Percent of posts with common hashtag")
    ax.set_ylim(min(y_pos) - 1, max(y_pos) + 1)
    ax.set_title(f"Common hashtags for #{hashtag} posts")
    ax.xaxis.set_major_formatter(mtick.PercentFormatter(decimals=0))

    save_plot(img_folder, hashtag)
def save_plot(img_folder, hashtag):
    """Save the current plot as `<img_folder>/<hashtag>_<timestamp>.png`."""
    current_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    filename = f"{img_folder}/{hashtag}_{current_time}.png"
    logging.info(f"Plot saved to file: {filename}")
    plt.savefig(filename, bbox_inches="tight", facecolor="white", dpi=300)
def print_occurrences(occs):
    """Print a ranked table of hashtags with their counts and frequencies.

    The frequency is each count divided by `occs["total"]`; ranks start at 0.
    """
    total_posts = occs["total"]
    names, counts = occs["top_n"][0], occs["top_n"][1]

    print(f"{'Rank':<8} {'Hashtag':<30} {'Occurrences':<15} {'Frequency':<15}")
    for rank, (name, count) in enumerate(zip(names, counts)):
        ratio = count / total_posts
        print(f"{rank:<8} {name:<30} {count:<15} {ratio:.4f}")
    print(f"Total posts: {total_posts}")

View File

@@ -1,5 +0,0 @@
# Enter a hashtag per line. Each line should contain only one word.
london
paris
tokyo
newyork

View File

@@ -1,36 +0,0 @@
[loggers]
keys=root,Logger
[handlers]
keys=consoleHandler,fileHandler
[formatters]
keys=consoleFormatter,fileFormatter
[logger_root]
level=DEBUG
handlers=consoleHandler
[logger_Logger]
level=DEBUG
handlers=consoleHandler,fileHandler
qualname=Logger
propagate=0
[handler_consoleHandler]
class=StreamHandler
level=INFO
formatter=consoleFormatter
args=(sys.stdout,)
[handler_fileHandler]
class=FileHandler
level=DEBUG
formatter=fileFormatter
args=("../logfile.log",)
[formatter_consoleFormatter]
format=%(message)s
[formatter_fileFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s

View File

@@ -1,150 +0,0 @@
"""Download post data or videos from TikToks containing one or more specified hashtags.
- The "-p" flag specifies that only data from posts is downloaded, no video files
- The "-v" flag specifies that only video files are downloaded, no post data
- Specifying both "-p" and "-v" flags downloads both post data and video files
- The "-t" flag allows the user to specify a list of space-separated hashtags as an argument
- The "-f" flag allows the user to specify the filename of a text file containing a list of newline-separated hashtags as an argument
"""
import os
import time
from typing import List, Tuple, Dict, Any, Optional
from tempfile import TemporaryDirectory
from tiktok_hashtag_analysis import global_data
import tiktok_hashtag_analysis.file_methods as file_methods
from tiktok_hashtag_analysis import data_methods
def get_hashtag_list(file_name: str) -> List[str]:
    """Extract list of newline-separated hashtags from text file.

    Lines starting with `#` are treated as comments and skipped, as are
    blank lines; surrounding whitespace is stripped from the rest.

    Raises:
        OSError: if `file_name` is not an existing file.
    """
    if not file_methods.check_existence(file_name, "file"):
        raise OSError(f"{file_name} does not exist")

    # Read explicitly as UTF-8 so non-ASCII hashtags don't depend on the
    # platform's default encoding
    with open(file_name, encoding="utf-8") as f:
        stripped = (line.strip() for line in f if not line.startswith("#"))
        return [tag for tag in stripped if tag]
def set_download_settings(download_data_type: Dict[str, bool]) -> Dict[str, Any]:
    """Build the `settings` dict from the constants in `global_data`.

    The ID directory (`<data>/<ids>`) is created if it doesn't exist. Entries
    for posts and for videos are only added when the matching flag in
    `download_data_type` is set.
    """
    files = global_data.FILES
    parameters = global_data.PARAMETERS

    settings = {
        "data": files["data"],
        "ids": files["ids"],
        "sleep": parameters["sleep"],
        "scraper": parameters["scraper_attempts"],
    }
    file_methods.check_file(f"{settings['data']}/{settings['ids']}", "dir")

    if download_data_type["posts"]:
        for key in ("posts", "post_ids", "data_file"):
            settings[key] = files[key]
    if download_data_type["videos"]:
        for key in ("videos", "video_ids"):
            settings[key] = files[key]
    return settings
def get_posts(settings: dict, tag: str) -> Optional[Tuple[str, int]]:
    """Scrape trending TikTok post data for one hashtag.

    Steps:
      1. `file_methods.download_posts` scrapes the hashtag into a temporary
         directory and returns the path of the scraped file (falsy on failure).
      2. `data_methods.extract_posts` works out which posts, if any, have not
         been downloaded before.
      3. `data_methods.update_posts` is called twice: once for the hashtag's
         data file and once for the post ID list.

    Args:
        settings: Settings dict; the "data", "posts", "data_file" and
            "post_ids" entries are read here.
        tag: Hashtag to scrape.

    Returns:
        Whatever the second `update_posts` call returns, or None when nothing
        was scraped or no new posts were found.
    """
    with TemporaryDirectory() as scratch_dir:
        scraped_path = file_methods.download_posts(settings, tag, scratch_dir)
        if not scraped_path:
            return None

        fresh = data_methods.extract_posts(settings, scraped_path, tag)
        if not fresh:
            return None

        posts_file = os.path.join(
            settings["data"], tag, settings["posts"], settings["data_file"]
        )
        # NOTE(review): fresh[1] looks like the post records and fresh[0] the
        # matching IDs - confirm against data_methods.extract_posts.
        data_methods.update_posts(posts_file, "file", fresh[1])
        return data_methods.update_posts(
            settings["post_ids"], "file", fresh[0], tag
        )
def get_videos(settings: dict, tag: str) -> Optional[Tuple[str, int]]:
"""Scrape trending TikTok video files for the specified hashtag.
1. Calls `file_methods.download_videos` to download the video files for a given hashtag
2. Calls `data_methods.extract_videos` to determine which if any videos
haven't previously been downloaded.
3. Calls `data_methods.update_videos` to update the ID list with the IDs of
newly downloaded videos.
4. Calls `file_methods.clean_video_files` to delete the residual video folder
after the data processing.
Returns the value returned by `data_methods.update_videos`, or None when
`update_videos` is never called.
"""
# Stays None unless the `update_videos` call below is reached.
number_scraped = None
download_list = file_methods.download_videos(settings, tag)
if download_list:
new_data = data_methods.extract_videos(settings, tag, download_list)
if new_data:
number_scraped = data_methods.update_videos(settings, new_data, tag)
# NOTE(review): indentation is not visible in this view, so it is unclear
# whether this `else` pairs with `if new_data` or with `if download_list`.
# Confirm against the original file before relying on when cleanup runs.
else:
file_methods.clean_video_files(settings, tag)
return number_scraped
def get_data(
    hashtags: list, download_data_type: Dict[str, bool]
) -> List[Tuple[str, Tuple[str, int]]]:
    """Scrape posts and/or videos for every hashtag in `hashtags`.

    Args:
        hashtags: Hashtags to scrape, processed in order.
        download_data_type: Flags keyed by "posts" and "videos" selecting
            which kinds of data to download. Both may be set, in which case
            all posts are scraped first and then all videos.

    Returns:
        One `(res[0], (kind, res[1]))` entry for each hashtag/kind whose
        scrape returned a truthy result, where `kind` is "posts" or "videos"
        and `res` is the value returned by `get_posts` / `get_videos`.
    """
    scraped_summary_list = []
    # Index of the final hashtag; no pause is needed once it has been handled.
    last_index = len(hashtags) - 1

    if download_data_type["posts"]:
        settings = set_download_settings(download_data_type)
        # Each pass walks the hashtag list from the start, so requesting both
        # posts and videos really processes every hashtag twice.
        for index, tag in enumerate(hashtags):
            posts_dir = os.path.join(settings["data"], tag, settings["posts"])
            file_methods.check_file(posts_dir, "dir")
            file_methods.check_file(
                os.path.join(posts_dir, settings["data_file"]), "file"
            )
            res = get_posts(settings, tag)
            if res:
                scraped_summary_list.append((res[0], ("posts", res[1])))
            # NOTE(review): the total is printed whether or not anything new
            # was scraped - confirm this matches the intended output.
            data_methods.print_total(settings["post_ids"], tag, "posts")
            # Pause between consecutive hashtags, but not after the last one.
            if index < last_index:
                time.sleep(settings["sleep"])

    if download_data_type["videos"]:
        settings = set_download_settings(download_data_type)
        for index, tag in enumerate(hashtags):
            file_methods.check_file(
                os.path.join(settings["data"], tag, settings["videos"]), "dir"
            )
            # Per-hashtag paths read by the video helpers in file_methods /
            # data_methods.
            settings["videos_delete"] = settings["data"] + f"/{tag}/videos/#{tag}"
            settings["videos_to"] = settings["data"] + f"/{tag}/videos"
            res = get_videos(settings, tag)
            if res:
                scraped_summary_list.append((res[0], ("videos", res[1])))
            data_methods.print_total(settings["video_ids"], tag, "videos")
            if index < last_index:
                time.sleep(settings["sleep"])

    return scraped_summary_list

View File

@@ -1,12 +0,0 @@
# Individual components of the package version string.
_MAJOR = "1"
_MINOR = "0"
# On main and in nightly releases, keep the patch number one ahead of the
# most recently released build.
_PATCH = "4"
# Used mainly by nightly builds, which carry a ".dev$DATE" suffix. See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""

VERSION_SHORT = f"{_MAJOR}.{_MINOR}"
__version__ = f"{VERSION_SHORT}.{_PATCH}{_SUFFIX}"