mirror of
https://github.com/bellingcat/tiktok-hashtag-analysis.git
synced 2026-06-07 19:08:32 +03:00
7
.github/workflows/python-publish.yaml
vendored
7
.github/workflows/python-publish.yaml
vendored
@@ -33,15 +33,12 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade --upgrade-strategy=eager pip setuptools wheel twine pipenv
|
||||
python -m pip install --upgrade --upgrade-strategy=eager pip setuptools wheel twine
|
||||
python -m pip install -e . --upgrade
|
||||
python -m pipenv install --dev --python 3.10
|
||||
env:
|
||||
PIPENV_DEFAULT_PYTHON_VERSION: "3.10"
|
||||
|
||||
- name: Build wheels
|
||||
run: |
|
||||
python -m pipenv run python setup.py sdist bdist_wheel
|
||||
python setup.py sdist bdist_wheel
|
||||
|
||||
- name: Publish a Python distribution to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1,5 +1,8 @@
|
||||
# Data directory
|
||||
data/
|
||||
build/
|
||||
*.egg-info/
|
||||
dist/
|
||||
|
||||
# Miscellaneous files
|
||||
**/.DS_Store
|
||||
|
||||
13
Pipfile
13
Pipfile
@@ -1,13 +0,0 @@
|
||||
[[source]]
|
||||
url = "https://pypi.org/simple"
|
||||
verify_ssl = true
|
||||
name = "pypi"
|
||||
|
||||
[packages]
|
||||
matplotlib = "*"
|
||||
seaborn = "*"
|
||||
|
||||
[dev-packages]
|
||||
|
||||
[requires]
|
||||
python_version = "3.10"
|
||||
416
Pipfile.lock
generated
416
Pipfile.lock
generated
@@ -1,416 +0,0 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "97c5ef0126b17f586b5fa1d518cf359b7e984e48f8fc2310e9aa79bd384c2374"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
"python_version": "3.10"
|
||||
},
|
||||
"sources": [
|
||||
{
|
||||
"name": "pypi",
|
||||
"url": "https://pypi.org/simple",
|
||||
"verify_ssl": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"contourpy": {
|
||||
"hashes": [
|
||||
"sha256:031154ed61f7328ad7f97662e48660a150ef84ee1bc8876b6472af88bf5a9b98",
|
||||
"sha256:0f9d350b639db6c2c233d92c7f213d94d2e444d8e8fc5ca44c9706cf72193772",
|
||||
"sha256:130230b7e49825c98edf0b428b7aa1125503d91732735ef897786fe5452b1ec2",
|
||||
"sha256:152fd8f730c31fd67fe0ffebe1df38ab6a669403da93df218801a893645c6ccc",
|
||||
"sha256:1c71fdd8f1c0f84ffd58fca37d00ca4ebaa9e502fb49825484da075ac0b0b803",
|
||||
"sha256:24847601071f740837aefb730e01bd169fbcaa610209779a78db7ebb6e6a7051",
|
||||
"sha256:2e9ebb4425fc1b658e13bace354c48a933b842d53c458f02c86f371cecbedecc",
|
||||
"sha256:30676ca45084ee61e9c3da589042c24a57592e375d4b138bd84d8709893a1ba4",
|
||||
"sha256:31a55dccc8426e71817e3fe09b37d6d48ae40aae4ecbc8c7ad59d6893569c436",
|
||||
"sha256:366a0cf0fc079af5204801786ad7a1c007714ee3909e364dbac1729f5b0849e5",
|
||||
"sha256:38e2e577f0f092b8e6774459317c05a69935a1755ecfb621c0a98f0e3c09c9a5",
|
||||
"sha256:3c184ad2433635f216645fdf0493011a4667e8d46b34082f5a3de702b6ec42e3",
|
||||
"sha256:3caea6365b13119626ee996711ab63e0c9d7496f65641f4459c60a009a1f3e80",
|
||||
"sha256:3e927b3868bd1e12acee7cc8f3747d815b4ab3e445a28d2e5373a7f4a6e76ba1",
|
||||
"sha256:4ee3ee247f795a69e53cd91d927146fb16c4e803c7ac86c84104940c7d2cabf0",
|
||||
"sha256:54d43960d809c4c12508a60b66cb936e7ed57d51fb5e30b513934a4a23874fae",
|
||||
"sha256:57119b0116e3f408acbdccf9eb6ef19d7fe7baf0d1e9aaa5381489bc1aa56556",
|
||||
"sha256:58569c491e7f7e874f11519ef46737cea1d6eda1b514e4eb5ac7dab6aa864d02",
|
||||
"sha256:5a011cf354107b47c58ea932d13b04d93c6d1d69b8b6dce885e642531f847566",
|
||||
"sha256:5caeacc68642e5f19d707471890f037a13007feba8427eb7f2a60811a1fc1350",
|
||||
"sha256:5dd34c1ae752515318224cba7fc62b53130c45ac6a1040c8b7c1a223c46e8967",
|
||||
"sha256:60835badb5ed5f4e194a6f21c09283dd6e007664a86101431bf870d9e86266c4",
|
||||
"sha256:62398c80ef57589bdbe1eb8537127321c1abcfdf8c5f14f479dbbe27d0322e66",
|
||||
"sha256:6381fa66866b0ea35e15d197fc06ac3840a9b2643a6475c8fff267db8b9f1e69",
|
||||
"sha256:64757f6460fc55d7e16ed4f1de193f362104285c667c112b50a804d482777edd",
|
||||
"sha256:69f8ff4db108815addd900a74df665e135dbbd6547a8a69333a68e1f6e368ac2",
|
||||
"sha256:6c180d89a28787e4b73b07e9b0e2dac7741261dbdca95f2b489c4f8f887dd810",
|
||||
"sha256:71b0bf0c30d432278793d2141362ac853859e87de0a7dee24a1cea35231f0d50",
|
||||
"sha256:769eef00437edf115e24d87f8926955f00f7704bede656ce605097584f9966dc",
|
||||
"sha256:7f6979d20ee5693a1057ab53e043adffa1e7418d734c1532e2d9e915b08d8ec2",
|
||||
"sha256:87f4d8941a9564cda3f7fa6a6cd9b32ec575830780677932abdec7bcb61717b0",
|
||||
"sha256:89ba9bb365446a22411f0673abf6ee1fea3b2cf47b37533b970904880ceb72f3",
|
||||
"sha256:8acf74b5d383414401926c1598ed77825cd530ac7b463ebc2e4f46638f56cce6",
|
||||
"sha256:9056c5310eb1daa33fc234ef39ebfb8c8e2533f088bbf0bc7350f70a29bde1ac",
|
||||
"sha256:95c3acddf921944f241b6773b767f1cbce71d03307270e2d769fd584d5d1092d",
|
||||
"sha256:9e20e5a1908e18aaa60d9077a6d8753090e3f85ca25da6e25d30dc0a9e84c2c6",
|
||||
"sha256:a1e97b86f73715e8670ef45292d7cc033548266f07d54e2183ecb3c87598888f",
|
||||
"sha256:a877ada905f7d69b2a31796c4b66e31a8068b37aa9b78832d41c82fc3e056ddd",
|
||||
"sha256:a9d7587d2fdc820cc9177139b56795c39fb8560f540bba9ceea215f1f66e1566",
|
||||
"sha256:abf298af1e7ad44eeb93501e40eb5a67abbf93b5d90e468d01fc0c4451971afa",
|
||||
"sha256:ae90d5a8590e5310c32a7630b4b8618cef7563cebf649011da80874d0aa8f414",
|
||||
"sha256:b6d0f9e1d39dbfb3977f9dd79f156c86eb03e57a7face96f199e02b18e58d32a",
|
||||
"sha256:b8d587cc39057d0afd4166083d289bdeff221ac6d3ee5046aef2d480dc4b503c",
|
||||
"sha256:c5210e5d5117e9aec8c47d9156d1d3835570dd909a899171b9535cb4a3f32693",
|
||||
"sha256:cc331c13902d0f50845099434cd936d49d7a2ca76cb654b39691974cb1e4812d",
|
||||
"sha256:ce41676b3d0dd16dbcfabcc1dc46090aaf4688fd6e819ef343dbda5a57ef0161",
|
||||
"sha256:d8165a088d31798b59e91117d1f5fc3df8168d8b48c4acc10fc0df0d0bdbcc5e",
|
||||
"sha256:e7281244c99fd7c6f27c1c6bfafba878517b0b62925a09b586d88ce750a016d2",
|
||||
"sha256:e96a08b62bb8de960d3a6afbc5ed8421bf1a2d9c85cc4ea73f4bc81b4910500f",
|
||||
"sha256:ed33433fc3820263a6368e532f19ddb4c5990855e4886088ad84fd7c4e561c71",
|
||||
"sha256:efb8f6d08ca7998cf59eaf50c9d60717f29a1a0a09caa46460d33b2924839dbd",
|
||||
"sha256:efe99298ba37e37787f6a2ea868265465410822f7bea163edcc1bd3903354ea9",
|
||||
"sha256:f99e9486bf1bb979d95d5cffed40689cb595abb2b841f2991fc894b3452290e8",
|
||||
"sha256:fc1464c97579da9f3ab16763c32e5c5d5bb5fa1ec7ce509a4ca6108b61b84fab",
|
||||
"sha256:fd7dc0e6812b799a34f6d12fcb1000539098c249c8da54f3566c6a6461d0dbad"
|
||||
],
|
||||
"markers": "python_version >= '3.8'",
|
||||
"version": "==1.0.7"
|
||||
},
|
||||
"cycler": {
|
||||
"hashes": [
|
||||
"sha256:3a27e95f763a428a739d2add979fa7494c912a32c17c4c38c4d5f082cad165a3",
|
||||
"sha256:9c87405839a19696e837b3b818fed3f5f69f16f1eec1a1ad77e043dcea9c772f"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==0.11.0"
|
||||
},
|
||||
"fonttools": {
|
||||
"hashes": [
|
||||
"sha256:2bb244009f9bf3fa100fc3ead6aeb99febe5985fa20afbfbaa2f8946c2fbdaf1",
|
||||
"sha256:820466f43c8be8c3009aef8b87e785014133508f0de64ec469e4efb643ae54fb"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==4.38.0"
|
||||
},
|
||||
"kiwisolver": {
|
||||
"hashes": [
|
||||
"sha256:02f79693ec433cb4b5f51694e8477ae83b3205768a6fb48ffba60549080e295b",
|
||||
"sha256:03baab2d6b4a54ddbb43bba1a3a2d1627e82d205c5cf8f4c924dc49284b87166",
|
||||
"sha256:1041feb4cda8708ce73bb4dcb9ce1ccf49d553bf87c3954bdfa46f0c3f77252c",
|
||||
"sha256:10ee06759482c78bdb864f4109886dff7b8a56529bc1609d4f1112b93fe6423c",
|
||||
"sha256:1d1573129aa0fd901076e2bfb4275a35f5b7aa60fbfb984499d661ec950320b0",
|
||||
"sha256:283dffbf061a4ec60391d51e6155e372a1f7a4f5b15d59c8505339454f8989e4",
|
||||
"sha256:28bc5b299f48150b5f822ce68624e445040595a4ac3d59251703779836eceff9",
|
||||
"sha256:2a66fdfb34e05b705620dd567f5a03f239a088d5a3f321e7b6ac3239d22aa286",
|
||||
"sha256:2e307eb9bd99801f82789b44bb45e9f541961831c7311521b13a6c85afc09767",
|
||||
"sha256:2e407cb4bd5a13984a6c2c0fe1845e4e41e96f183e5e5cd4d77a857d9693494c",
|
||||
"sha256:2f5e60fabb7343a836360c4f0919b8cd0d6dbf08ad2ca6b9cf90bf0c76a3c4f6",
|
||||
"sha256:36dafec3d6d6088d34e2de6b85f9d8e2324eb734162fba59d2ba9ed7a2043d5b",
|
||||
"sha256:3fe20f63c9ecee44560d0e7f116b3a747a5d7203376abeea292ab3152334d004",
|
||||
"sha256:41dae968a94b1ef1897cb322b39360a0812661dba7c682aa45098eb8e193dbdf",
|
||||
"sha256:4bd472dbe5e136f96a4b18f295d159d7f26fd399136f5b17b08c4e5f498cd494",
|
||||
"sha256:4ea39b0ccc4f5d803e3337dd46bcce60b702be4d86fd0b3d7531ef10fd99a1ac",
|
||||
"sha256:5853eb494c71e267912275e5586fe281444eb5e722de4e131cddf9d442615626",
|
||||
"sha256:5bce61af018b0cb2055e0e72e7d65290d822d3feee430b7b8203d8a855e78766",
|
||||
"sha256:6295ecd49304dcf3bfbfa45d9a081c96509e95f4b9d0eb7ee4ec0530c4a96514",
|
||||
"sha256:62ac9cc684da4cf1778d07a89bf5f81b35834cb96ca523d3a7fb32509380cbf6",
|
||||
"sha256:70e7c2e7b750585569564e2e5ca9845acfaa5da56ac46df68414f29fea97be9f",
|
||||
"sha256:7577c1987baa3adc4b3c62c33bd1118c3ef5c8ddef36f0f2c950ae0b199e100d",
|
||||
"sha256:75facbe9606748f43428fc91a43edb46c7ff68889b91fa31f53b58894503a191",
|
||||
"sha256:787518a6789009c159453da4d6b683f468ef7a65bbde796bcea803ccf191058d",
|
||||
"sha256:78d6601aed50c74e0ef02f4204da1816147a6d3fbdc8b3872d263338a9052c51",
|
||||
"sha256:7c43e1e1206cd421cd92e6b3280d4385d41d7166b3ed577ac20444b6995a445f",
|
||||
"sha256:81e38381b782cc7e1e46c4e14cd997ee6040768101aefc8fa3c24a4cc58e98f8",
|
||||
"sha256:841293b17ad704d70c578f1f0013c890e219952169ce8a24ebc063eecf775454",
|
||||
"sha256:872b8ca05c40d309ed13eb2e582cab0c5a05e81e987ab9c521bf05ad1d5cf5cb",
|
||||
"sha256:877272cf6b4b7e94c9614f9b10140e198d2186363728ed0f701c6eee1baec1da",
|
||||
"sha256:8c808594c88a025d4e322d5bb549282c93c8e1ba71b790f539567932722d7bd8",
|
||||
"sha256:8ed58b8acf29798b036d347791141767ccf65eee7f26bde03a71c944449e53de",
|
||||
"sha256:91672bacaa030f92fc2f43b620d7b337fd9a5af28b0d6ed3f77afc43c4a64b5a",
|
||||
"sha256:968f44fdbf6dd757d12920d63b566eeb4d5b395fd2d00d29d7ef00a00582aac9",
|
||||
"sha256:9f85003f5dfa867e86d53fac6f7e6f30c045673fa27b603c397753bebadc3008",
|
||||
"sha256:a553dadda40fef6bfa1456dc4be49b113aa92c2a9a9e8711e955618cd69622e3",
|
||||
"sha256:a68b62a02953b9841730db7797422f983935aeefceb1679f0fc85cbfbd311c32",
|
||||
"sha256:abbe9fa13da955feb8202e215c4018f4bb57469b1b78c7a4c5c7b93001699938",
|
||||
"sha256:ad881edc7ccb9d65b0224f4e4d05a1e85cf62d73aab798943df6d48ab0cd79a1",
|
||||
"sha256:b1792d939ec70abe76f5054d3f36ed5656021dcad1322d1cc996d4e54165cef9",
|
||||
"sha256:b428ef021242344340460fa4c9185d0b1f66fbdbfecc6c63eff4b7c29fad429d",
|
||||
"sha256:b533558eae785e33e8c148a8d9921692a9fe5aa516efbdff8606e7d87b9d5824",
|
||||
"sha256:ba59c92039ec0a66103b1d5fe588fa546373587a7d68f5c96f743c3396afc04b",
|
||||
"sha256:bc8d3bd6c72b2dd9decf16ce70e20abcb3274ba01b4e1c96031e0c4067d1e7cd",
|
||||
"sha256:bc9db8a3efb3e403e4ecc6cd9489ea2bac94244f80c78e27c31dcc00d2790ac2",
|
||||
"sha256:bf7d9fce9bcc4752ca4a1b80aabd38f6d19009ea5cbda0e0856983cf6d0023f5",
|
||||
"sha256:c2dbb44c3f7e6c4d3487b31037b1bdbf424d97687c1747ce4ff2895795c9bf69",
|
||||
"sha256:c79ebe8f3676a4c6630fd3f777f3cfecf9289666c84e775a67d1d358578dc2e3",
|
||||
"sha256:c97528e64cb9ebeff9701e7938653a9951922f2a38bd847787d4a8e498cc83ae",
|
||||
"sha256:d0611a0a2a518464c05ddd5a3a1a0e856ccc10e67079bb17f265ad19ab3c7597",
|
||||
"sha256:d06adcfa62a4431d404c31216f0f8ac97397d799cd53800e9d3efc2fbb3cf14e",
|
||||
"sha256:d41997519fcba4a1e46eb4a2fe31bc12f0ff957b2b81bac28db24744f333e955",
|
||||
"sha256:d5b61785a9ce44e5a4b880272baa7cf6c8f48a5180c3e81c59553ba0cb0821ca",
|
||||
"sha256:da152d8cdcab0e56e4f45eb08b9aea6455845ec83172092f09b0e077ece2cf7a",
|
||||
"sha256:da7e547706e69e45d95e116e6939488d62174e033b763ab1496b4c29b76fabea",
|
||||
"sha256:db5283d90da4174865d520e7366801a93777201e91e79bacbac6e6927cbceede",
|
||||
"sha256:db608a6757adabb32f1cfe6066e39b3706d8c3aa69bbc353a5b61edad36a5cb4",
|
||||
"sha256:e0ea21f66820452a3f5d1655f8704a60d66ba1191359b96541eaf457710a5fc6",
|
||||
"sha256:e7da3fec7408813a7cebc9e4ec55afed2d0fd65c4754bc376bf03498d4e92686",
|
||||
"sha256:e92a513161077b53447160b9bd8f522edfbed4bd9759e4c18ab05d7ef7e49408",
|
||||
"sha256:ecb1fa0db7bf4cff9dac752abb19505a233c7f16684c5826d1f11ebd9472b871",
|
||||
"sha256:efda5fc8cc1c61e4f639b8067d118e742b812c930f708e6667a5ce0d13499e29",
|
||||
"sha256:f0a1dbdb5ecbef0d34eb77e56fcb3e95bbd7e50835d9782a45df81cc46949750",
|
||||
"sha256:f0a71d85ecdd570ded8ac3d1c0f480842f49a40beb423bb8014539a9f32a5897",
|
||||
"sha256:f4f270de01dd3e129a72efad823da90cc4d6aafb64c410c9033aba70db9f1ff0",
|
||||
"sha256:f6cb459eea32a4e2cf18ba5fcece2dbdf496384413bc1bae15583f19e567f3b2",
|
||||
"sha256:f8ad8285b01b0d4695102546b342b493b3ccc6781fc28c8c6a1bb63e95d22f09",
|
||||
"sha256:f9f39e2f049db33a908319cf46624a569b36983c7c78318e9726a4cb8923b26c"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==1.4.4"
|
||||
},
|
||||
"matplotlib": {
|
||||
"hashes": [
|
||||
"sha256:01b7f521a9a73c383825813af255f8c4485d1706e4f3e2ed5ae771e4403a40ab",
|
||||
"sha256:11011c97d62c1db7bc20509572557842dbb8c2a2ddd3dd7f20501aa1cde3e54e",
|
||||
"sha256:1183877d008c752d7d535396096c910f4663e4b74a18313adee1213328388e1e",
|
||||
"sha256:12f999661589981e74d793ee2f41b924b3b87d65fd929f6153bf0f30675c59b1",
|
||||
"sha256:1c235bf9be052347373f589e018988cad177abb3f997ab1a2e2210c41562cc0c",
|
||||
"sha256:1f4d69707b1677560cd952544ee4962f68ff07952fb9069ff8c12b56353cb8c9",
|
||||
"sha256:1fcc4cad498533d3c393a160975acc9b36ffa224d15a6b90ae579eacee5d8579",
|
||||
"sha256:2787a16df07370dcba385fe20cdd0cc3cfaabd3c873ddabca78c10514c799721",
|
||||
"sha256:29f17b7f2e068dc346687cbdf80b430580bab42346625821c2d3abf3a1ec5417",
|
||||
"sha256:38d38cb1ea1d80ee0f6351b65c6f76cad6060bbbead015720ba001348ae90f0c",
|
||||
"sha256:3f56a7252eee8f3438447f75f5e1148a1896a2756a92285fe5d73bed6deebff4",
|
||||
"sha256:5223affa21050fb6118353c1380c15e23aedfb436bf3e162c26dc950617a7519",
|
||||
"sha256:57ad1aee29043163374bfa8990e1a2a10ff72c9a1bfaa92e9c46f6ea59269121",
|
||||
"sha256:59400cc9451094b7f08cc3f321972e6e1db4cd37a978d4e8a12824bf7fd2f03b",
|
||||
"sha256:68d94a436f62b8a861bf3ace82067a71bafb724b4e4f9133521e4d8012420dd7",
|
||||
"sha256:6adc441b5b2098a4b904bbf9d9e92fb816fef50c55aa2ea6a823fc89b94bb838",
|
||||
"sha256:6d81b11ede69e3a751424b98dc869c96c10256b2206bfdf41f9c720eee86844c",
|
||||
"sha256:73b93af33634ed919e72811c9703e1105185cd3fb46d76f30b7f4cfbbd063f89",
|
||||
"sha256:77b384cee7ab8cf75ffccbfea351a09b97564fc62d149827a5e864bec81526e5",
|
||||
"sha256:79e501eb847f4a489eb7065bb8d3187117f65a4c02d12ea3a19d6c5bef173bcc",
|
||||
"sha256:809119d1cba3ece3c9742eb01827fe7a0e781ea3c5d89534655a75e07979344f",
|
||||
"sha256:80c166a0e28512e26755f69040e6bf2f946a02ffdb7c00bf6158cca3d2b146e6",
|
||||
"sha256:81b409b2790cf8d7c1ef35920f01676d2ae7afa8241844e7aa5484fdf493a9a0",
|
||||
"sha256:994637e2995b0342699b396a320698b07cd148bbcf2dd2fa2daba73f34dd19f2",
|
||||
"sha256:9ceebaf73f1a3444fa11014f38b9da37ff7ea328d6efa1652241fe3777bfdab9",
|
||||
"sha256:9fb8fb19d03abf3c5dab89a8677e62c4023632f919a62b6dd1d6d2dbf42cd9f5",
|
||||
"sha256:acc3b1a4bddbf56fe461e36fb9ef94c2cb607fc90d24ccc650040bfcc7610de4",
|
||||
"sha256:bbddfeb1495484351fb5b30cf5bdf06b3de0bc4626a707d29e43dfd61af2a780",
|
||||
"sha256:bbf269e1d24bc25247095d71c7a969813f7080e2a7c6fa28931a603f747ab012",
|
||||
"sha256:bebcff4c3ed02c6399d47329f3554193abd824d3d53b5ca02cf583bcd94470e2",
|
||||
"sha256:c3f08df2ac4636249b8bc7a85b8b82c983bef1441595936f62c2918370ca7e1d",
|
||||
"sha256:ca94f0362f6b6f424b555b956971dcb94b12d0368a6c3e07dc7a40d32d6d873d",
|
||||
"sha256:d00c248ab6b92bea3f8148714837937053a083ff03b4c5e30ed37e28fc0e7e56",
|
||||
"sha256:d2cfaa7fd62294d945b8843ea24228a27c8e7c5b48fa634f3c168153b825a21b",
|
||||
"sha256:d5f18430f5cfa5571ab8f4c72c89af52aa0618e864c60028f11a857d62200cba",
|
||||
"sha256:debeab8e2ab07e5e3dac33e12456da79c7e104270d2b2d1df92b9e40347cca75",
|
||||
"sha256:dfba7057609ca9567b9704626756f0142e97ec8c5ba2c70c6e7bd1c25ef99f06",
|
||||
"sha256:e0a64d7cc336b52e90f59e6d638ae847b966f68582a7af041e063d568e814740",
|
||||
"sha256:eb9421c403ffd387fbe729de6d9a03005bf42faba5e8432f4e51e703215b49fc",
|
||||
"sha256:faff486b36530a836a6b4395850322e74211cd81fc17f28b4904e1bd53668e3e",
|
||||
"sha256:ff2aa84e74f80891e6bcf292ebb1dd57714ffbe13177642d65fee25384a30894"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.6.3"
|
||||
},
|
||||
"numpy": {
|
||||
"hashes": [
|
||||
"sha256:003a9f530e880cb2cd177cba1af7220b9aa42def9c4afc2a2fc3ee6be7eb2b22",
|
||||
"sha256:150947adbdfeceec4e5926d956a06865c1c690f2fd902efede4ca6fe2e657c3f",
|
||||
"sha256:2620e8592136e073bd12ee4536149380695fbe9ebeae845b81237f986479ffc9",
|
||||
"sha256:2eabd64ddb96a1239791da78fa5f4e1693ae2dadc82a76bc76a14cbb2b966e96",
|
||||
"sha256:4173bde9fa2a005c2c6e2ea8ac1618e2ed2c1c6ec8a7657237854d42094123a0",
|
||||
"sha256:4199e7cfc307a778f72d293372736223e39ec9ac096ff0a2e64853b866a8e18a",
|
||||
"sha256:4cecaed30dc14123020f77b03601559fff3e6cd0c048f8b5289f4eeabb0eb281",
|
||||
"sha256:557d42778a6869c2162deb40ad82612645e21d79e11c1dc62c6e82a2220ffb04",
|
||||
"sha256:63e45511ee4d9d976637d11e6c9864eae50e12dc9598f531c035265991910468",
|
||||
"sha256:6524630f71631be2dabe0c541e7675db82651eb998496bbe16bc4f77f0772253",
|
||||
"sha256:76807b4063f0002c8532cfeac47a3068a69561e9c8715efdad3c642eb27c0756",
|
||||
"sha256:7de8fdde0003f4294655aa5d5f0a89c26b9f22c0a58790c38fae1ed392d44a5a",
|
||||
"sha256:889b2cc88b837d86eda1b17008ebeb679d82875022200c6e8e4ce6cf549b7acb",
|
||||
"sha256:92011118955724465fb6853def593cf397b4a1367495e0b59a7e69d40c4eb71d",
|
||||
"sha256:97cf27e51fa078078c649a51d7ade3c92d9e709ba2bfb97493007103c741f1d0",
|
||||
"sha256:9a23f8440561a633204a67fb44617ce2a299beecf3295f0d13c495518908e910",
|
||||
"sha256:a51725a815a6188c662fb66fb32077709a9ca38053f0274640293a14fdd22978",
|
||||
"sha256:a77d3e1163a7770164404607b7ba3967fb49b24782a6ef85d9b5f54126cc39e5",
|
||||
"sha256:adbdce121896fd3a17a77ab0b0b5eedf05a9834a18699db6829a64e1dfccca7f",
|
||||
"sha256:c29e6bd0ec49a44d7690ecb623a8eac5ab8a923bce0bea6293953992edf3a76a",
|
||||
"sha256:c72a6b2f4af1adfe193f7beb91ddf708ff867a3f977ef2ec53c0ffb8283ab9f5",
|
||||
"sha256:d0a2db9d20117bf523dde15858398e7c0858aadca7c0f088ac0d6edd360e9ad2",
|
||||
"sha256:e3ab5d32784e843fc0dd3ab6dcafc67ef806e6b6828dc6af2f689be0eb4d781d",
|
||||
"sha256:e428c4fbfa085f947b536706a2fc349245d7baa8334f0c5723c56a10595f9b95",
|
||||
"sha256:e8d2859428712785e8a8b7d2b3ef0a1d1565892367b32f915c4a4df44d0e64f5",
|
||||
"sha256:eef70b4fc1e872ebddc38cddacc87c19a3709c0e3e5d20bf3954c147b1dd941d",
|
||||
"sha256:f64bb98ac59b3ea3bf74b02f13836eb2e24e48e0ab0145bbda646295769bd780",
|
||||
"sha256:f9006288bcf4895917d02583cf3411f98631275bc67cce355a7f39f8c14338fa"
|
||||
],
|
||||
"markers": "python_version >= '3.8'",
|
||||
"version": "==1.24.2"
|
||||
},
|
||||
"packaging": {
|
||||
"hashes": [
|
||||
"sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2",
|
||||
"sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==23.0"
|
||||
},
|
||||
"pandas": {
|
||||
"hashes": [
|
||||
"sha256:14e45300521902689a81f3f41386dc86f19b8ba8dd5ac5a3c7010ef8d2932813",
|
||||
"sha256:26d9c71772c7afb9d5046e6e9cf42d83dd147b5cf5bcb9d97252077118543792",
|
||||
"sha256:3749077d86e3a2f0ed51367f30bf5b82e131cc0f14260c4d3e499186fccc4406",
|
||||
"sha256:41179ce559943d83a9b4bbacb736b04c928b095b5f25dd2b7389eda08f46f373",
|
||||
"sha256:478ff646ca42b20376e4ed3fa2e8d7341e8a63105586efe54fa2508ee087f328",
|
||||
"sha256:50869a35cbb0f2e0cd5ec04b191e7b12ed688874bd05dd777c19b28cbea90996",
|
||||
"sha256:565fa34a5434d38e9d250af3c12ff931abaf88050551d9fbcdfafca50d62babf",
|
||||
"sha256:5f2b952406a1588ad4cad5b3f55f520e82e902388a6d5a4a91baa8d38d23c7f6",
|
||||
"sha256:5fbcb19d6fceb9e946b3e23258757c7b225ba450990d9ed63ccceeb8cae609f7",
|
||||
"sha256:6973549c01ca91ec96199e940495219c887ea815b2083722821f1d7abfa2b4dc",
|
||||
"sha256:74a3fd7e5a7ec052f183273dc7b0acd3a863edf7520f5d3a1765c04ffdb3b0b1",
|
||||
"sha256:7a0a56cef15fd1586726dace5616db75ebcfec9179a3a55e78f72c5639fa2a23",
|
||||
"sha256:7cec0bee9f294e5de5bbfc14d0573f65526071029d036b753ee6507d2a21480a",
|
||||
"sha256:87bd9c03da1ac870a6d2c8902a0e1fd4267ca00f13bc494c9e5a9020920e1d51",
|
||||
"sha256:972d8a45395f2a2d26733eb8d0f629b2f90bebe8e8eddbb8829b180c09639572",
|
||||
"sha256:9842b6f4b8479e41968eced654487258ed81df7d1c9b7b870ceea24ed9459b31",
|
||||
"sha256:9f69c4029613de47816b1bb30ff5ac778686688751a5e9c99ad8c7031f6508e5",
|
||||
"sha256:a50d9a4336a9621cab7b8eb3fb11adb82de58f9b91d84c2cd526576b881a0c5a",
|
||||
"sha256:bc4c368f42b551bf72fac35c5128963a171b40dce866fb066540eeaf46faa003",
|
||||
"sha256:c39a8da13cede5adcd3be1182883aea1c925476f4e84b2807a46e2775306305d",
|
||||
"sha256:c3ac844a0fe00bfaeb2c9b51ab1424e5c8744f89860b138434a363b1f620f354",
|
||||
"sha256:c4c00e0b0597c8e4f59e8d461f797e5d70b4d025880516a8261b2817c47759ee",
|
||||
"sha256:c74a62747864ed568f5a82a49a23a8d7fe171d0c69038b38cedf0976831296fa",
|
||||
"sha256:dd05f7783b3274aa206a1af06f0ceed3f9b412cf665b7247eacd83be41cf7bf0",
|
||||
"sha256:dfd681c5dc216037e0b0a2c821f5ed99ba9f03ebcf119c7dac0e9a7b960b9ec9",
|
||||
"sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae",
|
||||
"sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc"
|
||||
],
|
||||
"markers": "python_version >= '3.8'",
|
||||
"version": "==1.5.3"
|
||||
},
|
||||
"pillow": {
|
||||
"hashes": [
|
||||
"sha256:013016af6b3a12a2f40b704677f8b51f72cb007dac785a9933d5c86a72a7fe33",
|
||||
"sha256:0845adc64fe9886db00f5ab68c4a8cd933ab749a87747555cec1c95acea64b0b",
|
||||
"sha256:0884ba7b515163a1a05440a138adeb722b8a6ae2c2b33aea93ea3118dd3a899e",
|
||||
"sha256:09b89ddc95c248ee788328528e6a2996e09eaccddeeb82a5356e92645733be35",
|
||||
"sha256:0dd4c681b82214b36273c18ca7ee87065a50e013112eea7d78c7a1b89a739153",
|
||||
"sha256:0e51f608da093e5d9038c592b5b575cadc12fd748af1479b5e858045fff955a9",
|
||||
"sha256:0f3269304c1a7ce82f1759c12ce731ef9b6e95b6df829dccd9fe42912cc48569",
|
||||
"sha256:16a8df99701f9095bea8a6c4b3197da105df6f74e6176c5b410bc2df2fd29a57",
|
||||
"sha256:19005a8e58b7c1796bc0167862b1f54a64d3b44ee5d48152b06bb861458bc0f8",
|
||||
"sha256:1b4b4e9dda4f4e4c4e6896f93e84a8f0bcca3b059de9ddf67dac3c334b1195e1",
|
||||
"sha256:28676836c7796805914b76b1837a40f76827ee0d5398f72f7dcc634bae7c6264",
|
||||
"sha256:2968c58feca624bb6c8502f9564dd187d0e1389964898f5e9e1fbc8533169157",
|
||||
"sha256:3f4cc516e0b264c8d4ccd6b6cbc69a07c6d582d8337df79be1e15a5056b258c9",
|
||||
"sha256:3fa1284762aacca6dc97474ee9c16f83990b8eeb6697f2ba17140d54b453e133",
|
||||
"sha256:43521ce2c4b865d385e78579a082b6ad1166ebed2b1a2293c3be1d68dd7ca3b9",
|
||||
"sha256:451f10ef963918e65b8869e17d67db5e2f4ab40e716ee6ce7129b0cde2876eab",
|
||||
"sha256:46c259e87199041583658457372a183636ae8cd56dbf3f0755e0f376a7f9d0e6",
|
||||
"sha256:46f39cab8bbf4a384ba7cb0bc8bae7b7062b6a11cfac1ca4bc144dea90d4a9f5",
|
||||
"sha256:519e14e2c49fcf7616d6d2cfc5c70adae95682ae20f0395e9280db85e8d6c4df",
|
||||
"sha256:53dcb50fbdc3fb2c55431a9b30caeb2f7027fcd2aeb501459464f0214200a503",
|
||||
"sha256:54614444887e0d3043557d9dbc697dbb16cfb5a35d672b7a0fcc1ed0cf1c600b",
|
||||
"sha256:575d8912dca808edd9acd6f7795199332696d3469665ef26163cd090fa1f8bfa",
|
||||
"sha256:5dd5a9c3091a0f414a963d427f920368e2b6a4c2f7527fdd82cde8ef0bc7a327",
|
||||
"sha256:5f532a2ad4d174eb73494e7397988e22bf427f91acc8e6ebf5bb10597b49c493",
|
||||
"sha256:60e7da3a3ad1812c128750fc1bc14a7ceeb8d29f77e0a2356a8fb2aa8925287d",
|
||||
"sha256:653d7fb2df65efefbcbf81ef5fe5e5be931f1ee4332c2893ca638c9b11a409c4",
|
||||
"sha256:6663977496d616b618b6cfa43ec86e479ee62b942e1da76a2c3daa1c75933ef4",
|
||||
"sha256:6abfb51a82e919e3933eb137e17c4ae9c0475a25508ea88993bb59faf82f3b35",
|
||||
"sha256:6c6b1389ed66cdd174d040105123a5a1bc91d0aa7059c7261d20e583b6d8cbd2",
|
||||
"sha256:6d9dfb9959a3b0039ee06c1a1a90dc23bac3b430842dcb97908ddde05870601c",
|
||||
"sha256:765cb54c0b8724a7c12c55146ae4647e0274a839fb6de7bcba841e04298e1011",
|
||||
"sha256:7a21222644ab69ddd9967cfe6f2bb420b460dae4289c9d40ff9a4896e7c35c9a",
|
||||
"sha256:7ac7594397698f77bce84382929747130765f66406dc2cd8b4ab4da68ade4c6e",
|
||||
"sha256:7cfc287da09f9d2a7ec146ee4d72d6ea1342e770d975e49a8621bf54eaa8f30f",
|
||||
"sha256:83125753a60cfc8c412de5896d10a0a405e0bd88d0470ad82e0869ddf0cb3848",
|
||||
"sha256:847b114580c5cc9ebaf216dd8c8dbc6b00a3b7ab0131e173d7120e6deade1f57",
|
||||
"sha256:87708d78a14d56a990fbf4f9cb350b7d89ee8988705e58e39bdf4d82c149210f",
|
||||
"sha256:8a2b5874d17e72dfb80d917213abd55d7e1ed2479f38f001f264f7ce7bae757c",
|
||||
"sha256:8f127e7b028900421cad64f51f75c051b628db17fb00e099eb148761eed598c9",
|
||||
"sha256:94cdff45173b1919350601f82d61365e792895e3c3a3443cf99819e6fbf717a5",
|
||||
"sha256:99d92d148dd03fd19d16175b6d355cc1b01faf80dae93c6c3eb4163709edc0a9",
|
||||
"sha256:9a3049a10261d7f2b6514d35bbb7a4dfc3ece4c4de14ef5876c4b7a23a0e566d",
|
||||
"sha256:9d9a62576b68cd90f7075876f4e8444487db5eeea0e4df3ba298ee38a8d067b0",
|
||||
"sha256:9e5f94742033898bfe84c93c831a6f552bb629448d4072dd312306bab3bd96f1",
|
||||
"sha256:a1c2d7780448eb93fbcc3789bf3916aa5720d942e37945f4056680317f1cd23e",
|
||||
"sha256:a2e0f87144fcbbe54297cae708c5e7f9da21a4646523456b00cc956bd4c65815",
|
||||
"sha256:a4dfdae195335abb4e89cc9762b2edc524f3c6e80d647a9a81bf81e17e3fb6f0",
|
||||
"sha256:a96e6e23f2b79433390273eaf8cc94fec9c6370842e577ab10dabdcc7ea0a66b",
|
||||
"sha256:aabdab8ec1e7ca7f1434d042bf8b1e92056245fb179790dc97ed040361f16bfd",
|
||||
"sha256:b222090c455d6d1a64e6b7bb5f4035c4dff479e22455c9eaa1bdd4c75b52c80c",
|
||||
"sha256:b52ff4f4e002f828ea6483faf4c4e8deea8d743cf801b74910243c58acc6eda3",
|
||||
"sha256:b70756ec9417c34e097f987b4d8c510975216ad26ba6e57ccb53bc758f490dab",
|
||||
"sha256:b8c2f6eb0df979ee99433d8b3f6d193d9590f735cf12274c108bd954e30ca858",
|
||||
"sha256:b9b752ab91e78234941e44abdecc07f1f0d8f51fb62941d32995b8161f68cfe5",
|
||||
"sha256:ba6612b6548220ff5e9df85261bddc811a057b0b465a1226b39bfb8550616aee",
|
||||
"sha256:bd752c5ff1b4a870b7661234694f24b1d2b9076b8bf337321a814c612665f343",
|
||||
"sha256:c3c4ed2ff6760e98d262e0cc9c9a7f7b8a9f61aa4d47c58835cdaf7b0b8811bb",
|
||||
"sha256:c5c1362c14aee73f50143d74389b2c158707b4abce2cb055b7ad37ce60738d47",
|
||||
"sha256:cb362e3b0976dc994857391b776ddaa8c13c28a16f80ac6522c23d5257156bed",
|
||||
"sha256:d197df5489004db87d90b918033edbeee0bd6df3848a204bca3ff0a903bef837",
|
||||
"sha256:d3b56206244dc8711f7e8b7d6cad4663917cd5b2d950799425076681e8766286",
|
||||
"sha256:d5b2f8a31bd43e0f18172d8ac82347c8f37ef3e0b414431157718aa234991b28",
|
||||
"sha256:d7081c084ceb58278dd3cf81f836bc818978c0ccc770cbbb202125ddabec6628",
|
||||
"sha256:db74f5562c09953b2c5f8ec4b7dfd3f5421f31811e97d1dbc0a7c93d6e3a24df",
|
||||
"sha256:df41112ccce5d47770a0c13651479fbcd8793f34232a2dd9faeccb75eb5d0d0d",
|
||||
"sha256:e1339790c083c5a4de48f688b4841f18df839eb3c9584a770cbd818b33e26d5d",
|
||||
"sha256:e621b0246192d3b9cb1dc62c78cfa4c6f6d2ddc0ec207d43c0dedecb914f152a",
|
||||
"sha256:e8c5cf126889a4de385c02a2c3d3aba4b00f70234bfddae82a5eaa3ee6d5e3e6",
|
||||
"sha256:e9d7747847c53a16a729b6ee5e737cf170f7a16611c143d95aa60a109a59c336",
|
||||
"sha256:eaef5d2de3c7e9b21f1e762f289d17b726c2239a42b11e25446abf82b26ac132",
|
||||
"sha256:ed3e4b4e1e6de75fdc16d3259098de7c6571b1a6cc863b1a49e7d3d53e036070",
|
||||
"sha256:ef21af928e807f10bf4141cad4746eee692a0dd3ff56cfb25fce076ec3cc8abe",
|
||||
"sha256:f09598b416ba39a8f489c124447b007fe865f786a89dbfa48bb5cf395693132a",
|
||||
"sha256:f0caf4a5dcf610d96c3bd32932bfac8aee61c96e60481c2a0ea58da435e25acd",
|
||||
"sha256:f6e78171be3fb7941f9910ea15b4b14ec27725865a73c15277bc39f5ca4f8391",
|
||||
"sha256:f715c32e774a60a337b2bb8ad9839b4abf75b267a0f18806f6f4f5f1688c4b5a",
|
||||
"sha256:fb5c1ad6bad98c57482236a21bf985ab0ef42bd51f7ad4e4538e89a997624e12"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==9.4.0"
|
||||
},
|
||||
"pyparsing": {
|
||||
"hashes": [
|
||||
"sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb",
|
||||
"sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"
|
||||
],
|
||||
"markers": "python_full_version >= '3.6.8'",
|
||||
"version": "==3.0.9"
|
||||
},
|
||||
"python-dateutil": {
|
||||
"hashes": [
|
||||
"sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
|
||||
"sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
|
||||
"version": "==2.8.2"
|
||||
},
|
||||
"pytz": {
|
||||
"hashes": [
|
||||
"sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0",
|
||||
"sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a"
|
||||
],
|
||||
"version": "==2022.7.1"
|
||||
},
|
||||
"seaborn": {
|
||||
"hashes": [
|
||||
"sha256:374645f36509d0dcab895cba5b47daf0586f77bfe3b36c97c607db7da5be0139",
|
||||
"sha256:ebf15355a4dba46037dfd65b7350f014ceb1f13c05e814eda2c9f5fd731afc08"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.12.2"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
|
||||
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
|
||||
"version": "==1.16.0"
|
||||
}
|
||||
},
|
||||
"develop": {}
|
||||
}
|
||||
145
README.md
145
README.md
@@ -1,16 +1,12 @@
|
||||
# TikTok hashtag analysis toolset
|
||||
|
||||
> IMPORTANT NOTE: this tool relies on [drawrowfly/tiktok-scraper](https://github.com/drawrowfly/tiktok-scraper), which appears to be broken at the time of writing and has not been updated for some time. Several open issues ([#796](https://github.com/drawrowfly/tiktok-scraper/issues/796), [#799](https://github.com/drawrowfly/tiktok-scraper/issues/799)) need to be fixed before this library can work smoothly :/
|
||||
|
||||
The tool helps to download posts and videos from TikTok for a given set of hashtags over a period of time. Users can create a growing database of posts for specific hashtags which can then be used for further hashtag analysis. It uses the [tiktok-scraper](https://github.com/drawrowfly/tiktok-scraper) Node package to download the posts and videos.
|
||||
The tool helps to download posts and videos from TikTok for a given set of hashtags over a period of time. Users can create a growing database of posts for specific hashtags which can then be used for further hashtag analysis. It uses the [TikTokApi](https://github.com/davidteather/TikTok-Api) Python package to download the posts and uses [yt-dlp](https://github.com/yt-dlp/yt-dlp) to download the videos.
|
||||
|
||||
[![PyPI version](https://badge.fury.io/py/tiktok-hashtag-analysis.svg)](https://badge.fury.io/py/tiktok-hashtag-analysis)
|
||||
|
||||
## Pre-requisites
|
||||
1. Make sure you have Python 3.6 or a later version installed
|
||||
2. And, you need to have node version 16. On Mac, do `brew install node` followed by `npm install -g n` and then `n 16`
|
||||
4. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper
|
||||
5. Install the tool with pip: `pip install tiktok-hashtag-analysis`
|
||||
1. Make sure you have Python 3.9 or a later version installed
|
||||
2. Install the tool with pip: `pip install tiktok-hashtag-analysis`
|
||||
1. or directly from the repo version: `pip install git+https://github.com/bellingcat/tiktok-hashtag-analysis`
|
||||
|
||||
You should now be ready to start using it.
|
||||
@@ -19,88 +15,83 @@ You should now be ready to start using it.
|
||||
## About the tool
|
||||
### Command-line arguments
|
||||
```
|
||||
tiktok-hashtag-analysis --help
|
||||
usage: tiktok-hashtag-analysis [-h] [-t [T ...]] [-f F] [-p] [-v] [-ht HASHTAG] [-n NUMBER] [-plt] [-d] {download,frequencies}
|
||||
usage: tiktok-hashtag-analysis [-h] [--file FILE] [-d] [--number NUMBER] [-p] [-t] [--output-dir OUTPUT_DIR] [--config CONFIG] [--log LOG] [hashtags ...]
|
||||
|
||||
Analyze hashtags within posts scraped from TikTok.
|
||||
|
||||
positional arguments:
|
||||
{download,frequencies}
|
||||
command to initialize
|
||||
hashtags List of hashtags to scrape
|
||||
|
||||
options:
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
-t [T ...] List of hashtags to scrape (module: run_downloader)
|
||||
-f F File name containing list of hashtags to scrape (module: run_downloader)
|
||||
-p Download post data (module: run_downloader)
|
||||
-v Download video files (module: run_downloader)
|
||||
-ht HASHTAG, --hashtag HASHTAG
|
||||
The hashtag of scraped posts to analyze (module: hashtag_frequencies)
|
||||
-n NUMBER, --number NUMBER
|
||||
The number of top n occurrences (module: hashtag_frequencies)
|
||||
-plt, --plot Plot the occurrences (module: hashtag_frequencies)
|
||||
-d, --print List top n hashtags (module: hashtag_frequencies)
|
||||
--file FILE File name containing list of hashtags to scrape
|
||||
-d, --download Download video files corresponding to scraped posts
|
||||
--number NUMBER The number of co-occurring hashtags to analyze
|
||||
-p, --plot Plot the most common co-occurring hashtags
|
||||
-t, --table Print a table of the most common co-occurring hashtags
|
||||
--output-dir OUTPUT_DIR
|
||||
Directory to save scraped data and visualizations to
|
||||
--config CONFIG File name of configuration file to store TikTok credentials to
|
||||
--log LOG File to write logs to
|
||||
```
|
||||
|
||||
### Structure of output data
|
||||
```
|
||||
$ tree ../data
|
||||
../data
|
||||
├── ids
|
||||
│ └── post_ids.json
|
||||
├── london
|
||||
│ └── posts
|
||||
│ └── data.json
|
||||
│ ├── plots
|
||||
│ ├── posts.json
|
||||
│ └── media
|
||||
├── newyork
|
||||
│ └── posts
|
||||
│ └── data.json
|
||||
│ ├── plots
|
||||
│ ├── posts.json
|
||||
│ └── media
|
||||
└── paris
|
||||
└── posts
|
||||
└── data.json
|
||||
│ ├── plots
|
||||
│ ├── posts.json
|
||||
│ └── media
|
||||
```
|
||||
|
||||
|
||||
The `data` folder contains all the downloaded data as shown in the tree diagram above.
|
||||
- The `ids` folder contains two files `post_ids.json` and `video_ids.json` that record the ids of the downloaded posts and videos for each hashtag.
|
||||
- Each hashtag has a folder with two subfolders `posts` and `videos` that store posts and videos respectively. The posts are stored in the `data.json` file in the `posts` folder, and videos are stored as the `.mp4` files in the `videos` folder.
|
||||
- Each hashtag has a folder with two subfolders `plots` and `media` that store plots of the most common co-occurring hashtags, and media downloaded from the posts. The posts are stored in the `posts.json` file, and downloaded media is stored as `.mp4` files (for videos) or audio and image files (for image galleries) in the `media` folder.
|
||||
|
||||
|
||||
## How to use
|
||||
### Post downloading
|
||||
Running the `tiktok-hashtag-analysis download` command with the following options will scrape posts containing the hashtags `#london`, `#paris`, or `#newyork`:
|
||||
Running the `tiktok-hashtag-analysis` command with the following options will scrape posts that contain the hashtags `#london`, `#paris`, or `#newyork`:
|
||||
|
||||
tiktok-hashtag-analysis download -t london paris newyork -p
|
||||
tiktok-hashtag-analysis london paris newyork
|
||||
|
||||
and will produce an output similar to the following log:
|
||||
|
||||
$ tiktok-hashtag-analysis download -t london paris newyork -p
|
||||
$ tiktok-hashtag-analysis london paris newyork
|
||||
Hashtags to scrape: ['london', 'paris', 'newyork']
|
||||
Scraped 963 posts containing the hashtag 'london'
|
||||
Scraped 961 posts containing the hashtag 'paris'
|
||||
Scraped 940 posts containing the hashtag 'newyork'
|
||||
Successfully scraped 2864 total entries
|
||||
|
||||
- The `-t` flag allows a space-separated list of hashtags to be specified as a command line argument
|
||||
- The `-p` flag specifies that posts, not videos, will be downloaded
|
||||
- The list of hashtags to scrape is specified as a positional argument
|
||||
|
||||
### Video downloading
|
||||
Running the `tiktok-hashtag-analysis download` script with the following options will scrape trending videos containing the hashtag `#london`:
|
||||
`tiktok-hashtag-analysis download -t london -v`
|
||||
Running the `tiktok-hashtag-analysis` command with the following options will scrape trending posts containing the hashtag `#london` and download the corresponding video files:
|
||||
`tiktok-hashtag-analysis london --download`
|
||||
|
||||
- The `-t` flag allows a space-separated list of hashtags to be specified as a command line argument
|
||||
- The `-v` flag specifies that videos, not posts, will be downloaded
|
||||
- The `--download` flag specifies that video files for scraped posts should be downloaded
|
||||
|
||||
Note that video downloading is a time and data rate consuming task, as a result we recommend using one hashtag at a time when using the `-v` flag to avoid complications.
|
||||
Note that downloading videos is time-consuming and uses a lot of bandwidth. We therefore recommend scraping one hashtag at a time when using the `--download` flag.
|
||||
|
||||
## Analyzing results
|
||||
### Top n hashtag occurrences
|
||||
The script `tiktok-hashtag-analysis frequencies` analyzes the frequencies of top occurring hashtags in a given set of posts.
|
||||
### Most common co-occurring hashtags
|
||||
In addition to scraping data and downloading media, the `tiktok-hashtag-analysis` script can also analyze the frequencies of the most common co-occurring hashtags in a given set of posts.
|
||||
|
||||
Assume we want to analyze the 20 most frequently occurring hashtags in the downloaded posts of the `#london` hashtag.
|
||||
Assume we want to analyze the 20 most frequently co-occurring hashtags in the downloaded posts of the `#london` hashtag.
|
||||
|
||||
- The results can be plotted and saved as a PNG file by executing the following command:
|
||||
|
||||
`tiktok-hashtag-analysis frequencies london 20 -p`
|
||||
`tiktok-hashtag-analysis london --number 20 --plot`
|
||||
|
||||
which will produce a figure similar to that shown below:
|
||||
<p align="center">
|
||||
@@ -111,32 +102,48 @@ Assume we want to analyze the 20 most frequently occurring hashtags in the downl
|
||||
|
||||
- The results can be displayed in tabular form by executing the following command:
|
||||
|
||||
`tiktok-hashtag-analysis frequencies london 20 -d`
|
||||
`tiktok-hashtag-analysis london --number 20 --table`
|
||||
|
||||
which will produce a terminal output similar to the following:
|
||||
```
|
||||
Rank Hashtag Occurrences Frequency
|
||||
0 london 960 1.0000
|
||||
1 fyp 494 0.5146
|
||||
2 uk 238 0.2479
|
||||
3 foryou 221 0.2302
|
||||
4 foryoupage 184 0.1917
|
||||
5 viral 179 0.1865
|
||||
6 fypシ 84 0.0875
|
||||
7 funny 56 0.0583
|
||||
8 xyzbca 51 0.0531
|
||||
9 british 45 0.0469
|
||||
10 england 44 0.0458
|
||||
11 trending 40 0.0417
|
||||
12 fy 33 0.0344
|
||||
13 comedy 32 0.0333
|
||||
14 roadman 28 0.0292
|
||||
15 4u 27 0.0281
|
||||
16 usa 26 0.0271
|
||||
17 tiktok 26 0.0271
|
||||
18 travel 21 0.0219
|
||||
19 america 20 0.0208
|
||||
Total posts: 960
|
||||
Co-occurring hashtags for #london posts
|
||||
Rank Hashtag Occurrences Frequency
|
||||
0 london 881 1.0000
|
||||
1 fyp 399 0.4529
|
||||
2 uk 174 0.1975
|
||||
3 foryou 168 0.1907
|
||||
4 viral 152 0.1725
|
||||
5 foryoupage 137 0.1555
|
||||
6 fypシ 73 0.0829
|
||||
7 funny 54 0.0613
|
||||
8 tiktok 43 0.0488
|
||||
9 trending 43 0.0488
|
||||
10 british 41 0.0465
|
||||
11 england 38 0.0431
|
||||
12 xyzbca 34 0.0386
|
||||
13 fy 33 0.0375
|
||||
14 usa 33 0.0375
|
||||
15 love 29 0.0329
|
||||
16 comedy 25 0.0284
|
||||
17 royalfamily 23 0.0261
|
||||
18 queen 23 0.0261
|
||||
19 queenelizabeth 22 0.0250
|
||||
Total posts: 881
|
||||
```
|
||||
|
||||
The `Frequency` column shows the ratio of the occurrence to the total number of downloaded posts.
|
||||
|
||||
### Contributing
|
||||
To run the built-in tests in the `tests/` directory, first install the test dependency packages:
|
||||
|
||||
```
|
||||
pip install .[test]
|
||||
```
|
||||
|
||||
and then run the tests using the following command:
|
||||
|
||||
```
|
||||
pytest
|
||||
```
|
||||
|
||||
This repo uses [black](https://github.com/psf/black) to format source code, please run the `black` command before submitting a PR.
|
||||
15
pytest.ini
Normal file
15
pytest.ini
Normal file
@@ -0,0 +1,15 @@
|
||||
[pytest]
|
||||
minversion =
|
||||
7.0.0
|
||||
testpaths =
|
||||
tests/
|
||||
python_files =
|
||||
*.py
|
||||
addopts =
|
||||
-vvv
|
||||
--cov='tiktok_hashtag_analysis'
|
||||
--cov-report html:reports/coverage
|
||||
--html='reports/tests.html'
|
||||
--self-contained-html
|
||||
filterwarnings =
|
||||
ignore:Glyph (.*) missing from current font
|
||||
@@ -1,2 +1,5 @@
|
||||
matplotlib
|
||||
seaborn
|
||||
seaborn==0.12.2
|
||||
matplotlib==3.7.2
|
||||
yt-dlp==2023.7.6
|
||||
TikTokApi==6.1.1
|
||||
requests==2.31.0
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
set -e
|
||||
|
||||
TAG=$(python -c 'from tiktok_hashtag_analysis.version import __version__; print("v" + __version__)')
|
||||
TAG=$(python -c 'from tiktok_hashtag_analysis import __version__; print("v" + __version__)')
|
||||
|
||||
read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt
|
||||
|
||||
|
||||
58
setup.py
58
setup.py
@@ -1,36 +1,64 @@
|
||||
from setuptools import setup, find_packages
|
||||
from tiktok_hashtag_analysis.version import __version__
|
||||
from setuptools import setup
|
||||
|
||||
|
||||
def read_requirements(filename: str):
    """Return the requirement specifiers listed in the file `filename`.

    Blank lines and lines starting with `#` are skipped. Requirements given
    as GitHub repository URLs ending in `.git` are rewritten to the
    `name @ url` form; every other line is returned stripped but otherwise
    unchanged.
    """
    import re

    github_url = re.compile(
        r"^(git\+)?(https|ssh)://(git@)?github\.com/([\w-]+)/(?P<name>[\w-]+)\.git"
    )

    def fix_url_dependencies(req: str) -> str:
        """Pip and setuptools disagree about how URL dependencies should be handled."""
        found = github_url.match(req)
        return req if found is None else f"{found.group('name')} @ {req}"

    with open(filename) as requirements_file:
        entries = (raw_line.strip() for raw_line in requirements_file)
        return [
            fix_url_dependencies(entry)
            for entry in entries
            if entry and not entry.startswith("#")
        ]
|
||||
|
||||
|
||||
with open("README.md", "r", encoding="utf-8") as file:
|
||||
long_description = file.read()
|
||||
|
||||
# version.py defines the VERSION and VERSION_SHORT variables.
|
||||
# We use exec here so we don't import tiktok_hashtag_analysis whilst setting up.
|
||||
VERSION = {} # type: ignore
|
||||
with open("tiktok_hashtag_analysis/version.py", "r") as version_file:
|
||||
exec(version_file.read(), VERSION)
|
||||
|
||||
setup(
|
||||
name="tiktok-hashtag-analysis",
|
||||
version=__version__,
|
||||
version=VERSION["VERSION"],
|
||||
author="Bellingcat",
|
||||
author_email="tech@bellingcat.com",
|
||||
packages=["tiktok_hashtag_analysis"],
|
||||
package_data={
|
||||
"tiktok_hashtag_analysis": [
|
||||
"logging.config",
|
||||
]
|
||||
},
|
||||
description="Analyze hashtags within posts scraped from TikTok",
|
||||
long_description=long_description,
|
||||
long_description_content_type="text/markdown",
|
||||
url="https://github.com/bellingcat/tiktok-hashtag-analysis",
|
||||
license="MIT License",
|
||||
install_requires=["seaborn", "matplotlib"],
|
||||
# install_requires=read_requirements("requirements.txt"),
|
||||
# extras_require={"dev": read_requirements("dev-requirements.txt")},
|
||||
install_requires=["seaborn", "matplotlib", "TikTokApi", "requests", "yt_dlp"],
|
||||
extras_require={"test": ["pytest", "pytest-cov", "pytest-html", "pytest-metadata"]},
|
||||
classifiers=[
|
||||
'Development Status :: 5 - Production/Stable',
|
||||
'Intended Audience :: Information Technology',
|
||||
'License :: OSI Approved :: MIT License',
|
||||
'Natural Language :: English',
|
||||
'Programming Language :: Python :: 3'
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Intended Audience :: Information Technology",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Natural Language :: English",
|
||||
"Programming Language :: Python :: 3",
|
||||
],
|
||||
entry_points={
|
||||
"console_scripts": [
|
||||
"tiktok-hashtag-analysis=tiktok_hashtag_analysis.__main__:main",
|
||||
"tiktok-hashtag-analysis=tiktok_hashtag_analysis.cli:main",
|
||||
]
|
||||
},
|
||||
)
|
||||
|
||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
24
tests/auth.py
Normal file
24
tests/auth.py
Normal file
@@ -0,0 +1,24 @@
|
||||
import pytest
|
||||
|
||||
from tiktok_hashtag_analysis.auth import Authorization
|
||||
|
||||
MS_TOKEN = "thisisafakemstokenfortiktok"
|
||||
|
||||
|
||||
def test_auth_input(tmp_path, monkeypatch):
    """A token typed at the prompt ends up on the `Authorization` object."""
    # `get_token` prefers the MS_TOKEN environment variable, so a real token
    # in the developer's environment would bypass the prompt under test.
    monkeypatch.delenv("MS_TOKEN", raising=False)

    config_file = tmp_path / ".tiktok"
    # Answer the interactive prompt with the fake token.
    monkeypatch.setattr("builtins.input", lambda _: MS_TOKEN)
    auth = Authorization(config_file=config_file)
    auth.get_token()

    assert auth.ms_token == MS_TOKEN
|
||||
|
||||
|
||||
def test_auth(tmp_path, monkeypatch):
    """A token written with `dump_token` is read back by `get_token`."""
    # `get_token` prefers the MS_TOKEN environment variable; remove it so the
    # config-file path is the one actually exercised.
    monkeypatch.delenv("MS_TOKEN", raising=False)

    config_file = tmp_path / ".tiktok"
    auth = Authorization(config_file=config_file)

    auth.dump_token(ms_token=MS_TOKEN)
    auth.get_token()

    assert auth.ms_token == MS_TOKEN
|
||||
15
tests/base.py
Normal file
15
tests/base.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from tiktok_hashtag_analysis.base import TikTokDownloader, load_hashtags_from_file
|
||||
|
||||
|
||||
def test_scrape(tmp_path, hashtags):
    """Smoke-test scraping, media download, plotting and the table for one hashtag."""
    first_hashtag_only = hashtags[:1]
    TikTokDownloader(hashtags=first_hashtag_only, data_dir=tmp_path).run(
        download=True, plot=True, table=True, number=20
    )
|
||||
|
||||
|
||||
def test_load_hashtags_from_file(tmp_path, hashtags):
    """Hashtags written one per line are read back unchanged."""
    hashtag_path = tmp_path / "hashtags.txt"
    hashtag_path.write_text("\n".join(hashtags), encoding="utf-8")

    assert load_hashtags_from_file(file=hashtag_path) == hashtags
|
||||
31
tests/cli.py
Normal file
31
tests/cli.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import pytest
|
||||
|
||||
from tiktok_hashtag_analysis.cli import create_parser
|
||||
|
||||
ARGUMENTS = [
|
||||
("file", "hashtags.txt", "--file"),
|
||||
("download", True, "--download"),
|
||||
("download", True, "-d"),
|
||||
("number", 20, "--number"),
|
||||
("plot", True, "--plot"),
|
||||
("plot", True, "-p"),
|
||||
("table", True, "--table"),
|
||||
("table", True, "-t"),
|
||||
("output_dir", "/tmp/tiktok_download", "--output-dir"),
|
||||
("config", "~/.tiktok", "--config"),
|
||||
("log", "../logfile.log", "--log"),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("attribute,value,flag", ARGUMENTS)
def test_parser(hashtags, attribute, value, flag):
    """Each CLI flag sets its attribute and leaves the positional hashtags intact."""
    # Boolean flags take no value on the command line; all others do.
    flag_tokens = [flag] if isinstance(value, bool) else [flag, str(value)]

    namespace = create_parser().parse_args([*hashtags, *flag_tokens])
    parsed = vars(namespace)

    assert parsed.get(attribute) == value
    assert parsed.get("hashtags") == hashtags
|
||||
11
tests/conftest.py
Normal file
11
tests/conftest.py
Normal file
@@ -0,0 +1,11 @@
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
TEST_HASHTAGS = ["embraceeuropa", "francisparkeryockey"]
|
||||
|
||||
|
||||
@pytest.fixture(scope="package")
def hashtags():
    """Provide the shared list of hashtags used throughout the test suite."""
    return TEST_HASHTAGS
|
||||
@@ -0,0 +1 @@
|
||||
from .base import TikTokDownloader
|
||||
|
||||
@@ -1,76 +0,0 @@
|
||||
import logging, argparse
|
||||
from .file_methods import log_writer
|
||||
from .run_downloader import * # Import everything from run_downloader.py
|
||||
from .hashtag_frequencies import * # Import everything from hashtag_frequencies.py
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
||||
|
||||
def create_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the `download` and `frequencies` commands."""
    cli = argparse.ArgumentParser(
        description="Analyze hashtags within posts scraped from TikTok."
    )
    cli.add_argument(
        "command",
        choices=["download", "frequencies"],
        help="command to initialize",
    )

    # Options read by the `download` command.
    cli.add_argument(
        "-t",
        nargs="*",
        type=str,
        help="List of hashtags to scrape (module: run_downloader)",
    )
    cli.add_argument(
        "-f",
        type=str,
        help="File name containing list of hashtags to scrape (module: run_downloader)",
    )
    cli.add_argument(
        "-p",
        action="store_true",
        help="Download post data (module: run_downloader)",
    )
    cli.add_argument(
        "-v",
        action="store_true",
        help="Download video files (module: run_downloader)",
    )

    # Options read by the `frequencies` command.
    cli.add_argument(
        "-ht",
        "--hashtag",
        type=str,
        help="The hashtag of scraped posts to analyze (module: hashtag_frequencies)",
    )
    cli.add_argument(
        "-n",
        "--number",
        type=int,
        help="The number of top n occurrences (module: hashtag_frequencies)",
    )
    cli.add_argument(
        "-plt",
        "--plot",
        action="store_true",
        help="Plot the occurrences (module: hashtag_frequencies)",
    )
    cli.add_argument(
        "-d",
        "--print",
        action="store_true",
        help="List top n hashtags (module: hashtag_frequencies)",
    )
    return cli
|
||||
|
||||
|
||||
def main():
    """Parse the command line and run the `download` or `frequencies` command.

    `download` scrapes posts and/or videos for the hashtags given with `-t`
    or listed in the file given with `-f`. `frequencies` analyzes the
    previously scraped posts of the hashtag given with `-ht` and plots or
    prints the `-n` most frequent hashtags.

    Raises:
        ValueError: if no hashtags could be determined, or `-n` is missing or
            smaller than one.
        FileNotFoundError: if no scraped post file exists for the hashtag.
    """
    parser = create_parser()
    args = parser.parse_args()

    if args.command == "download":
        if not (args.t or args.f):
            parser.error(
                "No hashtags were given, please use either the `-t` flag or the `-f` flag to specify one or more hashtags."
            )

        if not (args.p or args.v):
            parser.error(
                "No argument given, please specify either the `-p` flag to download post data or the `-v` flag to download video files, or both."
            )

        if args.t:
            hashtags = args.t
        elif args.f:
            file_name = args.f
            hashtags = get_hashtag_list(file_name)

        logger.info(f"Hashtags to scrape: {hashtags}")
        if not hashtags:
            raise ValueError(
                "No hashtags were specified: please use either the `-t` flag to specify a sspace-separated list of one or more hashtags as a command-line argument, or use the `-f` flag to specify a text file of newline-separated hashtags."
            )

        download_data_type = {"posts": args.p, "videos": args.v}

        scraped_summary_list = get_data(hashtags, download_data_type)
        if scraped_summary_list:
            log_writer(scraped_summary_list)

    elif args.command == "frequencies":
        img_folder = IMAGES
        check_file(img_folder, "dir")

        # argparse stores `-n/--number` under `args.number`; reading `args.n`
        # raised AttributeError. A missing value is rejected here as well,
        # because comparing None with an int raises TypeError.
        if args.number is None or args.number < 1:
            raise ValueError(
                f"Specified argument `n` (the number of hashtags to analyze) must be greater than zero, not: {args.number}."
            )

        # Without a hashtag the data path below cannot be built.
        if not args.hashtag:
            parser.error(
                "No hashtag was given, please use the `-ht` flag to specify the hashtag to analyze."
            )

        input_file = os.path.join(
            FILES["data"], args.hashtag, FILES["posts"], FILES["data_file"]
        )
        if not check_existence(input_file, "file"):
            raise FileNotFoundError(
                f"File ({input_file}) for specified argument `hashtag` ({args.hashtag}) does not exist."
            )

        occs = get_occurrences(input_file, args.number)
        if args.plot:
            plot(occs, img_folder)
        else:
            print_occurrences(occs)
|
||||
|
||||
if __name__=="__main__":
|
||||
main()
|
||||
72
tiktok_hashtag_analysis/auth.py
Normal file
72
tiktok_hashtag_analysis/auth.py
Normal file
@@ -0,0 +1,72 @@
|
||||
import os
|
||||
import configparser
|
||||
from pathlib import Path
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class Authorization:
    """Handle authorization for TikTok, using the `msToken`.

    The token is looked up, in order, in the `MS_TOKEN` environment variable,
    in the config file, and finally by prompting the user in the terminal.
    """

    def __init__(self, config_file: Optional[str] = None):
        """Remember where the token is stored.

        Args:
            config_file: path of the config file; defaults to `~/.tiktok`.
        """
        if config_file:
            self.config_file = Path(config_file)
        else:
            self.config_file = Path.home() / ".tiktok"

        self.section = "TikTok"
        self.ms_token = None

    def _new_config(self) -> configparser.ConfigParser:
        """Create a parser that stores values verbatim.

        Interpolation is disabled because a token is an opaque string: with
        the default interpolation a token containing `%` is rejected.
        """
        return configparser.ConfigParser(interpolation=None)

    def get_token(self) -> str:
        """Load the "msToken" cookie taken from TikTok, which the scraper requires."""

        # Step 1: check if MS_TOKEN is defined as environment variable
        if ms_token := os.environ.get("MS_TOKEN"):
            self.ms_token = ms_token
            logging.info("Loaded token from environment variable")
            return self.ms_token

        # Step 2: check if MS_TOKEN is defined in config file
        if self.config_file.is_file():
            if ms_token := self.load_token():
                self.ms_token = ms_token
                logging.info(f"Loaded token from config file: {self.config_file}")
                return self.ms_token
            # The file exists but holds no token: fall through to the prompt
            # instead of silently returning None.

        # Step 3: have user enter MS_TOKEN via terminal
        ms_token = self.input_token()
        self.dump_token(ms_token=ms_token)
        self.ms_token = ms_token
        logging.info(
            f"Loaded token from user input and saved to config file: {self.config_file}"
        )

        return self.ms_token

    def load_token(self) -> Optional[str]:
        """Parse a config file and extract the token.

        Returns:
            The stored token, or None if the section or option is missing.
        """
        config = self._new_config()
        config.read(self.config_file)
        return config.get(section=self.section, option="MS_TOKEN", fallback=None)

    def dump_token(self, ms_token: str):
        """Write the token to a config file, keeping its other contents."""
        config = self._new_config()
        config.read(self.config_file)
        # The section already exists when the file was written before;
        # adding it again raises DuplicateSectionError.
        if not config.has_section(self.section):
            config.add_section(self.section)
        config.set(section=self.section, option="MS_TOKEN", value=ms_token)

        with open(self.config_file, "w", encoding="utf-8") as f:
            config.write(f)

    def input_token(self) -> str:
        """Allow user to manually enter the token in the terminal."""
        print(
            "\nPlease copy and paste your `msToken` cookie taken from your web browser when visiting the TikTok website. See [THIS VIDEO] for more information.\n"
        )

        ms_token = input("msToken: ")

        return ms_token
|
||||
267
tiktok_hashtag_analysis/base.py
Normal file
267
tiktok_hashtag_analysis/base.py
Normal file
@@ -0,0 +1,267 @@
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
from datetime import datetime
|
||||
import warnings
|
||||
import asyncio
|
||||
import logging
|
||||
import re
|
||||
from typing import List, Dict
|
||||
|
||||
import yt_dlp
|
||||
import requests
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.ticker as mtick
|
||||
import seaborn as sns
|
||||
|
||||
from TikTokApi import TikTokApi
|
||||
|
||||
from .auth import Authorization
|
||||
|
||||
warnings.filterwarnings("ignore", message="Glyph (.*) missing from current font")
|
||||
sns.set_theme(style="darkgrid")
|
||||
|
||||
|
||||
def process_hashtag_list(hashtags: List[str]) -> List[str]:
    """Convert a list of hashtags to a standard form.

    Each entry has surrounding whitespace and `#` characters removed and is
    lower-cased; entries that end up empty are dropped.

    Args:
        hashtags: raw hashtags, e.g. from the command line or a file.

    Returns:
        The cleaned hashtags, in their original order.
    """
    # Whitespace is stripped again after removing `#` so that an entry such
    # as "# london" does not keep a leading space.
    cleaned = (hashtag.strip().strip("#").strip().lower() for hashtag in hashtags)
    return [hashtag for hashtag in cleaned if hashtag]
|
||||
|
||||
|
||||
def load_hashtags_from_file(file: str) -> List[str]:
    """Load hashtags from a text file and normalize them.

    The file content is split on newlines and commas, and the pieces are
    passed through `process_hashtag_list`.

    Raises:
        OSError: if `file` is not an existing regular file.
    """
    if os.path.isfile(file):
        with open(file, "r", encoding="utf-8") as source:
            contents = source.read()
        return process_hashtag_list(hashtags=re.split(r"\n|,", contents))
    raise OSError(f"{file} does not exist")
|
||||
|
||||
|
||||
async def _fetch_hashtag_data(hashtag: str, ms_token: str) -> List[Dict]:
    """Asynchronously collect the data of videos that use `hashtag`.

    A single TikTokApi session is created with `ms_token`, and up to 1000
    videos are requested; each video is stored as its `as_dict` value.
    """
    posts: List[Dict] = []
    async with TikTokApi() as api:
        await api.create_sessions(num_sessions=1, ms_tokens=[ms_token], sleep_after=3)
        video_stream = api.hashtag(name=hashtag).videos(count=1000)
        async for video in video_stream:
            posts.append(video.as_dict)
    return posts
|
||||
|
||||
|
||||
def json_load(file_path: Path) -> List:
    """Return the data parsed from the UTF-8 encoded JSON file `file_path`."""
    with open(file_path, encoding="utf-8") as source:
        return json.loads(source.read())
|
||||
|
||||
|
||||
def json_dump(file_path: Path, data: List):
    """Write data to a JSON file.

    Args:
        file_path: destination file; it is overwritten if it exists.
        data: JSON-serializable data to store.

    Raises:
        TypeError: if `data` is not JSON-serializable; the destination file
            is left untouched in that case.
    """
    # Serialize before opening the file: opening in "w" mode truncates it, so
    # a serialization error would otherwise destroy previously saved data.
    serialized = json.dumps(data)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(serialized)
|
||||
|
||||
|
||||
def _fetch_media(url: str, timeout: float = 30.0):
    """Download `url`; return the response, or None (with a warning) on failure."""
    try:
        # Without a timeout a stalled server would block the run forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as error:
        logging.warning(f"Could not download {url}: {error}")
        return None
    return response


def download_gallery(video_data: Dict, video_dir: Path):
    """Download the audio track and images of an image-gallery post.

    yt-dlp doesn't seem to support downloading images from an image gallery,
    so this is a quick fix that likely will fail on edge cases.

    The audio is saved as `<id>.mp3` and each image as `<id>_<index>.<ext>`,
    where the extension is taken from the response's `Content-Type` header.
    Files whose download fails are skipped with a warning, so an HTTP error
    page is never stored as media.

    Args:
        video_data: raw post data; must contain `id` and `imagePost`.
        video_dir: existing directory to write the files to.
    """
    video_id = video_data["id"]

    play_url = (video_data.get("music") or {}).get("playUrl")
    if play_url and (audio := _fetch_media(play_url)) is not None:
        with open(video_dir / f"{video_id}.mp3", "wb") as f:
            f.write(audio.content)

    for i, image in enumerate(video_data["imagePost"]["images"]):
        image_url = image["imageURL"]["urlList"][0]
        picture = _fetch_media(image_url)
        if picture is None:
            continue
        # Drop parameters such as "; charset=..." and fall back to "jpg" when
        # the server sends no usable Content-Type.
        content_type = picture.headers.get("Content-Type", "")
        ext = content_type.split(";")[0].strip().split("/")[-1] or "jpg"
        with open(video_dir / f"{video_id}_{i:02d}.{ext}", "wb") as f:
            f.write(picture.content)
|
||||
|
||||
|
||||
def aggregate_cooccurring_hashtags(hashtag_file: Path) -> Counter:
    """Count in how many posts each hashtag appears.

    `hashtag_file` holds a JSON list of raw TikTok post API responses. The
    hashtags of a post are the non-empty `hashtagName` values of its
    `textExtra` entries; each hashtag is counted at most once per post.
    """
    tally: Counter = Counter()
    for post in json_load(file_path=hashtag_file):
        # A set, so that a hashtag repeated within one post counts once.
        names_in_post = {
            extra["hashtagName"]
            for extra in post.get("textExtra", [])
            if extra.get("hashtagName")
        }
        tally.update(names_in_post)
    return tally
|
||||
|
||||
|
||||
class TikTokDownloader:
|
||||
"""Main class for scraping data from TikTok."""
|
||||
|
||||
def __init__(self, hashtags: List[str], data_dir: str, config_file: str = None):
    """Normalize the hashtags, create the data directory and load the token.

    Args:
        hashtags: hashtags to scrape; normalized by `process_hashtag_list`.
        data_dir: directory for scraped data; created if it does not exist.
        config_file: optional config file passed on to `Authorization`.
    """
    self.hashtags = process_hashtag_list(hashtags)
    logging.info(f"Hashtags to scrape: {hashtags}")

    output_root = Path(data_dir)
    self.data_dir = output_root
    os.makedirs(output_root, exist_ok=True)

    credentials = Authorization(config_file=config_file)
    self.auth = credentials
    self.ms_token = credentials.get_token()
|
||||
|
||||
def get_hashtag_posts(self, hashtag: str):
    """Scrape posts that use `hashtag` and merge them into `posts.json`.

    Posts whose `id` is already present in `<data_dir>/<hashtag>/posts.json`
    are not added again; the merged list is written back to that file.
    """
    # Location of the stored posts; make sure its directory exists
    posts_path = self.data_dir / hashtag / "posts.json"
    posts_path.parent.mkdir(parents=True, exist_ok=True)

    # Load the posts stored by earlier runs, if there are any
    stored_posts = json_load(file_path=posts_path) if posts_path.is_file() else []
    seen_ids = {post["id"] for post in stored_posts}

    # Scrape the posts that currently use the hashtag
    scraped_posts = asyncio.run(
        _fetch_hashtag_data(hashtag=hashtag, ms_token=self.ms_token)
    )
    if not scraped_posts:
        logging.warning(f"No posts were found for the hashtag: {hashtag}")

    # Keep only the posts that were not stored before
    unseen_posts = [post for post in scraped_posts if post["id"] not in seen_ids]
    if not unseen_posts:
        logging.warning(f"No new posts were found for the hashtag: {hashtag}")

    # Store old and new posts together
    json_dump(file_path=posts_path, data=stored_posts + unseen_posts)
    logging.info(
        f"Scraped {len(unseen_posts)} new posts containing the hashtag "
        f"'{hashtag}', with {len(stored_posts)} posts previously scraped"
    )
|
||||
|
||||
def get_hashtag_videos(self, hashtag: str):
    """Download videos and other media corresponding to posts that used a
    specified hashtag.

    Posts are read from `<data_dir>/<hashtag>/posts.json` and media is saved
    to `<data_dir>/<hashtag>/media`. Posts that already have a file in that
    directory are skipped. Video posts are downloaded with yt-dlp and
    image-gallery posts with `download_gallery`.
    """

    # Define file containing post data and directory to save videos to
    hashtag_file = self.data_dir / hashtag / "posts.json"
    video_dir = self.data_dir / hashtag / "media"
    video_dir.mkdir(exist_ok=True, parents=True)

    # Get list of post IDs that have previously had their media downloaded.
    # Files are named "<id>.<ext>" or "<id>_<index>.<ext>". yt-dlp's
    # ".part"/".ytdl" leftovers mark unfinished downloads and are ignored, so
    # an interrupted download is retried.
    already_downloaded_ids = set(
        file.split(".")[0].split("_")[0]
        for file in os.listdir(video_dir)
        if not file.endswith((".part", ".ytdl"))
    )

    # Get list of posts that have been scraped but not had their media downloaded
    video_list = json_load(file_path=hashtag_file)
    new_video_list = [
        video for video in video_list if video["id"] not in already_downloaded_ids
    ]
    if len(new_video_list) == 0:
        logging.warning(
            f"No new videos to be downloaded for the hashtag: {hashtag}"
        )

    # Populate list of URLs to download using yt-dlp, and list of image
    # galleries to download using the `download_gallery` function
    urls_to_download = []
    galleries_to_download = []
    for video in new_video_list:
        if video.get("imagePost") is None:
            url = f"https://www.tiktok.com/@{video['author']['uniqueId']}/video/{video['id']}"
            urls_to_download.append(url)
        else:
            galleries_to_download.append(video)

    # Download audio and image files for all image gallery posts
    if len(galleries_to_download) > 0:
        logging.info(f"Downloading image galleries for hashtag {hashtag}")
        for video in galleries_to_download:
            download_gallery(video_data=video, video_dir=video_dir)

    # Download video files for all video posts
    if len(urls_to_download) > 0:
        logging.info(f"Downloading media for hashtag {hashtag}")
        ydl_opts = {
            "outtmpl": os.path.join(video_dir, "%(id)s.%(ext)s"),
            # The yt-dlp option is spelled `ignoreerrors`; `ignore_errors`
            # is not a recognized option, so one failed video aborted the
            # whole batch.
            "ignoreerrors": True,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download(urls_to_download)
|
||||
|
||||
def frequency_table(self, hashtag: str, number: int):
    """Print `number`-most commonly co-occurring hashtags for a specified
    source hashtag, in tabular form.

    The counts come from `<data_dir>/<hashtag>/posts.json`. If the stored
    posts contain no hashtags, a warning is logged and nothing is printed.
    """

    # Load video data file and extract co-occurring hashtag frequency information
    hashtag_file = self.data_dir / hashtag / "posts.json"
    frequencies = aggregate_cooccurring_hashtags(hashtag_file=hashtag_file)

    # `max()` below raises ValueError on an empty Counter
    if not frequencies:
        logging.warning(
            f"No hashtags found in scraped posts for the hashtag: {hashtag}"
        )
        return

    # The largest count is used as the total number of posts
    total_posts = max(frequencies.values())

    # Print table that displays most commonly co-occurring hashtags
    print(f"\nCo-occurring hashtags for #{hashtag} posts")
    print(f"{'Rank':<8} {'Hashtag':<30} {'Occurrences':<15} {'Frequency':<15}")
    # `name` is used so the loop does not overwrite the `hashtag` parameter
    for row, (name, frequency) in enumerate(frequencies.most_common(number)):
        ratio = frequency / total_posts
        print(f"{row:<8} {name:<30} {frequency:<15} {ratio:.4f}")
    print(f"Total posts: {total_posts}\n\n")
|
||||
|
||||
def plot(self, hashtag: str, number: int):
    """Create plot of `number`-most commonly co-occurring hashtags for a
    specified source hashtag.

    The bar chart is written as a timestamped PNG to
    `<data_dir>/<hashtag>/plots/`.

    Args:
        hashtag: Source hashtag whose scraped posts are analyzed.
        number: Number of most common hashtags to take into account; the
            first (most frequent) one is not drawn as a bar.

    Returns:
        None. When fewer than two hashtags are available there is nothing
        to draw; a warning is logged and no file is written.
    """

    # Load video data file and extract co-occurring hashtag frequency information
    hashtag_file = self.data_dir / hashtag / "posts.json"
    frequencies = aggregate_cooccurring_hashtags(hashtag_file=hashtag_file)

    # The first entry is skipped below (presumably the source hashtag
    # itself -- confirm), so at least two entries are needed to draw a bar;
    # `max()`/`min()` would otherwise raise on empty sequences
    sorted_frequencies = frequencies.most_common(number)
    if len(sorted_frequencies) < 2:
        logging.warning(f"Not enough hashtags to plot for #{hashtag}, skipping plot")
        return

    # Define labels and other fields used in plot
    total_posts = max(frequencies.values())
    cooccurring = sorted_frequencies[1:]
    labels = [label for label, _ in cooccurring]
    ratios = [freq / total_posts * 100 for _, freq in cooccurring]
    y_pos = list(reversed(range(len(cooccurring))))

    # Visualize data in bar chart
    fig, ax = plt.subplots(figsize=(5, 6.66))
    try:
        ax.barh(y_pos, ratios)
        ax.set_yticks(y_pos)
        ax.set_yticklabels(labels)
        ax.grid(axis="y")
        ax.set_xlabel("Percent of posts with co-occurring hashtag")
        ax.set_ylim(min(y_pos) - 1, max(y_pos) + 1)
        ax.set_title(f"Co-occurring hashtags for #{hashtag} posts")
        ax.xaxis.set_major_formatter(mtick.PercentFormatter(decimals=0))

        # Write image of plot to file
        current_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        plot_file = self.data_dir / hashtag / "plots" / f"{hashtag}__{current_time}.png"
        plot_file.parent.mkdir(exist_ok=True, parents=True)
        fig.savefig(plot_file, bbox_inches="tight", facecolor="white", dpi=300)
    finally:
        # pyplot keeps every figure alive until it is closed; release it so
        # figures do not pile up when many hashtags are plotted in one run
        plt.close(fig)
    logging.info(f"Plot saved to file: {plot_file}")
|
||||
|
||||
def run(self, download: bool, plot: bool, table: bool, number: int):
    """Run the requested operations for every hashtag in `self.hashtags`.

    Args:
        download: Download media for each hashtag after scraping.
        plot: Save a co-occurrence plot for each hashtag.
        table: Print a co-occurrence table for each hashtag.
        number: Number of co-occurring hashtags passed to plot/table.
    """

    # First pass: scrape posts, then run whichever analyses were requested
    # via the `--plot` / `--table` flags
    for tag in self.hashtags:
        self.get_hashtag_posts(hashtag=tag)
        if plot:
            self.plot(hashtag=tag, number=number)
        if table:
            self.frequency_table(hashtag=tag, number=number)

    # Second pass only happens when the `--download` flag was used
    if not download:
        return
    for tag in self.hashtags:
        self.get_hashtag_videos(hashtag=tag)
|
||||
102
tiktok_hashtag_analysis/cli.py
Normal file
102
tiktok_hashtag_analysis/cli.py
Normal file
@@ -0,0 +1,102 @@
|
||||
import logging
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
from .base import TikTokDownloader, load_hashtags_from_file
|
||||
|
||||
|
||||
def create_parser():
    """Create parser to parse input command-line arguments."""

    # Default output location: a `data` directory next to the parent of the
    # current working directory
    default_output_dir = Path(".").resolve().parent / "data"

    cli = argparse.ArgumentParser(
        description="Analyze hashtags within posts scraped from TikTok."
    )

    # What to scrape: hashtags given directly and/or listed in a file
    cli.add_argument("hashtags", nargs="*", type=str, help="List of hashtags to scrape")
    cli.add_argument(
        "--file", type=str, help="File name containing list of hashtags to scrape"
    )

    # What to do with the scraped posts
    cli.add_argument(
        "-d",
        "--download",
        help="Download video files corresponding to scraped posts",
        action="store_true",
    )
    cli.add_argument(
        "--number",
        default=20,
        type=int,
        help="The number of co-occurring hashtags to analyze",
    )
    cli.add_argument(
        "-p",
        "--plot",
        action="store_true",
        help="Plot the most common co-occurring hashtags",
    )
    cli.add_argument(
        "-t",
        "--table",
        action="store_true",
        help="Print a table of the most common co-occurring hashtags",
    )

    # Where results, credentials and logs go
    cli.add_argument(
        "--output-dir",
        default=default_output_dir,
        type=str,
        help="Directory to save scraped data and visualizations to",
    )
    cli.add_argument(
        "--config",
        default=None,
        type=str,
        help="File name of configuration file to store TikTok credentials to",
    )
    cli.add_argument("--log", default=None, type=str, help="File to write logs to")

    return cli
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: parse arguments, scrape the requested
    hashtags, and run the requested analyses."""

    parser = create_parser()
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        filename=args.log,
        format="%(asctime)s %(levelname)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # Hashtags given on the command line win; otherwise fall back to `--file`
    if args.hashtags:
        hashtags = args.hashtags
    elif args.file:
        hashtags = load_hashtags_from_file(file=args.file)
    else:
        # `parser.error` prints the message and exits the program
        parser.error(
            "No hashtags were specified, please specify one or more hashtags "
            "to scrape or use the `--file` flag to specify a text file containing "
            "hashtags."
        )

    downloader = TikTokDownloader(
        hashtags=hashtags, data_dir=args.output_dir, config_file=args.config
    )
    downloader.run(
        download=args.download, plot=args.plot, table=args.table, number=args.number
    )


if __name__ == "__main__":
    main()
|
||||
@@ -1,161 +0,0 @@
|
||||
"""Utility functions that perform data processing related tasks.
|
||||
"""
|
||||
|
||||
from typing import NamedTuple, List, Tuple, Set, Optional, Dict, Any
|
||||
import logging
|
||||
|
||||
from . import file_methods
|
||||
|
||||
# Shared by every function in this module; `getLogger()` without a name
# returns the root logger
logger = logging.getLogger()
|
||||
|
||||
|
||||
class Diff(NamedTuple):
    """Keep track of scraped post IDs and whether previously-scraped posts have been filtered."""

    # IDs reported as new by `get_difference`
    ids: Set[str]
    # Read by `extract_posts` to decide whether the post list must be
    # narrowed down to `ids`
    filter_posts: bool
|
||||
|
||||
|
||||
class Total(NamedTuple):
    """Keep track of number of total and number of unique scraped posts."""

    # Number of stored IDs, duplicates included
    total: int
    # Number of distinct stored IDs
    unique: int
|
||||
|
||||
|
||||
def get_difference(tag: str, file_name: str, ids: List[str]) -> Optional[Diff]:
    """Find TikTok post IDs that haven't previously been scraped.

    Filter out the new posts for the hashtag `tag` by comparing the list of
    post IDs contained in `file_name` to the list of newly downloaded IDs
    contained in `ids`.

    Args:
        tag: Hashtag whose IDs are compared.
        file_name: JSON file mapping hashtags to lists of already-scraped IDs.
        ids: IDs that were just downloaded.

    Returns:
        `None` when `tag` is already recorded and none of `ids` is new.
        Otherwise a `Diff` holding the new IDs; its `filter_posts` flag is
        True when the caller has to narrow its post list down to those IDs.
    """
    current_id_data = file_methods.get_data(file_name)
    scraped_ids = set(ids)

    # First time this hashtag is seen: every downloaded ID is new
    if tag not in current_id_data:
        return Diff(scraped_ids, True)

    new_ids = scraped_ids.difference(current_id_data[tag])
    if not new_ids:
        return None

    # Some downloaded IDs were already recorded, so the matching posts have
    # to be filtered out by the caller. The previous code set this flag to
    # False in both branches, which let already-scraped posts be stored again.
    some_already_scraped = len(new_ids) != len(scraped_ids)
    return Diff(new_ids, some_already_scraped)
|
||||
|
||||
|
||||
def extract_posts(
    settings: Dict[Any, Any], file_name: str, tag: str
) -> Optional[Tuple[List[str], List[Dict]]]:
    """Find TikTok posts that haven't previously been scraped.

    Compares the file downloaded by tiktok-scraper to the list of
    previously-scraped posts (from the file ids/post_ids.json).

    Args:
        settings: Download settings; `settings["post_ids"]` is the path of
            the ID file.
        file_name: JSON file of posts that was just downloaded.
        tag: Hashtag the posts were downloaded for.

    Returns:
        A `(ids, posts)` tuple with the new IDs and the posts to store, or
        `None` when the download is empty or contains nothing new.
    """
    posts = file_methods.get_data(file_name)
    ids = [post["id"] for post in posts]

    if not ids:
        # `Logger.warn` is a deprecated alias; `warning` is the supported name
        logger.warning(f"No posts were found for the hashtag: {tag}")
        return None

    # Without an ID file nothing has been scraped before, so everything is new
    if not file_methods.check_existence(settings["post_ids"], "file"):
        return (ids, posts)

    new_ids = get_difference(tag, settings["post_ids"], ids)
    if not new_ids:
        logger.warning(f"No new posts were found for the hashtag: {tag}")
        return None

    if new_ids.filter_posts:
        new_posts = [post for post in posts if post["id"] in new_ids.ids]
        return (list(new_ids.ids), new_posts)
    return (list(new_ids.ids), posts)
|
||||
|
||||
|
||||
def extract_videos(settings: dict, tag: str, download_list: List[str]) -> List[str]:
    """Find TikTok videos that haven't previously been scraped.

    Compares the file downloaded by tiktok-scraper to the list of
    previously-scraped videos (from the file ids/video_ids.json).

    Args:
        settings: Download settings; `settings["video_ids"]` is the path of
            the ID file.
        tag: Hashtag the videos were downloaded for.
        download_list: IDs of the videos that were just downloaded.

    Returns:
        The IDs that are not recorded yet (possibly an empty list).
    """
    # Without an ID file nothing has been scraped before, so everything is new
    if not file_methods.check_existence(settings["video_ids"], "file"):
        return download_list

    new_videos = get_difference(tag, settings["video_ids"], download_list)
    if not new_videos:
        # `Logger.warn` is a deprecated alias; `warning` is the supported name
        logger.warning(
            f"No new videos were found for the {tag} in the downloaded folder."
        )
        return []
    return list(new_videos.ids)
|
||||
|
||||
|
||||
def update_posts(
    file_path: str, file_type: str, new_data: List[Any], tag: Optional[str] = None
) -> Optional[Tuple[str, int]]:
    """Append freshly scraped data to `file_path`.

    With a `tag`, `new_data` is treated as a list of IDs and stored under
    that hashtag (e.g. in `ids/post_ids.json`); the `(tag, count)` summary
    from the ID writer is returned. Without a `tag`, `new_data` is treated
    as a list of posts, appended to the post file, and `None` is returned.
    """
    already_there = file_methods.check_existence(file_path, file_type)
    if tag:
        return file_methods.id_writer(file_path, new_data, tag, already_there)
    file_methods.post_writer(file_path, new_data, already_there)
    return None
|
||||
|
||||
|
||||
def update_videos(
    settings: Dict[str, Any], new_data: List[str], tag: str
) -> Tuple[str, int]:
    """Record newly scraped video IDs and tidy up the download folder.

    The IDs in `new_data` are added under `tag` in the video ID file
    (`ids/video_ids.json`), which is created first if missing. Afterwards
    the new video files are moved into place and the scraper's residual
    folder is removed. Returns the `(tag, count)` summary of the ID writer.
    """
    ids_file = settings["video_ids"]
    # Make sure the ID file exists before it is updated
    file_methods.check_file(ids_file, "file")
    summary = file_methods.id_writer(ids_file, new_data, tag, True)
    file_methods.clean_video_files(settings, tag, new_data)
    return summary
|
||||
|
||||
|
||||
def get_total_posts(file_path: str, tag: str) -> Total:
    """Return how many IDs are stored for `tag` and how many of them are distinct.

    Raises:
        OSError: If `file_path` is not an existing file.
    """
    if not file_methods.check_existence(file_path, "file"):
        raise OSError(f"{file_path} not found!")

    tag_ids = file_methods.get_data(file_path)[tag]
    return Total(len(tag_ids), len(set(tag_ids)))
|
||||
|
||||
|
||||
def print_total(file_path: str, tag: str, data_type: str):
    """Log the number of scraped entries for `tag`, warning about duplicates.

    Args:
        file_path: ID file to count entries in.
        tag: Hashtag whose entries are counted.
        data_type: Label used in the message (callers pass "posts" or "videos").
    """
    total = get_total_posts(file_path, tag)
    if total.total == total.unique:
        logger.info(f"Scraped {total.total} {data_type} containing the hashtag '{tag}'")
    else:
        # `Logger.warn` is a deprecated alias; `warning` is the supported name
        logger.warning(
            f"Out of total {data_type} for the hashtag {tag} {total.total}, only {total.unique} are unique. Something is going wrong..."
        )
|
||||
@@ -1,216 +0,0 @@
|
||||
"""Utility functions that operate on files, such as writing to or reading from a file.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import subprocess
|
||||
from os import path
|
||||
from datetime import datetime
|
||||
import shutil
|
||||
from typing import Tuple, List, Optional, Dict, Any
|
||||
|
||||
import logging, logging.config
|
||||
|
||||
# Configure logging at import time from the `logging.config` file that sits
# next to this module
logging.config.fileConfig(path.join(path.dirname(path.abspath(__file__)), 'logging.config'))
# Logger named "Logger"; presumably matches the `qualname` declared in that
# config file -- verify if the config changes
logger = logging.getLogger("Logger")
|
||||
|
||||
|
||||
def create_file(name: str, file_type: str):
    """Create an empty file (`file_type == "file"`) or a directory tree
    (`file_type == "dir"`) at `name`.

    Raises:
        ValueError: If `file_type` is neither "dir" nor "file".
    """
    if file_type not in ("dir", "file"):
        raise ValueError(f"{file_type} has to be either 'dir' or 'file'")

    if file_type == "dir":
        os.makedirs(name, mode=0o777)
        return

    # Opening for writing is enough to create (or truncate) the file
    with open(name, "w"):
        pass
|
||||
|
||||
|
||||
def check_existence(file_path: str, file_type: str):
    """Tell whether `file_path` exists as the given kind of entry.

    Returns True/False for `file_type` "file" or "dir".

    Raises:
        ValueError: If `file_type` is neither "dir" nor "file".
    """
    if file_type == "dir":
        return os.path.isdir(file_path)
    if file_type == "file":
        return os.path.isfile(file_path)
    raise ValueError(f"{file_type} has to be either 'dir' or 'file'")
|
||||
|
||||
|
||||
def check_file(file_path: str, file_type: str):
    """Ensure `file_path` exists, creating the file or directory when missing."""
    if check_existence(file_path, file_type):
        return
    create_file(file_path, file_type)
|
||||
|
||||
|
||||
def download_posts(settings: Dict, tag: str, output_dir: Any):
    """Run the tiktok-scraper command to download posts for a given hashtag.

    Args:
        settings: Download settings; `settings["data"]` and
            `settings["posts"]` locate the hashtag's post folder.
        tag: Hashtag to scrape.
        output_dir: Directory tiktok-scraper writes its JSON file to.

    Returns:
        The path of the downloaded file, taken from the last word of the
        scraper's output. If that word does not look like a JSON file, a
        warning is logged and `None` is returned so the caller can move on.

    Raises:
        subprocess.CalledProcessError: If tiktok-scraper exits with a
            non-zero status.
    """
    posts_dir = os.path.join(settings["data"], tag, settings["posts"])
    os.makedirs(posts_dir, exist_ok=True)

    # Pass the arguments as a list and skip the shell: hashtags come from user
    # input, and interpolating them into a shell string allows command
    # injection. `shutil.which` resolves the launcher script by its full path.
    executable = shutil.which("tiktok-scraper") or "tiktok-scraper"
    tiktok_command = [
        executable,
        "hashtag",
        tag,
        "-t",
        "json",
        "--filepath",
        str(output_dir),
    ]
    output = subprocess.check_output(tiktok_command, encoding="utf-8")

    # Empty output would make `[-1]` raise IndexError; treat it as "no file"
    words = output.split()
    new_file = words[-1] if words else ""
    if "json" in new_file:
        return new_file

    # `Logger.warn` is a deprecated alias; `warning` is the supported name
    logger.warning(
        f"Something's wrong with what is returned by tiktok-scraper for the hashtag {tag} - *{new_file}* is not a json file.\n\ntiktok-scraper returned {output}"
    )
    return None
|
||||
|
||||
|
||||
def download_videos(settings: Dict, tag: str):
    """Run the tiktok-scraper command to download videos for a given hashtag.

    Note that all the videos are downloaded that are returned by the TikTok API,
    making this a time- and data-intensive process.

    Args:
        settings: Download settings; `settings["data"]` and
            `settings["videos"]` locate the hashtag's video folder, and
            `settings["videos_delete"]` is the scraper's residual folder.
        tag: Hashtag to scrape.

    Returns:
        The list of downloaded video IDs (file names without extension) if
        the download folder contains at least one file. Otherwise a warning
        is logged, the residual folder is removed and `None` is returned.

    Raises:
        subprocess.CalledProcessError: If tiktok-scraper exits with a
            non-zero status.
    """
    videos_dir = os.path.join(settings["data"], tag, settings["videos"])
    os.makedirs(videos_dir, exist_ok=True)

    # Pass the arguments as a list and skip the shell: hashtags come from user
    # input, and interpolating them into a shell string allows command
    # injection. `shutil.which` resolves the launcher script by its full path.
    executable = shutil.which("tiktok-scraper") or "tiktok-scraper"
    subprocess.check_output(
        [executable, "hashtag", tag, "-d", "--filepath", videos_dir]
    )

    # The downloaded files are listed from a `#<tag>` sub-folder
    downloaded_files = os.listdir(os.path.join(videos_dir, f"#{tag}"))
    if downloaded_files:
        # Keep only the part before the first dot, i.e. the video ID
        return [name.split(".")[0] for name in downloaded_files]

    # `Logger.warn` is a deprecated alias; `warning` is the supported name
    logger.warning(f"No video files were downloaded for the hashtag {tag}.")
    shutil.rmtree(settings["videos_delete"])
    return None
|
||||
|
||||
|
||||
def get_data(file_path: str) -> Any:
    """Parse the UTF-8 encoded JSON file at `file_path` and return its content."""
    with open(file_path, "r", encoding="utf-8") as handle:
        return json.load(handle)
|
||||
|
||||
|
||||
def dump_data(file_path: str, data: Any):
    """Serialize `data` as JSON into `file_path` (UTF-8), replacing any previous content."""
    with open(file_path, "w", encoding="utf-8") as handle:
        json.dump(data, handle)
|
||||
|
||||
|
||||
def log_writer(log_data: List[Tuple[str, Tuple[str, int]]]):
    """Summarize the number of downloads (posts and videos) per hashtag.

    The summary is keyed by the current timestamp:

        {
            timestamp: {
                hashtag: {
                    videos: number_of_new_videos,
                    posts: number_of_new_posts
                }
            }
        }

    The dictionary is emitted through the module logger at DEBUG level and
    the grand total at INFO level.
    """
    summary = {}  # type: Dict[str, Dict[str, int]]
    total = 0
    for hashtag, (data_type, count) in log_data:
        # Accumulate per hashtag and per data type
        per_type = summary.setdefault(hashtag, {})
        per_type[data_type] = per_type.get(data_type, 0) + count
        total += count

    timestamp = datetime.now().strftime("%d-%m-%Y %H:%M:%S")
    data = {timestamp: summary}

    logger.debug(f"Logged post data: {data}")
    logger.info(f"Successfully scraped {total} total entries")
|
||||
|
||||
|
||||
def id_writer(
    file_path: str, new_data: List[str], tag: str, status: bool
) -> Tuple[str, int]:
    """Store `new_data` under `tag` in the ID file (post_ids or video_ids).

    When `status` is True the file is read first and the IDs are appended
    to whatever is already stored; if its content cannot be decoded as
    JSON, or `status` is False, the file is rewritten with only this tag.
    Returns `(tag, number_of_ids_added)`.
    """
    content = {tag: new_data}
    if status:
        try:
            stored = get_data(file_path)
        except json.decoder.JSONDecodeError:
            # Unreadable content (e.g. an empty file): start from scratch
            pass
        else:
            if tag in stored:
                stored[tag] += new_data
            else:
                stored[tag] = new_data
            content = stored
    dump_data(file_path, content)

    added = len(new_data)
    logger.debug(f"SUCCESS - {added} entries added to {file_path}")
    return (tag, added)
|
||||
|
||||
|
||||
def post_writer(file_path: str, new_data: List[Dict], status: bool):
    """Store `new_data` in the post file of a hashtag
    (`/data/{hashtag}/posts/data.json`).

    When `status` is True the file is read first and the posts are appended
    to its content; if that content cannot be decoded as JSON, or `status`
    is False, the file is rewritten with only `new_data`.
    """
    content = new_data
    if status:
        try:
            stored = get_data(file_path)
        except json.decoder.JSONDecodeError:
            # Unreadable content (e.g. an empty file): start from scratch
            pass
        else:
            stored += new_data
            content = stored
    dump_data(file_path, content)

    added = len(new_data)
    logger.debug(f"SUCCESS - {added} entries added to {file_path}")
|
||||
|
||||
|
||||
def delete_file(file_path: str, file_type: str):
    """Delete a directory or file.

    Args:
        file_path: Path to delete.
        file_type: "file" or "dir". Directories are removed with
            `os.rmdir`, so they have to be empty.

    Raises:
        OSError: If `file_path` does not exist as the given type.
        ValueError: If `file_type` is invalid (raised by `check_existence`).
    """
    if not check_existence(file_path, file_type):
        raise OSError(f"Attempt to delete file failed: {file_path} does not exist")
    elif file_type == "file":
        os.remove(file_path)
        logger.debug(f"Successfully deleted {file_path}")
    elif file_type == "dir":
        os.rmdir(file_path)
        logger.debug(f"Successfully deleted {file_path}")
    else:
        # Defensive branch: `check_existence` already rejects other values.
        # The message was missing its `f` prefix and printed the literal
        # text "{file_type}".
        raise OSError(f"{file_type} needs to be either 'file' or 'dir'")
|
||||
|
||||
|
||||
def clean_video_files(settings: dict, tag: str, new_data: Optional[List[str]] = None):
    """Move new videos out of the tiktok-scraper folder, then delete that folder.

    Each ID in `new_data` is moved from `<data>/<tag>/videos/#<tag>/<id>.mp4`
    to `settings["videos_to"]`. The folder `settings["videos_delete"]` is
    removed afterwards, whether or not there was anything to move.
    """
    for video_id in new_data or []:
        # The source path is also stored in `settings`, as the original code did
        settings["videos_from"] = (
            settings["data"] + f"/{tag}/videos/#{tag}/{video_id}.mp4"
        )
        shutil.move(settings["videos_from"], settings["videos_to"])

    shutil.rmtree(settings["videos_delete"])
    logger.debug(
        f"Successfully deleted the folder {settings['videos_delete']} folder of videos."
    )
|
||||
@@ -1,32 +0,0 @@
|
||||
"""Specify global constants including file paths and scraping options.
|
||||
"""
|
||||
|
||||
|
||||
# Directories
# NOTE: DATA is a relative path, so it is resolved against the process's
# current working directory at runtime
DATA = "../data"
IDS = "ids"
POSTS = "posts"
VIDEOS = "videos"
IMAGES = f"{DATA}/img"

# Files
POST_IDS = "post_ids.json"
VIDEO_IDS = "video_ids.json"
DATA_FILE = "data.json"

# Paths and names looked up by key; the two ID files live in `DATA/IDS`
FILES = {
    "data": DATA,
    "ids": IDS,
    "posts": POSTS,
    "videos": VIDEOS,
    "images": IMAGES,
    "post_ids": f"{DATA}/{IDS}/{POST_IDS}",
    "video_ids": f"{DATA}/{IDS}/{VIDEO_IDS}",
    "data_file": f"{DATA_FILE}",
    "downloads": [],
}

# Scraping options; "sleep" is passed to `time.sleep` between hashtags,
# so it is a number of seconds
PARAMETERS = {
    "scraper_attempts": 3,
    "sleep": 8,
}
|
||||
@@ -1,99 +0,0 @@
|
||||
"""Analyze the frequency of hashtags appearing in the set of given posts.
|
||||
|
||||
- The "hashtag" positional argument specifies the hashtag of scraped posts to analyze
|
||||
- The "n" positional argument specifies how many hashtags the user wants to analyze
|
||||
- Specifying the "-d" flag prints the hashtag frequencies on the shell
|
||||
- Specifying the "-p" flag plots the hashtag frequencies and saves as a png file
|
||||
"""
|
||||
import json
|
||||
from datetime import datetime
|
||||
import warnings
|
||||
import logging
|
||||
from typing import List, Tuple, Dict, Any
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.ticker as mtick
|
||||
import seaborn as sns
|
||||
|
||||
# Silence "Glyph ... missing from current font" warnings (presumably raised
# while drawing hashtags with characters the plot font lacks -- confirm)
warnings.filterwarnings("ignore", message="Glyph (.*) missing from current font")
# Apply seaborn's "darkgrid" theme to every plot created by this module
sns.set_theme(style="darkgrid")
|
||||
|
||||
|
||||
def get_hashtags(obj: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
    """Count in how many posts each hashtag appears.

    Args:
        obj: Scraped posts; each post has a "hashtags" list whose items
            carry the hashtag text under "name".

    Returns:
        `(hashtag, number_of_posts)` pairs sorted by descending count. A
        hashtag repeated within one post is counted once for that post.

    Raises:
        ValueError: If `obj` is empty.
    """
    if not obj:
        raise ValueError("Empty item, no hashtags could be extracted.")

    counts: Dict[str, int] = {}
    for post in obj:
        # A set removes duplicates inside a single post
        for name in {tag["name"] for tag in post["hashtags"]}:
            counts[name] = counts.get(name, 0) + 1

    return sorted(counts.items(), key=lambda item: item[1], reverse=True)
|
||||
|
||||
|
||||
def get_occurrences(filename: str, n: int = 1) -> Dict[str, Any]:
    """Aggregate hashtag frequency information for a specified JSON file.

    Returns a dictionary of the form:

        {
            "total": number of posts in the file,
            "top_n": [[top hashtags], [counts of the corresponding hashtags]]
        }

    At most `n` hashtags are kept, and never more than there are posts.
    """
    with open(filename) as f:
        posts = json.load(f)

    post_count = len(posts)
    top = get_hashtags(posts)[: min(post_count, n)]
    return {
        "total": post_count,
        "top_n": [[name for name, _ in top], [count for _, count in top]],
    }
|
||||
|
||||
|
||||
def plot(occs: dict, img_folder: str):
    """Save plot of common hashtags as bar chart to file.

    Args:
        occs: Dictionary whose "top_n" entry holds two parallel lists,
            hashtag names and their counts. The first pair supplies the
            plot title and the count every other bar is scaled by; the
            remaining pairs are drawn as bars, so at least two pairs are
            expected.
        img_folder: Folder the image is written to (via `save_plot`).
    """
    names, counts = occs["top_n"][0], occs["top_n"][1]
    hashtag = names[0]
    max_count = counts[0]
    labels = names[1:]
    freqs = [count / max_count * 100 for count in counts[1:]]
    y_pos = list(reversed(range(len(names) - 1)))

    fig, ax = plt.subplots(figsize=(5, 6.66))
    try:
        ax.barh(y_pos, freqs)
        ax.set_yticks(y_pos)
        ax.set_yticklabels(labels)
        ax.grid(axis="y")
        ax.set_xlabel("Percent of posts with common hashtag")
        ax.set_ylim(min(y_pos) - 1, max(y_pos) + 1)
        ax.set_title(f"Common hashtags for #{hashtag} posts")
        ax.xaxis.set_major_formatter(mtick.PercentFormatter(decimals=0))
        save_plot(img_folder, hashtag)
    finally:
        # pyplot keeps every figure alive until it is closed; release it so
        # repeated calls do not accumulate open figures
        plt.close(fig)
|
||||
|
||||
|
||||
def save_plot(img_folder, hashtag):
    """Save the current plot as a timestamped png file in `img_folder`.

    Args:
        img_folder: Folder the image is written to.
        hashtag: Hashtag used as the file name prefix.
    """
    current_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    filename = f"{img_folder}/{hashtag}_{current_time}.png"
    plt.savefig(filename, bbox_inches="tight", facecolor="white", dpi=300)
    # Report success only once the file has actually been written
    logging.info(f"Plot saved to file: {filename}")
|
||||
|
||||
|
||||
def print_occurrences(occs):
    """Print a ranked table of the top hashtags, their counts and their
    share of the total number of posts."""
    total_posts = occs["total"]
    names, counts = occs["top_n"][0], occs["top_n"][1]

    print(f"{'Rank':<8} {'Hashtag':<30} {'Occurrences':<15} {'Frequency':<15}")
    for rank, (name, count) in enumerate(zip(names, counts)):
        ratio = count / total_posts
        print(f"{rank:<8} {name:<30} {count:<15} {ratio:.4f}")
    print(f"Total posts: {total_posts}")
|
||||
@@ -1,5 +0,0 @@
|
||||
# Enter a hashtag per line. Each line should contain only one word.
|
||||
london
|
||||
paris
|
||||
tokyo
|
||||
newyork
|
||||
@@ -1,36 +0,0 @@
|
||||
[loggers]
|
||||
keys=root,Logger
|
||||
|
||||
[handlers]
|
||||
keys=consoleHandler,fileHandler
|
||||
|
||||
[formatters]
|
||||
keys=consoleFormatter,fileFormatter
|
||||
|
||||
[logger_root]
|
||||
level=DEBUG
|
||||
handlers=consoleHandler
|
||||
|
||||
[logger_Logger]
|
||||
level=DEBUG
|
||||
handlers=consoleHandler,fileHandler
|
||||
qualname=Logger
|
||||
propagate=0
|
||||
|
||||
[handler_consoleHandler]
|
||||
class=StreamHandler
|
||||
level=INFO
|
||||
formatter=consoleFormatter
|
||||
args=(sys.stdout,)
|
||||
|
||||
[handler_fileHandler]
|
||||
class=FileHandler
|
||||
level=DEBUG
|
||||
formatter=fileFormatter
|
||||
args=("../logfile.log",)
|
||||
|
||||
[formatter_consoleFormatter]
|
||||
format=%(message)s
|
||||
|
||||
[formatter_fileFormatter]
|
||||
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
|
||||
@@ -1,150 +0,0 @@
|
||||
"""Download post data or videos from TikToks containing one or more specified hashtags.
|
||||
|
||||
- The "-p" flag specifies that only data from posts is downloaded, no video files
|
||||
- The "-v" flag specifies that only video files are downloaded, no post data
|
||||
- Specifying both "-p" and "-v" flags downloads both post data and video files
|
||||
- The "-t" flag allows the user to specify a list of space-separated hashtags as an argument
|
||||
- The "-f" flag allows the user to specify the filename of a text file containing a list of newline-separated hashtags as an argument
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
from typing import List, Tuple, Dict, Any, Optional
|
||||
from tempfile import TemporaryDirectory
|
||||
from tiktok_hashtag_analysis import global_data
|
||||
import tiktok_hashtag_analysis.file_methods as file_methods
|
||||
from tiktok_hashtag_analysis import data_methods
|
||||
|
||||
|
||||
def get_hashtag_list(file_name: str) -> List[str]:
    """Read newline-separated hashtags from a text file.

    Lines starting with "#" are treated as comments and blank lines are
    skipped; surrounding whitespace is stripped from every hashtag.

    Raises:
        OSError: If `file_name` is not an existing file.
    """
    if not file_methods.check_existence(file_name, "file"):
        raise OSError(f"{file_name} does not exist")

    tags = []
    with open(file_name) as f:
        for line in f:
            if line.startswith("#"):
                continue
            stripped = line.strip()
            if stripped:
                tags.append(stripped)
    return tags
|
||||
|
||||
|
||||
def set_download_settings(download_data_type: Dict[str, bool]) -> Dict[str, Any]:
    """Build the `settings` dict from the constants in `global_data`.

    The ID directory (`<data>/<ids>`) is created if it does not exist.
    Post- and video-related entries are only added for the data types that
    are enabled in `download_data_type`.
    """
    files = global_data.FILES
    parameters = global_data.PARAMETERS

    settings = {
        "data": files["data"],
        "ids": files["ids"],
        "sleep": parameters["sleep"],
        "scraper": parameters["scraper_attempts"],
    }
    file_methods.check_file(f"{settings['data']}/{settings['ids']}", "dir")

    if download_data_type["posts"]:
        settings.update(
            posts=files["posts"],
            post_ids=files["post_ids"],
            data_file=files["data_file"],
        )
    if download_data_type["videos"]:
        settings.update(videos=files["videos"], video_ids=files["video_ids"])

    return settings
|
||||
|
||||
|
||||
def get_posts(settings: dict, tag: str) -> Optional[Tuple[str, int]]:
    """Scrape trending TikTok post data for the specified hashtag.

    1. `file_methods.download_posts` scrapes the post data into a
       temporary directory.
    2. `data_methods.extract_posts` determines which posts, if any,
       haven't previously been downloaded.
    3. `data_methods.update_posts` stores the new posts and then adds
       their IDs to the ID list.

    Returns the `(tag, count)` summary of the ID update, or `None` when
    nothing was downloaded or nothing was new.
    """
    # The temporary directory must stay alive while the downloaded file is read
    with TemporaryDirectory() as temp_dir:
        file_path = file_methods.download_posts(settings, tag, temp_dir)
        if not file_path:
            return None

        new_data = data_methods.extract_posts(settings, file_path, tag)
        if not new_data:
            return None

        new_ids, new_posts = new_data[0], new_data[1]
        data_file = os.path.join(
            settings["data"], tag, settings["posts"], settings["data_file"]
        )
        data_methods.update_posts(data_file, "file", new_posts)
        return data_methods.update_posts(settings["post_ids"], "file", new_ids, tag)
|
||||
|
||||
|
||||
def get_videos(settings: dict, tag: str) -> Optional[Tuple[str, int]]:
    """Scrape trending TikTok video files for the specified hashtag.

    1. `file_methods.download_videos` downloads the video files.
    2. `data_methods.extract_videos` determines which videos, if any,
       haven't previously been downloaded.
    3. `data_methods.update_videos` adds the IDs of the new videos to the
       ID list.
    4. When nothing is new, `file_methods.clean_video_files` deletes the
       residual video folder instead.

    Returns the `(tag, count)` summary of the ID update, or `None` when no
    new video was stored.
    """
    download_list = file_methods.download_videos(settings, tag)
    if not download_list:
        return None

    new_data = data_methods.extract_videos(settings, tag, download_list)
    if new_data:
        return data_methods.update_videos(settings, new_data, tag)

    # Everything was already known: only tidy up the download folder
    file_methods.clean_video_files(settings, tag)
    return None
|
||||
|
||||
|
||||
def get_data(
    hashtags: list, download_data_type: Dict[str, bool]
) -> List[Tuple[str, Tuple[str, int]]]:
    """Scrape posts and/or videos for the specified hashtags.

    Args:
        hashtags: Hashtags to scrape.
        download_data_type: Which data to download, keyed by "posts" and
            "videos".

    Returns:
        One `(hashtag, (data_type, count))` entry for every hashtag and
        data type that produced new data.
    """
    scraped_summary_list = []
    last_index = len(hashtags) - 1

    if download_data_type["posts"]:
        settings = set_download_settings(download_data_type)
        for index, tag in enumerate(hashtags):
            # Make sure the hashtag's post folder and data file exist
            posts_dir = os.path.join(settings["data"], tag, settings["posts"])
            file_methods.check_file(posts_dir, "dir")
            file_methods.check_file(
                os.path.join(posts_dir, settings["data_file"]), "file"
            )
            res = get_posts(settings, tag)
            if res:
                scraped_summary_list.append((res[0], ("posts", res[1])))
                data_methods.print_total(settings["post_ids"], tag, "posts")

            # Pause between hashtags, but not after the last one. The old
            # check compared the already-incremented counter with
            # `total - 1` and skipped the pause before the final hashtag.
            if index < last_index:
                time.sleep(settings["sleep"])

    if download_data_type["videos"]:
        settings = set_download_settings(download_data_type)
        # Iterate from the first hashtag again: the old shared `counter` was
        # left at its final value by the posts loop, so no video was
        # downloaded when posts were requested as well.
        for index, tag in enumerate(hashtags):
            file_methods.check_file(
                os.path.join(settings["data"], tag, settings["videos"]), "dir"
            )
            settings["videos_delete"] = settings["data"] + f"/{tag}/videos/#{tag}"
            settings["videos_to"] = settings["data"] + f"/{tag}/videos"
            _res = get_videos(settings, tag)
            if _res:
                scraped_summary_list.append((_res[0], ("videos", _res[1])))
                data_methods.print_total(settings["video_ids"], tag, "videos")

            if index < last_index:
                time.sleep(settings["sleep"])

    return scraped_summary_list
|
||||
@@ -1,12 +1,11 @@
|
||||
|
||||
_MAJOR = "1"
|
||||
_MAJOR = "2"
|
||||
_MINOR = "0"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "4"
|
||||
_PATCH = "0"
|
||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||
_SUFFIX = ""
|
||||
|
||||
VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
|
||||
__version__ = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX)
|
||||
VERSION = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX)
|
||||
|
||||
Reference in New Issue
Block a user