Merge pull request #20 from bellingcat/refactor

Refactor
This commit is contained in:
Tristan Lee
2023-09-06 09:53:57 -05:00
committed by GitHub
27 changed files with 671 additions and 1300 deletions

View File

@@ -33,15 +33,12 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade --upgrade-strategy=eager pip setuptools wheel twine pipenv
python -m pip install --upgrade --upgrade-strategy=eager pip setuptools wheel twine
python -m pip install -e . --upgrade
python -m pipenv install --dev --python 3.10
env:
PIPENV_DEFAULT_PYTHON_VERSION: "3.10"
- name: Build wheels
run: |
python -m pipenv run python setup.py sdist bdist_wheel
python setup.py sdist bdist_wheel
- name: Publish a Python distribution to PyPI
uses: pypa/gh-action-pypi-publish@release/v1

3
.gitignore vendored
View File

@@ -1,5 +1,8 @@
# Data directory
data/
build/
*.egg-info/
dist/
# Miscellaneous files
**/.DS_Store

13
Pipfile
View File

@@ -1,13 +0,0 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
matplotlib = "*"
seaborn = "*"
[dev-packages]
[requires]
python_version = "3.10"

416
Pipfile.lock generated
View File

@@ -1,416 +0,0 @@
{
"_meta": {
"hash": {
"sha256": "97c5ef0126b17f586b5fa1d518cf359b7e984e48f8fc2310e9aa79bd384c2374"
},
"pipfile-spec": 6,
"requires": {
"python_version": "3.10"
},
"sources": [
{
"name": "pypi",
"url": "https://pypi.org/simple",
"verify_ssl": true
}
]
},
"default": {
"contourpy": {
"hashes": [
"sha256:031154ed61f7328ad7f97662e48660a150ef84ee1bc8876b6472af88bf5a9b98",
"sha256:0f9d350b639db6c2c233d92c7f213d94d2e444d8e8fc5ca44c9706cf72193772",
"sha256:130230b7e49825c98edf0b428b7aa1125503d91732735ef897786fe5452b1ec2",
"sha256:152fd8f730c31fd67fe0ffebe1df38ab6a669403da93df218801a893645c6ccc",
"sha256:1c71fdd8f1c0f84ffd58fca37d00ca4ebaa9e502fb49825484da075ac0b0b803",
"sha256:24847601071f740837aefb730e01bd169fbcaa610209779a78db7ebb6e6a7051",
"sha256:2e9ebb4425fc1b658e13bace354c48a933b842d53c458f02c86f371cecbedecc",
"sha256:30676ca45084ee61e9c3da589042c24a57592e375d4b138bd84d8709893a1ba4",
"sha256:31a55dccc8426e71817e3fe09b37d6d48ae40aae4ecbc8c7ad59d6893569c436",
"sha256:366a0cf0fc079af5204801786ad7a1c007714ee3909e364dbac1729f5b0849e5",
"sha256:38e2e577f0f092b8e6774459317c05a69935a1755ecfb621c0a98f0e3c09c9a5",
"sha256:3c184ad2433635f216645fdf0493011a4667e8d46b34082f5a3de702b6ec42e3",
"sha256:3caea6365b13119626ee996711ab63e0c9d7496f65641f4459c60a009a1f3e80",
"sha256:3e927b3868bd1e12acee7cc8f3747d815b4ab3e445a28d2e5373a7f4a6e76ba1",
"sha256:4ee3ee247f795a69e53cd91d927146fb16c4e803c7ac86c84104940c7d2cabf0",
"sha256:54d43960d809c4c12508a60b66cb936e7ed57d51fb5e30b513934a4a23874fae",
"sha256:57119b0116e3f408acbdccf9eb6ef19d7fe7baf0d1e9aaa5381489bc1aa56556",
"sha256:58569c491e7f7e874f11519ef46737cea1d6eda1b514e4eb5ac7dab6aa864d02",
"sha256:5a011cf354107b47c58ea932d13b04d93c6d1d69b8b6dce885e642531f847566",
"sha256:5caeacc68642e5f19d707471890f037a13007feba8427eb7f2a60811a1fc1350",
"sha256:5dd34c1ae752515318224cba7fc62b53130c45ac6a1040c8b7c1a223c46e8967",
"sha256:60835badb5ed5f4e194a6f21c09283dd6e007664a86101431bf870d9e86266c4",
"sha256:62398c80ef57589bdbe1eb8537127321c1abcfdf8c5f14f479dbbe27d0322e66",
"sha256:6381fa66866b0ea35e15d197fc06ac3840a9b2643a6475c8fff267db8b9f1e69",
"sha256:64757f6460fc55d7e16ed4f1de193f362104285c667c112b50a804d482777edd",
"sha256:69f8ff4db108815addd900a74df665e135dbbd6547a8a69333a68e1f6e368ac2",
"sha256:6c180d89a28787e4b73b07e9b0e2dac7741261dbdca95f2b489c4f8f887dd810",
"sha256:71b0bf0c30d432278793d2141362ac853859e87de0a7dee24a1cea35231f0d50",
"sha256:769eef00437edf115e24d87f8926955f00f7704bede656ce605097584f9966dc",
"sha256:7f6979d20ee5693a1057ab53e043adffa1e7418d734c1532e2d9e915b08d8ec2",
"sha256:87f4d8941a9564cda3f7fa6a6cd9b32ec575830780677932abdec7bcb61717b0",
"sha256:89ba9bb365446a22411f0673abf6ee1fea3b2cf47b37533b970904880ceb72f3",
"sha256:8acf74b5d383414401926c1598ed77825cd530ac7b463ebc2e4f46638f56cce6",
"sha256:9056c5310eb1daa33fc234ef39ebfb8c8e2533f088bbf0bc7350f70a29bde1ac",
"sha256:95c3acddf921944f241b6773b767f1cbce71d03307270e2d769fd584d5d1092d",
"sha256:9e20e5a1908e18aaa60d9077a6d8753090e3f85ca25da6e25d30dc0a9e84c2c6",
"sha256:a1e97b86f73715e8670ef45292d7cc033548266f07d54e2183ecb3c87598888f",
"sha256:a877ada905f7d69b2a31796c4b66e31a8068b37aa9b78832d41c82fc3e056ddd",
"sha256:a9d7587d2fdc820cc9177139b56795c39fb8560f540bba9ceea215f1f66e1566",
"sha256:abf298af1e7ad44eeb93501e40eb5a67abbf93b5d90e468d01fc0c4451971afa",
"sha256:ae90d5a8590e5310c32a7630b4b8618cef7563cebf649011da80874d0aa8f414",
"sha256:b6d0f9e1d39dbfb3977f9dd79f156c86eb03e57a7face96f199e02b18e58d32a",
"sha256:b8d587cc39057d0afd4166083d289bdeff221ac6d3ee5046aef2d480dc4b503c",
"sha256:c5210e5d5117e9aec8c47d9156d1d3835570dd909a899171b9535cb4a3f32693",
"sha256:cc331c13902d0f50845099434cd936d49d7a2ca76cb654b39691974cb1e4812d",
"sha256:ce41676b3d0dd16dbcfabcc1dc46090aaf4688fd6e819ef343dbda5a57ef0161",
"sha256:d8165a088d31798b59e91117d1f5fc3df8168d8b48c4acc10fc0df0d0bdbcc5e",
"sha256:e7281244c99fd7c6f27c1c6bfafba878517b0b62925a09b586d88ce750a016d2",
"sha256:e96a08b62bb8de960d3a6afbc5ed8421bf1a2d9c85cc4ea73f4bc81b4910500f",
"sha256:ed33433fc3820263a6368e532f19ddb4c5990855e4886088ad84fd7c4e561c71",
"sha256:efb8f6d08ca7998cf59eaf50c9d60717f29a1a0a09caa46460d33b2924839dbd",
"sha256:efe99298ba37e37787f6a2ea868265465410822f7bea163edcc1bd3903354ea9",
"sha256:f99e9486bf1bb979d95d5cffed40689cb595abb2b841f2991fc894b3452290e8",
"sha256:fc1464c97579da9f3ab16763c32e5c5d5bb5fa1ec7ce509a4ca6108b61b84fab",
"sha256:fd7dc0e6812b799a34f6d12fcb1000539098c249c8da54f3566c6a6461d0dbad"
],
"markers": "python_version >= '3.8'",
"version": "==1.0.7"
},
"cycler": {
"hashes": [
"sha256:3a27e95f763a428a739d2add979fa7494c912a32c17c4c38c4d5f082cad165a3",
"sha256:9c87405839a19696e837b3b818fed3f5f69f16f1eec1a1ad77e043dcea9c772f"
],
"markers": "python_version >= '3.6'",
"version": "==0.11.0"
},
"fonttools": {
"hashes": [
"sha256:2bb244009f9bf3fa100fc3ead6aeb99febe5985fa20afbfbaa2f8946c2fbdaf1",
"sha256:820466f43c8be8c3009aef8b87e785014133508f0de64ec469e4efb643ae54fb"
],
"markers": "python_version >= '3.7'",
"version": "==4.38.0"
},
"kiwisolver": {
"hashes": [
"sha256:02f79693ec433cb4b5f51694e8477ae83b3205768a6fb48ffba60549080e295b",
"sha256:03baab2d6b4a54ddbb43bba1a3a2d1627e82d205c5cf8f4c924dc49284b87166",
"sha256:1041feb4cda8708ce73bb4dcb9ce1ccf49d553bf87c3954bdfa46f0c3f77252c",
"sha256:10ee06759482c78bdb864f4109886dff7b8a56529bc1609d4f1112b93fe6423c",
"sha256:1d1573129aa0fd901076e2bfb4275a35f5b7aa60fbfb984499d661ec950320b0",
"sha256:283dffbf061a4ec60391d51e6155e372a1f7a4f5b15d59c8505339454f8989e4",
"sha256:28bc5b299f48150b5f822ce68624e445040595a4ac3d59251703779836eceff9",
"sha256:2a66fdfb34e05b705620dd567f5a03f239a088d5a3f321e7b6ac3239d22aa286",
"sha256:2e307eb9bd99801f82789b44bb45e9f541961831c7311521b13a6c85afc09767",
"sha256:2e407cb4bd5a13984a6c2c0fe1845e4e41e96f183e5e5cd4d77a857d9693494c",
"sha256:2f5e60fabb7343a836360c4f0919b8cd0d6dbf08ad2ca6b9cf90bf0c76a3c4f6",
"sha256:36dafec3d6d6088d34e2de6b85f9d8e2324eb734162fba59d2ba9ed7a2043d5b",
"sha256:3fe20f63c9ecee44560d0e7f116b3a747a5d7203376abeea292ab3152334d004",
"sha256:41dae968a94b1ef1897cb322b39360a0812661dba7c682aa45098eb8e193dbdf",
"sha256:4bd472dbe5e136f96a4b18f295d159d7f26fd399136f5b17b08c4e5f498cd494",
"sha256:4ea39b0ccc4f5d803e3337dd46bcce60b702be4d86fd0b3d7531ef10fd99a1ac",
"sha256:5853eb494c71e267912275e5586fe281444eb5e722de4e131cddf9d442615626",
"sha256:5bce61af018b0cb2055e0e72e7d65290d822d3feee430b7b8203d8a855e78766",
"sha256:6295ecd49304dcf3bfbfa45d9a081c96509e95f4b9d0eb7ee4ec0530c4a96514",
"sha256:62ac9cc684da4cf1778d07a89bf5f81b35834cb96ca523d3a7fb32509380cbf6",
"sha256:70e7c2e7b750585569564e2e5ca9845acfaa5da56ac46df68414f29fea97be9f",
"sha256:7577c1987baa3adc4b3c62c33bd1118c3ef5c8ddef36f0f2c950ae0b199e100d",
"sha256:75facbe9606748f43428fc91a43edb46c7ff68889b91fa31f53b58894503a191",
"sha256:787518a6789009c159453da4d6b683f468ef7a65bbde796bcea803ccf191058d",
"sha256:78d6601aed50c74e0ef02f4204da1816147a6d3fbdc8b3872d263338a9052c51",
"sha256:7c43e1e1206cd421cd92e6b3280d4385d41d7166b3ed577ac20444b6995a445f",
"sha256:81e38381b782cc7e1e46c4e14cd997ee6040768101aefc8fa3c24a4cc58e98f8",
"sha256:841293b17ad704d70c578f1f0013c890e219952169ce8a24ebc063eecf775454",
"sha256:872b8ca05c40d309ed13eb2e582cab0c5a05e81e987ab9c521bf05ad1d5cf5cb",
"sha256:877272cf6b4b7e94c9614f9b10140e198d2186363728ed0f701c6eee1baec1da",
"sha256:8c808594c88a025d4e322d5bb549282c93c8e1ba71b790f539567932722d7bd8",
"sha256:8ed58b8acf29798b036d347791141767ccf65eee7f26bde03a71c944449e53de",
"sha256:91672bacaa030f92fc2f43b620d7b337fd9a5af28b0d6ed3f77afc43c4a64b5a",
"sha256:968f44fdbf6dd757d12920d63b566eeb4d5b395fd2d00d29d7ef00a00582aac9",
"sha256:9f85003f5dfa867e86d53fac6f7e6f30c045673fa27b603c397753bebadc3008",
"sha256:a553dadda40fef6bfa1456dc4be49b113aa92c2a9a9e8711e955618cd69622e3",
"sha256:a68b62a02953b9841730db7797422f983935aeefceb1679f0fc85cbfbd311c32",
"sha256:abbe9fa13da955feb8202e215c4018f4bb57469b1b78c7a4c5c7b93001699938",
"sha256:ad881edc7ccb9d65b0224f4e4d05a1e85cf62d73aab798943df6d48ab0cd79a1",
"sha256:b1792d939ec70abe76f5054d3f36ed5656021dcad1322d1cc996d4e54165cef9",
"sha256:b428ef021242344340460fa4c9185d0b1f66fbdbfecc6c63eff4b7c29fad429d",
"sha256:b533558eae785e33e8c148a8d9921692a9fe5aa516efbdff8606e7d87b9d5824",
"sha256:ba59c92039ec0a66103b1d5fe588fa546373587a7d68f5c96f743c3396afc04b",
"sha256:bc8d3bd6c72b2dd9decf16ce70e20abcb3274ba01b4e1c96031e0c4067d1e7cd",
"sha256:bc9db8a3efb3e403e4ecc6cd9489ea2bac94244f80c78e27c31dcc00d2790ac2",
"sha256:bf7d9fce9bcc4752ca4a1b80aabd38f6d19009ea5cbda0e0856983cf6d0023f5",
"sha256:c2dbb44c3f7e6c4d3487b31037b1bdbf424d97687c1747ce4ff2895795c9bf69",
"sha256:c79ebe8f3676a4c6630fd3f777f3cfecf9289666c84e775a67d1d358578dc2e3",
"sha256:c97528e64cb9ebeff9701e7938653a9951922f2a38bd847787d4a8e498cc83ae",
"sha256:d0611a0a2a518464c05ddd5a3a1a0e856ccc10e67079bb17f265ad19ab3c7597",
"sha256:d06adcfa62a4431d404c31216f0f8ac97397d799cd53800e9d3efc2fbb3cf14e",
"sha256:d41997519fcba4a1e46eb4a2fe31bc12f0ff957b2b81bac28db24744f333e955",
"sha256:d5b61785a9ce44e5a4b880272baa7cf6c8f48a5180c3e81c59553ba0cb0821ca",
"sha256:da152d8cdcab0e56e4f45eb08b9aea6455845ec83172092f09b0e077ece2cf7a",
"sha256:da7e547706e69e45d95e116e6939488d62174e033b763ab1496b4c29b76fabea",
"sha256:db5283d90da4174865d520e7366801a93777201e91e79bacbac6e6927cbceede",
"sha256:db608a6757adabb32f1cfe6066e39b3706d8c3aa69bbc353a5b61edad36a5cb4",
"sha256:e0ea21f66820452a3f5d1655f8704a60d66ba1191359b96541eaf457710a5fc6",
"sha256:e7da3fec7408813a7cebc9e4ec55afed2d0fd65c4754bc376bf03498d4e92686",
"sha256:e92a513161077b53447160b9bd8f522edfbed4bd9759e4c18ab05d7ef7e49408",
"sha256:ecb1fa0db7bf4cff9dac752abb19505a233c7f16684c5826d1f11ebd9472b871",
"sha256:efda5fc8cc1c61e4f639b8067d118e742b812c930f708e6667a5ce0d13499e29",
"sha256:f0a1dbdb5ecbef0d34eb77e56fcb3e95bbd7e50835d9782a45df81cc46949750",
"sha256:f0a71d85ecdd570ded8ac3d1c0f480842f49a40beb423bb8014539a9f32a5897",
"sha256:f4f270de01dd3e129a72efad823da90cc4d6aafb64c410c9033aba70db9f1ff0",
"sha256:f6cb459eea32a4e2cf18ba5fcece2dbdf496384413bc1bae15583f19e567f3b2",
"sha256:f8ad8285b01b0d4695102546b342b493b3ccc6781fc28c8c6a1bb63e95d22f09",
"sha256:f9f39e2f049db33a908319cf46624a569b36983c7c78318e9726a4cb8923b26c"
],
"markers": "python_version >= '3.7'",
"version": "==1.4.4"
},
"matplotlib": {
"hashes": [
"sha256:01b7f521a9a73c383825813af255f8c4485d1706e4f3e2ed5ae771e4403a40ab",
"sha256:11011c97d62c1db7bc20509572557842dbb8c2a2ddd3dd7f20501aa1cde3e54e",
"sha256:1183877d008c752d7d535396096c910f4663e4b74a18313adee1213328388e1e",
"sha256:12f999661589981e74d793ee2f41b924b3b87d65fd929f6153bf0f30675c59b1",
"sha256:1c235bf9be052347373f589e018988cad177abb3f997ab1a2e2210c41562cc0c",
"sha256:1f4d69707b1677560cd952544ee4962f68ff07952fb9069ff8c12b56353cb8c9",
"sha256:1fcc4cad498533d3c393a160975acc9b36ffa224d15a6b90ae579eacee5d8579",
"sha256:2787a16df07370dcba385fe20cdd0cc3cfaabd3c873ddabca78c10514c799721",
"sha256:29f17b7f2e068dc346687cbdf80b430580bab42346625821c2d3abf3a1ec5417",
"sha256:38d38cb1ea1d80ee0f6351b65c6f76cad6060bbbead015720ba001348ae90f0c",
"sha256:3f56a7252eee8f3438447f75f5e1148a1896a2756a92285fe5d73bed6deebff4",
"sha256:5223affa21050fb6118353c1380c15e23aedfb436bf3e162c26dc950617a7519",
"sha256:57ad1aee29043163374bfa8990e1a2a10ff72c9a1bfaa92e9c46f6ea59269121",
"sha256:59400cc9451094b7f08cc3f321972e6e1db4cd37a978d4e8a12824bf7fd2f03b",
"sha256:68d94a436f62b8a861bf3ace82067a71bafb724b4e4f9133521e4d8012420dd7",
"sha256:6adc441b5b2098a4b904bbf9d9e92fb816fef50c55aa2ea6a823fc89b94bb838",
"sha256:6d81b11ede69e3a751424b98dc869c96c10256b2206bfdf41f9c720eee86844c",
"sha256:73b93af33634ed919e72811c9703e1105185cd3fb46d76f30b7f4cfbbd063f89",
"sha256:77b384cee7ab8cf75ffccbfea351a09b97564fc62d149827a5e864bec81526e5",
"sha256:79e501eb847f4a489eb7065bb8d3187117f65a4c02d12ea3a19d6c5bef173bcc",
"sha256:809119d1cba3ece3c9742eb01827fe7a0e781ea3c5d89534655a75e07979344f",
"sha256:80c166a0e28512e26755f69040e6bf2f946a02ffdb7c00bf6158cca3d2b146e6",
"sha256:81b409b2790cf8d7c1ef35920f01676d2ae7afa8241844e7aa5484fdf493a9a0",
"sha256:994637e2995b0342699b396a320698b07cd148bbcf2dd2fa2daba73f34dd19f2",
"sha256:9ceebaf73f1a3444fa11014f38b9da37ff7ea328d6efa1652241fe3777bfdab9",
"sha256:9fb8fb19d03abf3c5dab89a8677e62c4023632f919a62b6dd1d6d2dbf42cd9f5",
"sha256:acc3b1a4bddbf56fe461e36fb9ef94c2cb607fc90d24ccc650040bfcc7610de4",
"sha256:bbddfeb1495484351fb5b30cf5bdf06b3de0bc4626a707d29e43dfd61af2a780",
"sha256:bbf269e1d24bc25247095d71c7a969813f7080e2a7c6fa28931a603f747ab012",
"sha256:bebcff4c3ed02c6399d47329f3554193abd824d3d53b5ca02cf583bcd94470e2",
"sha256:c3f08df2ac4636249b8bc7a85b8b82c983bef1441595936f62c2918370ca7e1d",
"sha256:ca94f0362f6b6f424b555b956971dcb94b12d0368a6c3e07dc7a40d32d6d873d",
"sha256:d00c248ab6b92bea3f8148714837937053a083ff03b4c5e30ed37e28fc0e7e56",
"sha256:d2cfaa7fd62294d945b8843ea24228a27c8e7c5b48fa634f3c168153b825a21b",
"sha256:d5f18430f5cfa5571ab8f4c72c89af52aa0618e864c60028f11a857d62200cba",
"sha256:debeab8e2ab07e5e3dac33e12456da79c7e104270d2b2d1df92b9e40347cca75",
"sha256:dfba7057609ca9567b9704626756f0142e97ec8c5ba2c70c6e7bd1c25ef99f06",
"sha256:e0a64d7cc336b52e90f59e6d638ae847b966f68582a7af041e063d568e814740",
"sha256:eb9421c403ffd387fbe729de6d9a03005bf42faba5e8432f4e51e703215b49fc",
"sha256:faff486b36530a836a6b4395850322e74211cd81fc17f28b4904e1bd53668e3e",
"sha256:ff2aa84e74f80891e6bcf292ebb1dd57714ffbe13177642d65fee25384a30894"
],
"index": "pypi",
"version": "==3.6.3"
},
"numpy": {
"hashes": [
"sha256:003a9f530e880cb2cd177cba1af7220b9aa42def9c4afc2a2fc3ee6be7eb2b22",
"sha256:150947adbdfeceec4e5926d956a06865c1c690f2fd902efede4ca6fe2e657c3f",
"sha256:2620e8592136e073bd12ee4536149380695fbe9ebeae845b81237f986479ffc9",
"sha256:2eabd64ddb96a1239791da78fa5f4e1693ae2dadc82a76bc76a14cbb2b966e96",
"sha256:4173bde9fa2a005c2c6e2ea8ac1618e2ed2c1c6ec8a7657237854d42094123a0",
"sha256:4199e7cfc307a778f72d293372736223e39ec9ac096ff0a2e64853b866a8e18a",
"sha256:4cecaed30dc14123020f77b03601559fff3e6cd0c048f8b5289f4eeabb0eb281",
"sha256:557d42778a6869c2162deb40ad82612645e21d79e11c1dc62c6e82a2220ffb04",
"sha256:63e45511ee4d9d976637d11e6c9864eae50e12dc9598f531c035265991910468",
"sha256:6524630f71631be2dabe0c541e7675db82651eb998496bbe16bc4f77f0772253",
"sha256:76807b4063f0002c8532cfeac47a3068a69561e9c8715efdad3c642eb27c0756",
"sha256:7de8fdde0003f4294655aa5d5f0a89c26b9f22c0a58790c38fae1ed392d44a5a",
"sha256:889b2cc88b837d86eda1b17008ebeb679d82875022200c6e8e4ce6cf549b7acb",
"sha256:92011118955724465fb6853def593cf397b4a1367495e0b59a7e69d40c4eb71d",
"sha256:97cf27e51fa078078c649a51d7ade3c92d9e709ba2bfb97493007103c741f1d0",
"sha256:9a23f8440561a633204a67fb44617ce2a299beecf3295f0d13c495518908e910",
"sha256:a51725a815a6188c662fb66fb32077709a9ca38053f0274640293a14fdd22978",
"sha256:a77d3e1163a7770164404607b7ba3967fb49b24782a6ef85d9b5f54126cc39e5",
"sha256:adbdce121896fd3a17a77ab0b0b5eedf05a9834a18699db6829a64e1dfccca7f",
"sha256:c29e6bd0ec49a44d7690ecb623a8eac5ab8a923bce0bea6293953992edf3a76a",
"sha256:c72a6b2f4af1adfe193f7beb91ddf708ff867a3f977ef2ec53c0ffb8283ab9f5",
"sha256:d0a2db9d20117bf523dde15858398e7c0858aadca7c0f088ac0d6edd360e9ad2",
"sha256:e3ab5d32784e843fc0dd3ab6dcafc67ef806e6b6828dc6af2f689be0eb4d781d",
"sha256:e428c4fbfa085f947b536706a2fc349245d7baa8334f0c5723c56a10595f9b95",
"sha256:e8d2859428712785e8a8b7d2b3ef0a1d1565892367b32f915c4a4df44d0e64f5",
"sha256:eef70b4fc1e872ebddc38cddacc87c19a3709c0e3e5d20bf3954c147b1dd941d",
"sha256:f64bb98ac59b3ea3bf74b02f13836eb2e24e48e0ab0145bbda646295769bd780",
"sha256:f9006288bcf4895917d02583cf3411f98631275bc67cce355a7f39f8c14338fa"
],
"markers": "python_version >= '3.8'",
"version": "==1.24.2"
},
"packaging": {
"hashes": [
"sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2",
"sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"
],
"markers": "python_version >= '3.7'",
"version": "==23.0"
},
"pandas": {
"hashes": [
"sha256:14e45300521902689a81f3f41386dc86f19b8ba8dd5ac5a3c7010ef8d2932813",
"sha256:26d9c71772c7afb9d5046e6e9cf42d83dd147b5cf5bcb9d97252077118543792",
"sha256:3749077d86e3a2f0ed51367f30bf5b82e131cc0f14260c4d3e499186fccc4406",
"sha256:41179ce559943d83a9b4bbacb736b04c928b095b5f25dd2b7389eda08f46f373",
"sha256:478ff646ca42b20376e4ed3fa2e8d7341e8a63105586efe54fa2508ee087f328",
"sha256:50869a35cbb0f2e0cd5ec04b191e7b12ed688874bd05dd777c19b28cbea90996",
"sha256:565fa34a5434d38e9d250af3c12ff931abaf88050551d9fbcdfafca50d62babf",
"sha256:5f2b952406a1588ad4cad5b3f55f520e82e902388a6d5a4a91baa8d38d23c7f6",
"sha256:5fbcb19d6fceb9e946b3e23258757c7b225ba450990d9ed63ccceeb8cae609f7",
"sha256:6973549c01ca91ec96199e940495219c887ea815b2083722821f1d7abfa2b4dc",
"sha256:74a3fd7e5a7ec052f183273dc7b0acd3a863edf7520f5d3a1765c04ffdb3b0b1",
"sha256:7a0a56cef15fd1586726dace5616db75ebcfec9179a3a55e78f72c5639fa2a23",
"sha256:7cec0bee9f294e5de5bbfc14d0573f65526071029d036b753ee6507d2a21480a",
"sha256:87bd9c03da1ac870a6d2c8902a0e1fd4267ca00f13bc494c9e5a9020920e1d51",
"sha256:972d8a45395f2a2d26733eb8d0f629b2f90bebe8e8eddbb8829b180c09639572",
"sha256:9842b6f4b8479e41968eced654487258ed81df7d1c9b7b870ceea24ed9459b31",
"sha256:9f69c4029613de47816b1bb30ff5ac778686688751a5e9c99ad8c7031f6508e5",
"sha256:a50d9a4336a9621cab7b8eb3fb11adb82de58f9b91d84c2cd526576b881a0c5a",
"sha256:bc4c368f42b551bf72fac35c5128963a171b40dce866fb066540eeaf46faa003",
"sha256:c39a8da13cede5adcd3be1182883aea1c925476f4e84b2807a46e2775306305d",
"sha256:c3ac844a0fe00bfaeb2c9b51ab1424e5c8744f89860b138434a363b1f620f354",
"sha256:c4c00e0b0597c8e4f59e8d461f797e5d70b4d025880516a8261b2817c47759ee",
"sha256:c74a62747864ed568f5a82a49a23a8d7fe171d0c69038b38cedf0976831296fa",
"sha256:dd05f7783b3274aa206a1af06f0ceed3f9b412cf665b7247eacd83be41cf7bf0",
"sha256:dfd681c5dc216037e0b0a2c821f5ed99ba9f03ebcf119c7dac0e9a7b960b9ec9",
"sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae",
"sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc"
],
"markers": "python_version >= '3.8'",
"version": "==1.5.3"
},
"pillow": {
"hashes": [
"sha256:013016af6b3a12a2f40b704677f8b51f72cb007dac785a9933d5c86a72a7fe33",
"sha256:0845adc64fe9886db00f5ab68c4a8cd933ab749a87747555cec1c95acea64b0b",
"sha256:0884ba7b515163a1a05440a138adeb722b8a6ae2c2b33aea93ea3118dd3a899e",
"sha256:09b89ddc95c248ee788328528e6a2996e09eaccddeeb82a5356e92645733be35",
"sha256:0dd4c681b82214b36273c18ca7ee87065a50e013112eea7d78c7a1b89a739153",
"sha256:0e51f608da093e5d9038c592b5b575cadc12fd748af1479b5e858045fff955a9",
"sha256:0f3269304c1a7ce82f1759c12ce731ef9b6e95b6df829dccd9fe42912cc48569",
"sha256:16a8df99701f9095bea8a6c4b3197da105df6f74e6176c5b410bc2df2fd29a57",
"sha256:19005a8e58b7c1796bc0167862b1f54a64d3b44ee5d48152b06bb861458bc0f8",
"sha256:1b4b4e9dda4f4e4c4e6896f93e84a8f0bcca3b059de9ddf67dac3c334b1195e1",
"sha256:28676836c7796805914b76b1837a40f76827ee0d5398f72f7dcc634bae7c6264",
"sha256:2968c58feca624bb6c8502f9564dd187d0e1389964898f5e9e1fbc8533169157",
"sha256:3f4cc516e0b264c8d4ccd6b6cbc69a07c6d582d8337df79be1e15a5056b258c9",
"sha256:3fa1284762aacca6dc97474ee9c16f83990b8eeb6697f2ba17140d54b453e133",
"sha256:43521ce2c4b865d385e78579a082b6ad1166ebed2b1a2293c3be1d68dd7ca3b9",
"sha256:451f10ef963918e65b8869e17d67db5e2f4ab40e716ee6ce7129b0cde2876eab",
"sha256:46c259e87199041583658457372a183636ae8cd56dbf3f0755e0f376a7f9d0e6",
"sha256:46f39cab8bbf4a384ba7cb0bc8bae7b7062b6a11cfac1ca4bc144dea90d4a9f5",
"sha256:519e14e2c49fcf7616d6d2cfc5c70adae95682ae20f0395e9280db85e8d6c4df",
"sha256:53dcb50fbdc3fb2c55431a9b30caeb2f7027fcd2aeb501459464f0214200a503",
"sha256:54614444887e0d3043557d9dbc697dbb16cfb5a35d672b7a0fcc1ed0cf1c600b",
"sha256:575d8912dca808edd9acd6f7795199332696d3469665ef26163cd090fa1f8bfa",
"sha256:5dd5a9c3091a0f414a963d427f920368e2b6a4c2f7527fdd82cde8ef0bc7a327",
"sha256:5f532a2ad4d174eb73494e7397988e22bf427f91acc8e6ebf5bb10597b49c493",
"sha256:60e7da3a3ad1812c128750fc1bc14a7ceeb8d29f77e0a2356a8fb2aa8925287d",
"sha256:653d7fb2df65efefbcbf81ef5fe5e5be931f1ee4332c2893ca638c9b11a409c4",
"sha256:6663977496d616b618b6cfa43ec86e479ee62b942e1da76a2c3daa1c75933ef4",
"sha256:6abfb51a82e919e3933eb137e17c4ae9c0475a25508ea88993bb59faf82f3b35",
"sha256:6c6b1389ed66cdd174d040105123a5a1bc91d0aa7059c7261d20e583b6d8cbd2",
"sha256:6d9dfb9959a3b0039ee06c1a1a90dc23bac3b430842dcb97908ddde05870601c",
"sha256:765cb54c0b8724a7c12c55146ae4647e0274a839fb6de7bcba841e04298e1011",
"sha256:7a21222644ab69ddd9967cfe6f2bb420b460dae4289c9d40ff9a4896e7c35c9a",
"sha256:7ac7594397698f77bce84382929747130765f66406dc2cd8b4ab4da68ade4c6e",
"sha256:7cfc287da09f9d2a7ec146ee4d72d6ea1342e770d975e49a8621bf54eaa8f30f",
"sha256:83125753a60cfc8c412de5896d10a0a405e0bd88d0470ad82e0869ddf0cb3848",
"sha256:847b114580c5cc9ebaf216dd8c8dbc6b00a3b7ab0131e173d7120e6deade1f57",
"sha256:87708d78a14d56a990fbf4f9cb350b7d89ee8988705e58e39bdf4d82c149210f",
"sha256:8a2b5874d17e72dfb80d917213abd55d7e1ed2479f38f001f264f7ce7bae757c",
"sha256:8f127e7b028900421cad64f51f75c051b628db17fb00e099eb148761eed598c9",
"sha256:94cdff45173b1919350601f82d61365e792895e3c3a3443cf99819e6fbf717a5",
"sha256:99d92d148dd03fd19d16175b6d355cc1b01faf80dae93c6c3eb4163709edc0a9",
"sha256:9a3049a10261d7f2b6514d35bbb7a4dfc3ece4c4de14ef5876c4b7a23a0e566d",
"sha256:9d9a62576b68cd90f7075876f4e8444487db5eeea0e4df3ba298ee38a8d067b0",
"sha256:9e5f94742033898bfe84c93c831a6f552bb629448d4072dd312306bab3bd96f1",
"sha256:a1c2d7780448eb93fbcc3789bf3916aa5720d942e37945f4056680317f1cd23e",
"sha256:a2e0f87144fcbbe54297cae708c5e7f9da21a4646523456b00cc956bd4c65815",
"sha256:a4dfdae195335abb4e89cc9762b2edc524f3c6e80d647a9a81bf81e17e3fb6f0",
"sha256:a96e6e23f2b79433390273eaf8cc94fec9c6370842e577ab10dabdcc7ea0a66b",
"sha256:aabdab8ec1e7ca7f1434d042bf8b1e92056245fb179790dc97ed040361f16bfd",
"sha256:b222090c455d6d1a64e6b7bb5f4035c4dff479e22455c9eaa1bdd4c75b52c80c",
"sha256:b52ff4f4e002f828ea6483faf4c4e8deea8d743cf801b74910243c58acc6eda3",
"sha256:b70756ec9417c34e097f987b4d8c510975216ad26ba6e57ccb53bc758f490dab",
"sha256:b8c2f6eb0df979ee99433d8b3f6d193d9590f735cf12274c108bd954e30ca858",
"sha256:b9b752ab91e78234941e44abdecc07f1f0d8f51fb62941d32995b8161f68cfe5",
"sha256:ba6612b6548220ff5e9df85261bddc811a057b0b465a1226b39bfb8550616aee",
"sha256:bd752c5ff1b4a870b7661234694f24b1d2b9076b8bf337321a814c612665f343",
"sha256:c3c4ed2ff6760e98d262e0cc9c9a7f7b8a9f61aa4d47c58835cdaf7b0b8811bb",
"sha256:c5c1362c14aee73f50143d74389b2c158707b4abce2cb055b7ad37ce60738d47",
"sha256:cb362e3b0976dc994857391b776ddaa8c13c28a16f80ac6522c23d5257156bed",
"sha256:d197df5489004db87d90b918033edbeee0bd6df3848a204bca3ff0a903bef837",
"sha256:d3b56206244dc8711f7e8b7d6cad4663917cd5b2d950799425076681e8766286",
"sha256:d5b2f8a31bd43e0f18172d8ac82347c8f37ef3e0b414431157718aa234991b28",
"sha256:d7081c084ceb58278dd3cf81f836bc818978c0ccc770cbbb202125ddabec6628",
"sha256:db74f5562c09953b2c5f8ec4b7dfd3f5421f31811e97d1dbc0a7c93d6e3a24df",
"sha256:df41112ccce5d47770a0c13651479fbcd8793f34232a2dd9faeccb75eb5d0d0d",
"sha256:e1339790c083c5a4de48f688b4841f18df839eb3c9584a770cbd818b33e26d5d",
"sha256:e621b0246192d3b9cb1dc62c78cfa4c6f6d2ddc0ec207d43c0dedecb914f152a",
"sha256:e8c5cf126889a4de385c02a2c3d3aba4b00f70234bfddae82a5eaa3ee6d5e3e6",
"sha256:e9d7747847c53a16a729b6ee5e737cf170f7a16611c143d95aa60a109a59c336",
"sha256:eaef5d2de3c7e9b21f1e762f289d17b726c2239a42b11e25446abf82b26ac132",
"sha256:ed3e4b4e1e6de75fdc16d3259098de7c6571b1a6cc863b1a49e7d3d53e036070",
"sha256:ef21af928e807f10bf4141cad4746eee692a0dd3ff56cfb25fce076ec3cc8abe",
"sha256:f09598b416ba39a8f489c124447b007fe865f786a89dbfa48bb5cf395693132a",
"sha256:f0caf4a5dcf610d96c3bd32932bfac8aee61c96e60481c2a0ea58da435e25acd",
"sha256:f6e78171be3fb7941f9910ea15b4b14ec27725865a73c15277bc39f5ca4f8391",
"sha256:f715c32e774a60a337b2bb8ad9839b4abf75b267a0f18806f6f4f5f1688c4b5a",
"sha256:fb5c1ad6bad98c57482236a21bf985ab0ef42bd51f7ad4e4538e89a997624e12"
],
"markers": "python_version >= '3.7'",
"version": "==9.4.0"
},
"pyparsing": {
"hashes": [
"sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb",
"sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"
],
"markers": "python_full_version >= '3.6.8'",
"version": "==3.0.9"
},
"python-dateutil": {
"hashes": [
"sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
"sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
"version": "==2.8.2"
},
"pytz": {
"hashes": [
"sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0",
"sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a"
],
"version": "==2022.7.1"
},
"seaborn": {
"hashes": [
"sha256:374645f36509d0dcab895cba5b47daf0586f77bfe3b36c97c607db7da5be0139",
"sha256:ebf15355a4dba46037dfd65b7350f014ceb1f13c05e814eda2c9f5fd731afc08"
],
"index": "pypi",
"version": "==0.12.2"
},
"six": {
"hashes": [
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
"version": "==1.16.0"
}
},
"develop": {}
}

145
README.md
View File

@@ -1,16 +1,12 @@
# TikTok hashtag analysis toolset
> IMPORTANT NOTE: this tool relies on [drawrowfly/tiktok-scraper](https://github.com/drawrowfly/tiktok-scraper) which seems to be broken at time of writing and without updates for some time with several open issues ([796](https://github.com/drawrowfly/tiktok-scraper/issues/796) [#799](https://github.com/drawrowfly/tiktok-scraper/issues/799)) that need to be fixed before this library can work smoothly :/
The tool helps to download posts and videos from TikTok for a given set of hashtags over a period of time. Users can create a growing database of posts for specific hashtags which can then be used for further hashtag analysis. It uses the [tiktok-scraper](https://github.com/drawrowfly/tiktok-scraper) Node package to download the posts and videos.
The tool helps to download posts and videos from TikTok for a given set of hashtags over a period of time. Users can create a growing database of posts for specific hashtags which can then be used for further hashtag analysis. It uses the [TikTokApi](https://github.com/davidteather/TikTok-Api) Python package to download the posts and uses [yt-dlp](https://github.com/yt-dlp/yt-dlp) to download the videos.
[![PyPI version](https://badge.fury.io/py/tiktok-hashtag-analysis.svg)](https://badge.fury.io/py/tiktok-hashtag-analysis)
## Pre-requisites
1. Make sure you have Python 3.6 or a later version installed
2. And, you need to have node version 16. On Mac, do `brew install node` followed by `npm install -g n` and then `n 16`
4. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper
5. Install the tool with pip: `pip install tiktok-hashtag-analysis`
1. Make sure you have Python 3.9 or a later version installed
2. Install the tool with pip: `pip install tiktok-hashtag-analysis`
1. Alternatively, install the latest version directly from the repository: `pip install git+https://github.com/bellingcat/tiktok-hashtag-analysis`
You should now be ready to start using it.
@@ -19,88 +15,83 @@ You should now be ready to start using it.
## About the tool
### Command-line arguments
```
tiktok-hashtag-analysis --help
usage: tiktok-hashtag-analysis [-h] [-t [T ...]] [-f F] [-p] [-v] [-ht HASHTAG] [-n NUMBER] [-plt] [-d] {download,frequencies}
usage: tiktok-hashtag-analysis [-h] [--file FILE] [-d] [--number NUMBER] [-p] [-t] [--output-dir OUTPUT_DIR] [--config CONFIG] [--log LOG] [hashtags ...]
Analyze hashtags within posts scraped from TikTok.
positional arguments:
{download,frequencies}
command to initialize
hashtags List of hashtags to scrape
options:
optional arguments:
-h, --help show this help message and exit
-t [T ...] List of hashtags to scrape (module: run_downloader)
-f F File name containing list of hashtags to scrape (module: run_downloader)
-p Download post data (module: run_downloader)
-v Download video files (module: run_downloader)
-ht HASHTAG, --hashtag HASHTAG
The hashtag of scraped posts to analyze (module: hashtag_frequencies)
-n NUMBER, --number NUMBER
The number of top n occurrences (module: hashtag_frequencies)
-plt, --plot Plot the occurrences (module: hashtag_frequencies)
-d, --print List top n hashtags (module: hashtag_frequencies)
--file FILE File name containing list of hashtags to scrape
-d, --download Download video files corresponding to scraped posts
--number NUMBER The number of co-occurring hashtags to analyze
-p, --plot Plot the most common co-occurring hashtags
-t, --table Print a table of the most common co-occurring hashtags
--output-dir OUTPUT_DIR
Directory to save scraped data and visualizations to
--config CONFIG File name of configuration file to store TikTok credentials to
--log LOG File to write logs to
```
### Structure of output data
```
$ tree ../data
../data
├── ids
│ └── post_ids.json
├── london
── posts
└── data.json
── plots
├── posts.json
│ └── media
├── newyork
── posts
└── data.json
── plots
├── posts.json
│ └── media
└── paris
── posts
── data.json
── plots
── posts.json
│ └── media
```
The `data` folder contains all the downloaded data as shown in the tree diagram above.
- The `ids` folder contains two files `post_ids.json` and `video_ids.json` that record the ids of the downloaded posts and videos for each hashtag.
- Each hashtag has a folder with two subfolders `posts` and `videos` that store posts and videos respectively. The posts are stored in the `data.json` file in the `posts` folder, and videos are stored as the `.mp4` files in the `videos` folder.
- Each hashtag has a folder with two subfolders `plots` and `media` that store plots of the most common co-occurring hashtags, and media downloaded from the posts. The posts are stored in the `posts.json` file, and downloaded media is stored as `.mp4` files (for videos) or audio and image files (for image galleries) in the `media` folder.
## How to use
### Post downloading
Running the `tiktok-hashtag-analysis download` command with the following options will scrape posts containing the hashtags `#london`, `#paris`, or `#newyork`:
Running the `tiktok-hashtag-analysis` command with the following options will scrape posts that contain the hashtags `#london`, `#paris`, or `#newyork`:
tiktok-hashtag-analysis download -t london paris newyork -p
tiktok-hashtag-analysis london paris newyork
and will produce an output similar to the following log:
$ tiktok-hashtag-analysis download -t london paris newyork -p
$ tiktok-hashtag-analysis london paris newyork
Hashtags to scrape: ['london', 'paris', 'newyork']
Scraped 963 posts containing the hashtag 'london'
Scraped 961 posts containing the hashtag 'paris'
Scraped 940 posts containing the hashtag 'newyork'
Successfully scraped 2864 total entries
- The `-t` flag allows a space-separated list of hashtags to be specified as a command line argument
- The `-p` flag specifies that posts, not videos, will be downloaded
- The list of hashtags to scrape is specified as a positional argument
### Video downloading
Running the `tiktok-hashtag-analysis download` script with the following options will scrape trending videos containing the hashtag `#london`:
`tiktok-hashtag-analysis download -t london -v`
Running the `tiktok-hashtag-analysis` command with the following options will scrape trending posts containing the hashtag `#london` and download the video files for those posts:
`tiktok-hashtag-analysis london --download`
- The `-t` flag allows a space-separated list of hashtags to be specified as a command line argument
- The `-v` flag specifies that videos, not posts, will be downloaded
- The `--download` flag specifies that video files for scraped posts should be downloaded
Note that video downloading is a time and data rate consuming task, as a result we recommend using one hashtag at a time when using the `-v` flag to avoid complications.
Note that downloading videos is time-consuming and uses a lot of bandwidth. As a result, we recommend scraping one hashtag at a time when using the `--download` flag to avoid complications.
## Analyzing results
### Top n hashtag occurrences
The script `tiktok-hashtag-analysis frequencies` analyzes the frequencies of top occurring hashtags in a given set of posts.
### Most common co-occurring hashtags
In addition to scraping data and downloading media, the `tiktok-hashtag-analysis` script can also analyze the frequencies of the most common co-occurring hashtags in a given set of posts.
Assume we want to analyze the 20 most frequently occurring hashtags in the downloaded posts of the `#london` hashtag.
Assume we want to analyze the 20 most frequently co-occurring hashtags in the downloaded posts of the `#london` hashtag.
- The results can be plotted and saved as a PNG file by executing the following command:
`tiktok-hashtag-analysis frequencies london 20 -p`
`tiktok-hashtag-analysis london --number 20 --plot`
which will produce a figure similar to that shown below:
<p align="center">
@@ -111,32 +102,48 @@ Assume we want to analyze the 20 most frequently occurring hashtags in the downl
- The results can be displayed in tabular form by executing the following command:
`tiktok-hashtag-analysis frequencies london 20 -d`
`tiktok-hashtag-analysis london --number 20 --table`
which will produce a terminal output similar to the following:
```
Rank Hashtag Occurrences Frequency
0 london 960 1.0000
1 fyp 494 0.5146
2 uk 238 0.2479
3 foryou 221 0.2302
4 foryoupage 184 0.1917
5 viral 179 0.1865
6 fypシ 84 0.0875
7 funny 56 0.0583
8 xyzbca 51 0.0531
9 british 45 0.0469
10 england 44 0.0458
11 trending 40 0.0417
12 fy 33 0.0344
13 comedy 32 0.0333
14 roadman 28 0.0292
15 4u 27 0.0281
16 usa 26 0.0271
17 tiktok 26 0.0271
18 travel 21 0.0219
19 america 20 0.0208
Total posts: 960
Co-occurring hashtags for #london posts
Rank Hashtag Occurrences Frequency
0 london 881 1.0000
1 fyp 399 0.4529
2 uk 174 0.1975
3 foryou 168 0.1907
4 viral 152 0.1725
5 foryoupage 137 0.1555
6 fypシ 73 0.0829
7 funny 54 0.0613
8 tiktok 43 0.0488
9 trending 43 0.0488
10 british 41 0.0465
11 england 38 0.0431
12 xyzbca 34 0.0386
13 fy 33 0.0375
14 usa 33 0.0375
15 love 29 0.0329
16 comedy 25 0.0284
17 royalfamily 23 0.0261
18 queen 23 0.0261
19 queenelizabeth 22 0.0250
Total posts: 881
```
The `Frequency` column shows the ratio of the occurrence to the total number of downloaded posts.
### Contributing
To run the built-in tests in the `tests/` directory, first install the test dependency packages:
```
pip install .[test]
```
and then run the tests using the following command:
```
pytest
```
This repo uses [black](https://github.com/psf/black) to format source code; please run the `black` command before submitting a PR.

15
pytest.ini Normal file
View File

@@ -0,0 +1,15 @@
[pytest]
minversion =
7.0.0
testpaths =
tests/
python_files =
*.py
addopts =
-vvv
--cov='tiktok_hashtag_analysis'
--cov-report html:reports/coverage
--html='reports/tests.html'
--self-contained-html
filterwarnings =
ignore:Glyph (.*) missing from current font

View File

@@ -1,2 +1,5 @@
matplotlib
seaborn
seaborn==0.12.2
matplotlib==3.7.2
yt-dlp==2023.7.6
TikTokApi==6.1.1
requests==2.31.0

View File

@@ -3,7 +3,7 @@
set -e
TAG=$(python -c 'from tiktok_hashtag_analysis.version import __version__; print("v" + __version__)')
TAG=$(python -c 'from tiktok_hashtag_analysis import __version__; print("v" + __version__)')
read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt

View File

@@ -1,36 +1,64 @@
from setuptools import setup, find_packages
from tiktok_hashtag_analysis.version import __version__
from setuptools import setup
def read_requirements(filename: str):
    """Parse a pip requirements file into a list of requirement strings.

    Blank lines and lines starting with ``#`` are dropped. Requirements that
    are GitHub repository URLs are rewritten to the ``name @ url`` form.
    """
    import re

    github_url = re.compile(
        r"^(git\+)?(https|ssh)://(git@)?github\.com/([\w-]+)/(?P<name>[\w-]+)\.git"
    )

    def fix_url_dependencies(req: str) -> str:
        """Pip and setuptools disagree about how URL dependencies should be handled."""
        found = github_url.match(req)
        return req if found is None else f"{found.group('name')} @ {req}"

    with open(filename) as requirements_file:
        stripped_lines = (raw.strip() for raw in requirements_file)
        return [
            fix_url_dependencies(entry)
            for entry in stripped_lines
            if entry and not entry.startswith("#")
        ]
with open("README.md", "r", encoding="utf-8") as file:
long_description = file.read()
# version.py defines the VERSION and VERSION_SHORT variables.
# We use exec here so we don't import the package itself whilst setting up.
VERSION = {} # type: ignore
with open("tiktok_hashtag_analysis/version.py", "r") as version_file:
exec(version_file.read(), VERSION)
setup(
name="tiktok-hashtag-analysis",
version=__version__,
version=VERSION["VERSION"],
author="Bellingcat",
author_email="tech@bellingcat.com",
packages=["tiktok_hashtag_analysis"],
package_data={
"tiktok_hashtag_analysis": [
"logging.config",
]
},
description="Analyze hashtags within posts scraped from TikTok",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/bellingcat/tiktok-hashtag-analysis",
license="MIT License",
install_requires=["seaborn", "matplotlib"],
# install_requires=read_requirements("requirements.txt"),
# extras_require={"dev": read_requirements("dev-requirements.txt")},
install_requires=["seaborn", "matplotlib", "TikTokApi", "requests", "yt_dlp"],
extras_require={"test": ["pytest", "pytest-cov", "pytest-html", "pytest-metadata"]},
classifiers=[
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Information Technology',
'License :: OSI Approved :: MIT License',
'Natural Language :: English',
'Programming Language :: Python :: 3'
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Information Technology",
"License :: OSI Approved :: MIT License",
"Natural Language :: English",
"Programming Language :: Python :: 3",
],
entry_points={
"console_scripts": [
"tiktok-hashtag-analysis=tiktok_hashtag_analysis.__main__:main",
"tiktok-hashtag-analysis=tiktok_hashtag_analysis.cli:main",
]
},
)

0
tests/__init__.py Normal file
View File

24
tests/auth.py Normal file
View File

@@ -0,0 +1,24 @@
import pytest
from tiktok_hashtag_analysis.auth import Authorization
MS_TOKEN = "thisisafakemstokenfortiktok"
def test_auth_input(tmp_path, monkeypatch):
    """A token typed at the prompt ends up on the `Authorization` object."""
    config_file = tmp_path / ".tiktok"
    # `Authorization.get_token` prefers the MS_TOKEN environment variable over
    # the prompt, so clear it to keep the test independent of the developer's shell.
    monkeypatch.delenv("MS_TOKEN", raising=False)
    monkeypatch.setattr("builtins.input", lambda _: MS_TOKEN)
    auth = Authorization(config_file=config_file)
    auth.get_token()
    assert auth.ms_token == MS_TOKEN
def test_auth(tmp_path, monkeypatch):
    """A token written to the config file is read back by `get_token`."""
    config_file = tmp_path / ".tiktok"
    # `Authorization.get_token` prefers the MS_TOKEN environment variable over
    # the config file, so clear it to make sure the file is what gets read.
    monkeypatch.delenv("MS_TOKEN", raising=False)
    auth = Authorization(config_file=config_file)
    auth.dump_token(ms_token=MS_TOKEN)
    auth.get_token()
    assert auth.ms_token == MS_TOKEN

15
tests/base.py Normal file
View File

@@ -0,0 +1,15 @@
from tiktok_hashtag_analysis.base import TikTokDownloader, load_hashtags_from_file
def test_scrape(tmp_path, hashtags):
    """Run every operation (scrape, plot, table, download) for one hashtag."""
    first_hashtag_only = hashtags[:1]
    scraper = TikTokDownloader(hashtags=first_hashtag_only, data_dir=tmp_path)
    scraper.run(download=True, plot=True, table=True, number=20)
def test_load_hashtags_from_file(tmp_path, hashtags):
    """Hashtags written one per line are read back unchanged."""
    file = tmp_path / "hashtags.txt"
    file.write_text("\n".join(hashtags), encoding="utf-8")
    assert load_hashtags_from_file(file=file) == hashtags

31
tests/cli.py Normal file
View File

@@ -0,0 +1,31 @@
import pytest
from tiktok_hashtag_analysis.cli import create_parser
ARGUMENTS = [
("file", "hashtags.txt", "--file"),
("download", True, "--download"),
("download", True, "-d"),
("number", 20, "--number"),
("plot", True, "--plot"),
("plot", True, "-p"),
("table", True, "--table"),
("table", True, "-t"),
("output_dir", "/tmp/tiktok_download", "--output-dir"),
("config", "~/.tiktok", "--config"),
("log", "../logfile.log", "--log"),
]
@pytest.mark.parametrize("attribute,value,flag", ARGUMENTS)
def test_parser(hashtags, attribute, value, flag):
    """Each command-line flag populates the expected namespace attribute."""
    # Boolean switches take no operand; any other flag is followed by its value.
    operand = [] if isinstance(value, bool) else [str(value)]
    parsed = vars(create_parser().parse_args([*hashtags, flag, *operand]))
    assert parsed.get(attribute) == value
    assert parsed.get("hashtags") == hashtags

11
tests/conftest.py Normal file
View File

@@ -0,0 +1,11 @@
import os
import tempfile
import pytest
TEST_HASHTAGS = ["embraceeuropa", "francisparkeryockey"]
@pytest.fixture(scope="package")
def hashtags():
    """Provide the list of hashtags shared by the tests in this package."""
    shared_hashtags = TEST_HASHTAGS
    return shared_hashtags

View File

@@ -0,0 +1 @@
from .base import TikTokDownloader

View File

@@ -1,76 +0,0 @@
import logging, argparse
from .file_methods import log_writer
from .run_downloader import * # Import everything from run_downloader.py
from .hashtag_frequencies import * # Import everything from hashtag_frequencies.py
logger = logging.getLogger()
def create_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the `download` and `frequencies` commands."""
    cli = argparse.ArgumentParser(description="Analyze hashtags within posts scraped from TikTok.")
    # One (flags, options) entry per argument, in the order they appear in --help.
    argument_specs = [
        (("command",), dict(help="command to initialize", choices=['download', 'frequencies'])),
        (("-t",), dict(type=str, nargs="*", help="List of hashtags to scrape (module: run_downloader)")),
        (("-f",), dict(type=str, help="File name containing list of hashtags to scrape (module: run_downloader)")),
        (("-p",), dict(action="store_true", help="Download post data (module: run_downloader)")),
        (("-v",), dict(action="store_true", help="Download video files (module: run_downloader)")),
        (
            ("-ht", "--hashtag"),
            dict(type=str, help="The hashtag of scraped posts to analyze (module: hashtag_frequencies)"),
        ),
        (
            ("-n", "--number"),
            dict(type=int, help="The number of top n occurrences (module: hashtag_frequencies)"),
        ),
        (
            ("-plt", "--plot"),
            dict(help="Plot the occurrences (module: hashtag_frequencies)", action="store_true"),
        ),
        (
            ("-d", "--print"),
            dict(help="List top n hashtags (module: hashtag_frequencies)", action="store_true"),
        ),
    ]
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)
    return cli
def main():
    """Dispatch the `download` or `frequencies` command from the command line.

    `download` scrapes posts and/or videos for the hashtags given with `-t`
    or `-f`. `frequencies` analyzes the top `--number` hashtags for the
    hashtag given with `--hashtag` and either plots or prints them.

    Raises:
        ValueError: If no hashtags could be determined, or `--number` is
            missing or smaller than one.
        FileNotFoundError: If no scraped post file exists for `--hashtag`.
    """
    parser = create_parser()
    args = parser.parse_args()

    if args.command == "download":
        if not (args.t or args.f):
            parser.error(
                "No hashtags were given, please use either the `-t` flag or the `-f` flag to specify one or more hashtags.")
        if not (args.p or args.v):
            parser.error(
                "No argument given, please specify either the `-p` flag to download post data or the `-v` flag to download video files, or both."
            )
        if args.t:
            hashtags = args.t
        elif args.f:
            file_name = args.f
            hashtags = get_hashtag_list(file_name)
        logger.info(f"Hashtags to scrape: {hashtags}")
        if not hashtags:
            raise ValueError(
                "No hashtags were specified: please use either the `-t` flag to specify a sspace-separated list of one or more hashtags as a command-line argument, or use the `-f` flag to specify a text file of newline-separated hashtags.")
        download_data_type = {"posts": args.p, "videos": args.v}
        scraped_summary_list = get_data(hashtags, download_data_type)
        if scraped_summary_list:
            log_writer(scraped_summary_list)

    elif args.command == "frequencies":
        img_folder = IMAGES
        check_file(img_folder, "dir")
        if not args.hashtag:
            parser.error("No hashtag was given, please use the `--hashtag` flag to specify the hashtag to analyze.")
        # The parser stores `-n/--number` under `number`; there is no `n`
        # attribute. It is None when the flag is omitted.
        if args.number is None or args.number < 1:
            raise ValueError(
                f"Specified argument `n` (the number of hashtags to analyze) must be greater than zero, not: {args.number}.")
        input_file = os.path.join(
            FILES["data"], args.hashtag, FILES["posts"], FILES["data_file"]
        )
        if not check_existence(input_file, "file"):
            raise FileNotFoundError(
                f"File ({input_file}) for specified argument `hashtag` ({args.hashtag}) does not exist.")
        occs = get_occurrences(input_file, args.number)
        if args.plot:
            plot(occs, img_folder)
        else:
            print_occurrences(occs)
if __name__=="__main__":
main()

View File

@@ -0,0 +1,72 @@
import os
import configparser
from pathlib import Path
import logging
from typing import Optional
class Authorization:
    """Handle authorization for TikTok, using the `msToken`.

    The token is looked up, in order, in the ``MS_TOKEN`` environment
    variable, in a config file, and finally by prompting the user.
    """

    def __init__(self, config_file: Optional[str] = None):
        """Remember where the token is stored.

        Args:
            config_file: Path of the config file; ``~/.tiktok`` when omitted.
        """
        if config_file:
            self.config_file = Path(config_file)
        else:
            self.config_file = Path.home() / ".tiktok"
        self.section = "TikTok"
        self.ms_token = None

    def get_token(self) -> str:
        """Load the "msToken" cookie taken from TikTok, which the scraper requires.

        Returns:
            The token, which is also stored on ``self.ms_token``.
        """
        # Step 1: check if MS_TOKEN is defined as environment variable
        if ms_token := os.environ.get("MS_TOKEN"):
            self.ms_token = ms_token
            logging.info("Loaded token from environment variable")

        # Step 2: check if MS_TOKEN is defined in config file. A config file
        # without a token must fall through to the prompt instead of
        # silently returning None.
        elif self.config_file.is_file() and (ms_token := self.load_token()):
            self.ms_token = ms_token
            logging.info(f"Loaded token from config file: {self.config_file}")

        # Step 3: have user enter MS_TOKEN via terminal
        else:
            ms_token = self.input_token()
            self.dump_token(ms_token=ms_token)
            self.ms_token = ms_token
            logging.info(
                f"Loaded token from user input and saved to config file: {self.config_file}"
            )

        return self.ms_token

    def load_token(self) -> Optional[str]:
        """Parse a config file and extract the token.

        Returns:
            The stored token, or None when the section or option is missing.
        """
        # Interpolation is disabled so that a token containing "%" is read verbatim.
        config = configparser.ConfigParser(interpolation=None)
        config.read(self.config_file, encoding="utf-8")
        return config.get(section=self.section, option="MS_TOKEN", fallback=None)

    def dump_token(self, ms_token: str):
        """Write the token to a config file, keeping its other contents."""
        config = configparser.ConfigParser(interpolation=None)
        config.read(self.config_file, encoding="utf-8")
        # The section already exists when the file is being updated;
        # adding it again would raise DuplicateSectionError.
        if not config.has_section(self.section):
            config.add_section(self.section)
        config.set(section=self.section, option="MS_TOKEN", value=ms_token)
        with open(self.config_file, "w", encoding="utf-8") as f:
            config.write(f)

    def input_token(self) -> str:
        """Allow user to manually enter the token in the terminal."""
        print(
            "\nPlease copy and paste your `msToken` cookie taken from your web browser when visiting the TikTok website. See [THIS VIDEO] for more information.\n"
        )
        # Pasted values often carry stray surrounding whitespace.
        ms_token = input("msToken: ").strip()
        return ms_token

View File

@@ -0,0 +1,267 @@
import os
import json
from pathlib import Path
from collections import Counter
from datetime import datetime
import warnings
import asyncio
import logging
import re
from typing import List, Dict
import yt_dlp
import requests
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
from TikTokApi import TikTokApi
from .auth import Authorization
warnings.filterwarnings("ignore", message="Glyph (.*) missing from current font")
sns.set_theme(style="darkgrid")
def process_hashtag_list(hashtags: List[str]) -> List[str]:
    """Normalize hashtags: trim whitespace, drop surrounding ``#`` characters,
    lowercase, and discard entries that end up empty."""
    normalized = []
    for raw in hashtags:
        cleaned = raw.strip().strip("#").lower()
        if cleaned:
            normalized.append(cleaned)
    return normalized
def load_hashtags_from_file(file: str) -> List[str]:
    """Load hashtags from a text file in which they are separated by newlines
    or commas, and return them in normalized form.

    Raises:
        OSError: If `file` is not an existing file.
    """
    if os.path.isfile(file):
        with open(file, "r", encoding="utf-8") as handle:
            contents = handle.read()
        return process_hashtag_list(hashtags=re.split(r"\n|,", contents))
    raise OSError(f"{file} does not exist")
async def _fetch_hashtag_data(hashtag: str, ms_token: str) -> List[Dict]:
    """Asynchronously collect the raw dict of every video the API yields for
    `hashtag`, using a single session created from `ms_token`."""
    async with TikTokApi() as api:
        await api.create_sessions(ms_tokens=[ms_token], num_sessions=1, sleep_after=3)
        videos = api.hashtag(name=hashtag).videos(count=1000)
        collected = [video.as_dict async for video in videos]
    return collected
def json_load(file_path: Path) -> List:
    """Return the parsed contents of the UTF-8 JSON file at `file_path`."""
    with open(file_path, "r", encoding="utf-8") as handle:
        return json.load(handle)
def json_dump(file_path: Path, data: List):
    """Serialize `data` as JSON into `file_path`, replacing any previous contents."""
    with open(file_path, "w", encoding="utf-8") as handle:
        json.dump(data, handle)
def download_gallery(video_data: Dict, video_dir: Path):
    """Save the audio track and images of an image-gallery post into `video_dir`.

    yt-dlp doesn't seem to support downloading images from an image gallery,
    so this is a quick fix that likely will fail on edge cases. Files are named
    `<post id>.mp3` and `<post id>_<NN>.<ext>`, where the image extension comes
    from the response's Content-Type header.
    """
    post_id = video_data["id"]

    audio_url = video_data["music"]["playUrl"]
    if audio_url:
        audio_response = requests.get(audio_url)
        with open(video_dir / f"{post_id}.mp3", "wb") as audio_file:
            audio_file.write(audio_response.content)

    gallery_images = video_data["imagePost"]["images"]
    for index, gallery_image in enumerate(gallery_images):
        # Only the first URL listed for each image is fetched.
        first_url = gallery_image["imageURL"]["urlList"][0]
        image_response = requests.get(first_url)
        extension = image_response.headers["Content-Type"].split("/")[-1]
        target = video_dir / f"{post_id}_{index:02d}.{extension}"
        with open(target, "wb") as image_file:
            image_file.write(image_response.content)
def aggregate_cooccurring_hashtags(hashtag_file: Path) -> Counter:
    """Count, for each hashtag, the number of posts in `hashtag_file` using it.

    `hashtag_file` holds a list of raw TikTok post API responses. A hashtag
    repeated within one post is counted once for that post.
    """
    counts: Counter = Counter()
    for post in json_load(file_path=hashtag_file):
        names_in_post = set(
            entry["hashtagName"]
            for entry in post.get("textExtra", [])
            if entry.get("hashtagName")
        )
        counts.update(names_in_post)
    return counts
class TikTokDownloader:
    """Main class for scraping data from TikTok.

    For every hashtag, post data is stored in `<data_dir>/<hashtag>/posts.json`,
    media in `<data_dir>/<hashtag>/media/` and plots in
    `<data_dir>/<hashtag>/plots/`.
    """

    def __init__(self, hashtags: List[str], data_dir: str, config_file: str = None):
        """Normalize the hashtags, create the data directory and load the token.

        Args:
            hashtags: Hashtags to scrape; they are normalized before use.
            data_dir: Directory that scraped data is written to.
            config_file: Optional path of the file holding the TikTok token.
        """
        self.hashtags = process_hashtag_list(hashtags)
        # Log the normalized list, since that is what is actually scraped.
        logging.info(f"Hashtags to scrape: {self.hashtags}")

        self.data_dir = Path(data_dir)
        os.makedirs(self.data_dir, exist_ok=True)

        self.auth = Authorization(config_file=config_file)
        self.ms_token = self.auth.get_token()

    def get_hashtag_posts(self, hashtag: str):
        """Fetch data about posts that used a specified hashtag and merge with
        existing data, if it exists."""

        # Define file to store hashtags in and create parent directory
        hashtag_file = self.data_dir / hashtag / "posts.json"
        hashtag_file.parent.mkdir(exist_ok=True, parents=True)

        # If there are previously scraped posts, load them
        if hashtag_file.is_file():
            already_fetched_data = json_load(file_path=hashtag_file)
            already_fetched_ids = set(video["id"] for video in already_fetched_data)
        else:
            already_fetched_ids = set()
            already_fetched_data = []

        # Scrape posts that use the specified hashtag
        fetched_data = asyncio.run(
            _fetch_hashtag_data(hashtag=hashtag, ms_token=self.ms_token)
        )
        if len(fetched_data) == 0:
            logging.warning(f"No posts were found for the hashtag: {hashtag}")

        # Determine which newly scraped posts haven't been scraped before.
        # IDs are recorded as they are accepted, so a post that appears twice
        # in the same fetch is stored only once.
        seen_ids = set(already_fetched_ids)
        new_fetched_data = []
        for video in fetched_data:
            if video["id"] not in seen_ids:
                seen_ids.add(video["id"])
                new_fetched_data.append(video)
        if len(new_fetched_data) == 0:
            logging.warning(f"No new posts were found for the hashtag: {hashtag}")

        # Merge new and old data and write to file
        all_fetched_data = already_fetched_data + new_fetched_data
        json_dump(file_path=hashtag_file, data=all_fetched_data)
        logging.info(
            f"Scraped {len(new_fetched_data)} new posts containing the hashtag "
            f"'{hashtag}', with {len(already_fetched_data)} posts previously scraped"
        )

    def get_hashtag_videos(self, hashtag: str):
        """Download videos and other media corresponding to posts that used a
        specified hashtag.

        Posts whose ID already prefixes a file name in the media directory are
        skipped.
        """

        # Define file containing post data and directory to save videos to
        hashtag_file = self.data_dir / hashtag / "posts.json"
        video_dir = self.data_dir / hashtag / "media"
        video_dir.mkdir(exist_ok=True, parents=True)

        # Get list of post IDs that have previously had their media downloaded
        already_downloaded_ids = set(
            file.split(".")[0].split("_")[0] for file in os.listdir(video_dir)
        )

        # Get list of posts that have been scraped but not had their media downloaded
        video_list = json_load(file_path=hashtag_file)
        new_video_list = [
            video for video in video_list if video["id"] not in already_downloaded_ids
        ]
        if len(new_video_list) == 0:
            logging.warning(
                f"No new videos to be downloaded for the hashtag: {hashtag}"
            )
            return

        # Populate list of URLs to download using yt-dlp, and list of image
        # galleries to download using the `download_gallery` function
        urls_to_download = []
        galleries_to_download = []
        for video in new_video_list:
            if video.get("imagePost") is None:
                url = f"https://www.tiktok.com/@{video['author']['uniqueId']}/video/{video['id']}"
                urls_to_download.append(url)
            else:
                galleries_to_download.append(video)

        # Download audio and image files for all image gallery posts
        if len(galleries_to_download) > 0:
            logging.info(f"Downloading image galleries for hashtag {hashtag}")
            for video in galleries_to_download:
                download_gallery(video_data=video, video_dir=video_dir)

        # Download video files for all video posts
        if len(urls_to_download) > 0:
            logging.info(f"Downloading media for hashtag {hashtag}")
            ydl_opts = {
                "outtmpl": os.path.join(video_dir, "%(id)s.%(ext)s"),
                # yt-dlp spells this option `ignoreerrors`; with it set, one
                # failed download no longer aborts the remaining URLs.
                "ignoreerrors": True,
            }
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download(urls_to_download)

    def frequency_table(self, hashtag: str, number: int):
        """Print `number`-most commonly co-occurring hashtags for a specified
        source hashtag, in tabular form."""

        # Load video data file and extract co-occurring hashtag frequency information
        hashtag_file = self.data_dir / hashtag / "posts.json"
        frequencies = aggregate_cooccurring_hashtags(hashtag_file=hashtag_file)
        if not frequencies:
            logging.warning(f"No hashtags found to tabulate for the hashtag: {hashtag}")
            return

        # Print table that displays most commonly co-occurring hashtags.
        # The highest count is used as the number of posts; presumably it
        # belongs to the source hashtag itself, which every scraped post
        # should contain.
        total_posts = max(frequencies.values())
        print(f"\nCo-occurring hashtags for #{hashtag} posts")
        print(f"{'Rank':<8} {'Hashtag':<30} {'Occurrences':<15} {'Frequency':<15}")
        for row, (name, frequency) in enumerate(frequencies.most_common(number)):
            ratio = frequency / total_posts
            print(f"{row:<8} {name:<30} {frequency:<15} {ratio:.4f}")
        print(f"Total posts: {total_posts}\n\n")

    def plot(self, hashtag: str, number: int):
        """Create plot of `number`-most commonly co-occurring hashtags for a
        specified source hashtag, saved as a PNG in the hashtag's `plots`
        directory."""

        # Load video data file and extract co-occurring hashtag frequency information
        hashtag_file = self.data_dir / hashtag / "posts.json"
        frequencies = aggregate_cooccurring_hashtags(hashtag_file=hashtag_file)

        # The most common entry is left out of the chart (presumably the
        # source hashtag itself), so at least two entries are needed.
        sorted_frequencies = frequencies.most_common(number)
        if len(sorted_frequencies) < 2:
            logging.warning(
                f"Not enough co-occurring hashtags to plot for the hashtag: {hashtag}"
            )
            return

        # Define labels and other fields used in plot
        total_posts = max(frequencies.values())
        labels = [label for label, _ in sorted_frequencies[1:]]
        ratios = [freq / total_posts * 100 for _, freq in sorted_frequencies[1:]]
        y_pos = list(reversed(range(len(sorted_frequencies) - 1)))

        # Visualize data in bar chart
        fig, ax = plt.subplots(figsize=(5, 6.66))
        ax.barh(y_pos, ratios)
        ax.set_yticks(y_pos)
        ax.set_yticklabels(labels)
        ax.grid(axis="y")
        ax.set_xlabel("Percent of posts with co-occurring hashtag")
        ax.set_ylim(min(y_pos) - 1, max(y_pos) + 1)
        ax.set_title(f"Co-occurring hashtags for #{hashtag} posts")
        ax.xaxis.set_major_formatter(mtick.PercentFormatter(decimals=0))

        # Write image of plot to file
        current_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        plot_file = self.data_dir / hashtag / "plots" / f"{hashtag}__{current_time}.png"
        plot_file.parent.mkdir(exist_ok=True, parents=True)
        try:
            fig.savefig(plot_file, bbox_inches="tight", facecolor="white", dpi=300)
        finally:
            # Close the figure so that plotting many hashtags does not
            # accumulate open figures.
            plt.close(fig)
        logging.info(f"Plot saved to file: {plot_file}")

    def run(self, download: bool, plot: bool, table: bool, number: int):
        """Execute the specified operations on all specified hashtags.

        Args:
            download: Whether to download media for the scraped posts.
            plot: Whether to save a plot of co-occurring hashtags.
            table: Whether to print a table of co-occurring hashtags.
            number: How many of the most common hashtags to analyze.
        """

        # Scrape all specified hashtags and perform analyses, depending on if
        # `--table` and `--plot` flags are used in the command
        for hashtag in self.hashtags:
            self.get_hashtag_posts(hashtag=hashtag)
            if plot:
                self.plot(hashtag=hashtag, number=number)
            if table:
                self.frequency_table(hashtag=hashtag, number=number)

        # Download media for all hashtags if `--download` flag is used in the command
        if download:
            for hashtag in self.hashtags:
                self.get_hashtag_videos(hashtag=hashtag)

View File

@@ -0,0 +1,102 @@
import logging
import argparse
from pathlib import Path
from .base import TikTokDownloader, load_hashtags_from_file
def create_parser():
    """Build the parser for the tool's command-line arguments."""
    cli = argparse.ArgumentParser(
        description="Analyze hashtags within posts scraped from TikTok."
    )
    default_output_dir = Path(".").resolve().parent / "data"

    # One (flags, options) entry per argument, in the order they appear in --help.
    argument_specs = [
        (("hashtags",), dict(type=str, nargs="*", help="List of hashtags to scrape")),
        (
            ("--file",),
            dict(type=str, help="File name containing list of hashtags to scrape"),
        ),
        (
            ("-d", "--download"),
            dict(
                action="store_true",
                help="Download video files corresponding to scraped posts",
            ),
        ),
        (
            ("--number",),
            dict(
                type=int,
                help="The number of co-occurring hashtags to analyze",
                default=20,
            ),
        ),
        (
            ("-p", "--plot"),
            dict(
                help="Plot the most common co-occurring hashtags",
                action="store_true",
            ),
        ),
        (
            ("-t", "--table"),
            dict(
                help="Print a table of the most common co-occurring hashtags",
                action="store_true",
            ),
        ),
        (
            ("--output-dir",),
            dict(
                type=str,
                help="Directory to save scraped data and visualizations to",
                default=default_output_dir,
            ),
        ),
        (
            ("--config",),
            dict(
                type=str,
                help="File name of configuration file to store TikTok credentials to",
                default=None,
            ),
        ),
        (("--log",), dict(type=str, help="File to write logs to", default=None)),
    ]
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)
    return cli
def main():
    """Command-line entry point: configure logging, work out which hashtags to
    scrape (positional arguments first, then `--file`), and run the downloader."""
    parser = create_parser()
    args = parser.parse_args()

    logging.basicConfig(
        filename=args.log,
        level=logging.INFO,
        datefmt="%Y-%m-%d %H:%M:%S",
        format="%(asctime)s %(levelname)s | %(message)s",
    )

    if args.hashtags:
        hashtags = args.hashtags
    elif args.file:
        hashtags = load_hashtags_from_file(file=args.file)
    else:
        # `parser.error` prints the message and exits.
        parser.error(
            "No hashtags were specified, please specify one or more hashtags "
            "to scrape or use the `--file` flag to specify a text file containing "
            "hashtags."
        )

    scraper = TikTokDownloader(
        hashtags=hashtags, data_dir=args.output_dir, config_file=args.config
    )
    scraper.run(
        download=args.download, plot=args.plot, table=args.table, number=args.number
    )
if __name__ == "__main__":
main()

View File

@@ -1,161 +0,0 @@
"""Utility functions that perform data processing related tasks.
"""
from typing import NamedTuple, List, Tuple, Set, Optional, Dict, Any
import logging
from . import file_methods
logger = logging.getLogger()
class Diff(NamedTuple):
    """Result of comparing freshly scraped IDs against the stored ones."""

    # IDs regarded as new
    ids: Set[str]
    # Flag read by callers to decide whether to filter the post list by `ids`
    filter_posts: bool
class Total(NamedTuple):
    """Pair of counts describing the IDs scraped for one hashtag."""

    # Number of IDs, duplicates included
    total: int
    # Number of distinct IDs
    unique: int
def get_difference(tag: str, file_name: str, ids: List[str]) -> Optional[Diff]:
    """Work out which of `ids` are not yet recorded for `tag` in `file_name`.

    Returns None when `tag` is recorded and none of `ids` are new. When `tag`
    is recorded, the result holds the new IDs with `filter_posts` False; when
    it is not recorded at all, the result holds every ID with `filter_posts`
    True.
    """
    stored = file_methods.get_data(file_name)
    if tag not in stored:
        return Diff(set(ids), True)

    unseen = set(ids).difference(set(stored[tag]))
    if not unseen:
        return None
    return Diff(unseen, False)
def extract_posts(
    settings: Dict[Any, Any], file_name: str, tag: str
) -> Optional[Tuple[List[str], List[Dict]]]:
    """Find TikTok posts that haven't previously been scraped.

    Compares the file downloaded by tiktok-scraper to the list of
    previously-scraped posts (from the file ids/post_ids.json).

    Returns:
        A `(new_ids, new_posts)` tuple, or None when the downloaded file
        contains no posts or none of them are new.
    """
    posts = file_methods.get_data(file_name)
    ids = [post["id"] for post in posts]
    if not ids:
        logger.warning(f"No posts were found for the hashtag: {tag}")
        return None

    # Without an ID file nothing has been recorded yet, so every post is new.
    if not file_methods.check_existence(settings["post_ids"], "file"):
        return (ids, posts)

    new_ids = get_difference(tag, settings["post_ids"], ids)
    if not new_ids:
        logger.warning(f"No new posts were found for the hashtag: {tag}")
        return None

    # Always restrict the posts to the new IDs so that both returned lists
    # describe the same posts. Previously the unfiltered post list was
    # returned whenever the hashtag already had stored IDs, handing back
    # posts that had been scraped before.
    new_posts = [post for post in posts if post["id"] in new_ids.ids]
    return (list(new_ids.ids), new_posts)
def extract_videos(settings: dict, tag: str, download_list: List[str]) -> List[str]:
    """Find TikTok videos that haven't previously been scraped.

    Compares the file downloaded by tiktok-scraper to the list of
    previously-scraped videos (from the file ids/video_ids.json).

    Returns:
        The new video IDs: all of `download_list` when no ID file exists yet,
        and an empty list when nothing is new.
    """
    if not file_methods.check_existence(settings["video_ids"], "file"):
        return download_list

    new_videos = get_difference(tag, settings["video_ids"], download_list)
    if not new_videos:
        # `Logger.warn` is deprecated in favor of `Logger.warning`.
        logger.warning(
            f"No new videos were found for the {tag} in the downloaded folder."
        )
        return []
    return list(new_videos.ids)
def update_posts(
    file_path: str, file_type: str, new_data: List[Any], tag: str = None
) -> Optional[Tuple[str, int]]:
    """Append newly scraped data to `file_path`.

    Without a `tag`, `new_data` is written with the post writer and None is
    returned. With a `tag`, it is written with the ID writer and that
    writer's result is returned.
    """
    already_exists = file_methods.check_existence(file_path, file_type)
    if tag:
        return file_methods.id_writer(file_path, new_data, tag, already_exists)
    file_methods.post_writer(file_path, new_data, already_exists)
    return None
def update_videos(
    settings: Dict[str, Any], new_data: List[str], tag: str
) -> Tuple[str, int]:
    """Record the IDs of newly scraped videos in the video ID file
    (`settings["video_ids"]`), then clean up the downloaded video files.

    Returns the result of the ID writer.
    """
    video_ids_path = settings["video_ids"]
    # Make sure the ID file exists before it is updated.
    file_methods.check_file(video_ids_path, "file")
    written = file_methods.id_writer(video_ids_path, new_data, tag, True)
    file_methods.clean_video_files(settings, tag, new_data)
    return written
def get_total_posts(file_path: str, tag: str) -> Total:
    """Return how many IDs, and how many distinct IDs, are stored for `tag`.

    Raises:
        OSError: If `file_path` is not an existing file.
    """
    if not file_methods.check_existence(file_path, "file"):
        raise OSError(f"{file_path} not found!")
    stored = file_methods.get_data(file_path)
    return Total(len(stored[tag]), len(set(stored[tag])))
def print_total(file_path: str, tag: str, data_type: str):
    """Log number of total and unique scraped posts, warn if any non-unique posts.

    Args:
        file_path: ID file to count entries in.
        tag: Hashtag whose entries are counted.
        data_type: Label used in the log message for the kind of entries.
    """
    total = get_total_posts(file_path, tag)
    if total.total == total.unique:
        logger.info(f"Scraped {total.total} {data_type} containing the hashtag '{tag}'")
    else:
        # `Logger.warn` is deprecated in favor of `Logger.warning`.
        logger.warning(
            f"Out of total {data_type} for the hashtag {tag} {total.total}, only {total.unique} are unique. Something is going wrong..."
        )

View File

@@ -1,216 +0,0 @@
"""Utility functions that operate on files, such as writing to or reading from a file.
"""
import os
import json
import subprocess
from os import path
from datetime import datetime
import shutil
from typing import Tuple, List, Optional, Dict, Any
import logging, logging.config
logging.config.fileConfig(path.join(path.dirname(path.abspath(__file__)), 'logging.config'))
logger = logging.getLogger("Logger")
def create_file(name: str, file_type: str):
    """Create an empty file (`file_type == "file"`) or a directory tree
    (`file_type == "dir"`) at `name`.

    Raises:
        ValueError: If `file_type` is neither "dir" nor "file".
    """
    if file_type == "dir":
        os.makedirs(name, mode=0o777)
        return
    if file_type != "file":
        raise ValueError(f"{file_type} has to be either 'dir' or 'file'")
    # Opening for writing is enough to create the (empty) file.
    open(name, "w").close()
def check_existence(file_path: str, file_type: str):
    """Report whether `file_path` exists as the given kind ("file" or "dir").

    Raises:
        ValueError: If `file_type` is neither "dir" nor "file".
    """
    checkers = {"file": os.path.isfile, "dir": os.path.isdir}
    if file_type not in checkers:
        raise ValueError(f"{file_type} has to be either 'dir' or 'file'")
    return checkers[file_type](file_path)
def check_file(file_path: str, file_type: str):
    """Ensure `file_path` exists, creating the file or directory when missing."""
    if check_existence(file_path, file_type):
        return
    create_file(file_path, file_type)
def download_posts(settings: Dict, tag: str, output_dir: Any):
    """Run the tiktok-scraper command to download posts for a given hashtag.

    Returns the path to the downloaded file of posts, taken from the last
    whitespace-separated token printed by tiktok-scraper. If that token does
    not look like a JSON file, logs a warning and returns None in order to
    move on.
    """
    posts_dir = os.path.join(settings["data"], tag, settings["posts"])
    os.makedirs(posts_dir, exist_ok=True)
    # NOTE(review): `tag` and `output_dir` are interpolated into a shell
    # command line unescaped; confirm they can never carry shell
    # metacharacters, or switch to an argument list.
    tiktok_command = f"tiktok-scraper hashtag {tag} -t 'json' --filepath {output_dir}"
    output = subprocess.check_output(tiktok_command, shell=True, encoding="utf-8")
    # An empty output has no last token; treat it like any other unexpected output.
    tokens = output.split()
    new_file = tokens[-1] if tokens else ""
    if "json" in new_file:
        return new_file
    # `Logger.warn` is deprecated in favor of `Logger.warning`.
    logger.warning(
        f"Something's wrong with what is returned by tiktok-scraper for the hashtag {tag} - *{new_file}* is not a json file.\n\ntiktok-scraper returned {output}"
    )
    return None
def download_videos(settings: Dict, tag: str):
    """Run the tiktok-scraper command to download videos for a given hashtag.

    Note that all the videos are downloaded that are returned by the TikTok API,
    making this a time- and data-intensive process.

    Returns the list of downloaded video IDs (file names up to the first dot)
    if the downloaded folder contains at least 1 video. Otherwise logs a
    warning, removes the `settings["videos_delete"]` directory and returns
    None.
    """
    videos_dir = os.path.join(settings["data"], tag, settings["videos"])
    os.makedirs(videos_dir, exist_ok=True)
    # NOTE(review): `tag` is interpolated into a shell command line
    # unescaped; confirm it can never carry shell metacharacters.
    tiktok_command = f"tiktok-scraper hashtag {tag} -d --filepath {videos_dir}"
    subprocess.check_output(tiktok_command, shell=True)
    downloaded_files = os.listdir(os.path.join(videos_dir, f"#{tag}"))
    if downloaded_files:
        return [file_name.split(".")[0] for file_name in downloaded_files]
    # `Logger.warn` is deprecated in favor of `Logger.warning`.
    logger.warning(f"No video files were downloaded for the hashtag {tag}.")
    shutil.rmtree(settings["videos_delete"])
    return None
def get_data(file_path: str) -> Any:
    """Return the parsed contents of the UTF-8 JSON file at `file_path`."""
    with open(file_path, "r", encoding="utf-8") as handle:
        return json.load(handle)
def dump_data(file_path: str, data: Any):
    """Serialize `data` as JSON into `file_path`, replacing previous content."""
    with open(file_path, mode="w", encoding="utf-8") as json_file:
        json.dump(data, fp=json_file)
def log_writer(log_data: List[Tuple[str, Tuple[str, int]]]):
    """Summarize the number of downloads (posts and videos) per hashtag.

    The entries of `log_data` are folded into a nested dictionary keyed by the
    current timestamp:

        {
            timestamp: {
                hashtag: {
                    videos: number_of_new_videos,
                    posts: number_of_new_posts
                }
            }
        }

    The dictionary is emitted through the module logger at debug level and the
    grand total at info level.
    """
    scraped_summary_dict: Dict[str, Dict[str, int]] = {}
    total = 0
    for hashtag, (data_type, count) in log_data:
        # One inner dict per hashtag, one running counter per data type.
        per_type = scraped_summary_dict.setdefault(hashtag, {})
        per_type[data_type] = per_type.get(data_type, 0) + count
        total += count

    timestamp = datetime.now().strftime("%d-%m-%Y %H:%M:%S")
    data = {timestamp: scraped_summary_dict}
    logger.debug(f"Logged post data: {data}")
    logger.info(f"Successfully scraped {total} total entries")
def id_writer(
    file_path: str, new_data: List[str], tag: str, status: bool
) -> Tuple[str, int]:
    """Store the new ids of `tag` in the post_ids or video_ids file.

    When `status` is true the ids already saved in `file_path` are loaded and
    the new ones are added under `tag`; a file that cannot be decoded as JSON
    is treated as empty. When `status` is false the file is overwritten with
    the ids of `tag` only.

    Returns the pair `(tag, number_of_new_ids)`.
    """
    total = len(new_data)

    ids_by_tag = {}
    if status:
        try:
            ids_by_tag = get_data(file_path)
        except json.decoder.JSONDecodeError:
            # Undecodable file: start again from an empty mapping.
            ids_by_tag = {}

    if tag in ids_by_tag:
        ids_by_tag[tag] += new_data
    else:
        ids_by_tag[tag] = new_data
    dump_data(file_path, ids_by_tag)

    logger.debug(f"SUCCESS - {total} entries added to {file_path}")
    return (tag, total)
def post_writer(file_path: str, new_data: List[Dict], status: bool):
    """Save the new posts in the post file of a hashtag
    (`/data/{hashtag}/posts/data.json`).

    When `status` is true the new posts are added to the ones already stored
    in `file_path`; if that file cannot be decoded as JSON, or `status` is
    false, the file is overwritten with the new posts only.
    """
    total = len(new_data)

    posts = new_data
    if status:
        try:
            posts = get_data(file_path)
            posts += new_data
        except json.decoder.JSONDecodeError:
            # Undecodable file: keep only the freshly scraped posts.
            posts = new_data
    dump_data(file_path, posts)

    logger.debug(f"SUCCESS - {total} entries added to {file_path}")
def delete_file(file_path: str, file_type: str):
    """Delete a file or an empty directory.

    Args:
        file_path: Path of the file or directory to delete.
        file_type: Either "file" (removed with `os.remove`) or "dir" (removed
            with `os.rmdir`, which only succeeds on an empty directory).

    Raises:
        OSError: If `check_existence` reports that the path does not exist,
            or if `file_type` is neither "file" nor "dir".
    """
    if not check_existence(file_path, file_type):
        raise OSError(f"Attempt to delete file failed: {file_path} does not exist")

    if file_type == "file":
        os.remove(file_path)
    elif file_type == "dir":
        os.rmdir(file_path)
    else:
        # The f-prefix is required so the offending value shows in the message.
        raise OSError(f"{file_type} needs to be either 'file' or 'dir'")
    logger.debug(f"Successfully deleted {file_path}")
def clean_video_files(settings: dict, tag: str, new_data: Optional[List[str]] = None):
    """Move new videos out of the tiktok-scraper folder, then remove that folder.

    Each id in `new_data` is moved from `{data}/{tag}/videos/#{tag}/{id}.mp4`
    to `settings["videos_to"]`; the last source path handled is left in
    `settings["videos_from"]`. The residual folder `settings["videos_delete"]`
    is deleted afterwards, whether or not any video was moved.
    """
    for video_id in new_data or []:
        source = settings["data"] + f"/{tag}/videos/#{tag}/{video_id}.mp4"
        settings["videos_from"] = source
        shutil.move(source, settings["videos_to"])

    residual_dir = settings["videos_delete"]
    shutil.rmtree(residual_dir)
    logger.debug(f"Successfully deleted the folder {residual_dir} folder of videos.")

View File

@@ -1,32 +0,0 @@
"""Global constants: layout of the data directory, file names and scraper options."""

# Directory names (DATA is relative to the directory the scripts are run from)
DATA = "../data"
IDS = "ids"
POSTS = "posts"
VIDEOS = "videos"
IMAGES = DATA + "/img"

# File names
POST_IDS = "post_ids.json"
VIDEO_IDS = "video_ids.json"
DATA_FILE = "data.json"

# Paths and names looked up by key when the download settings are assembled
FILES = dict(
    data=DATA,
    ids=IDS,
    posts=POSTS,
    videos=VIDEOS,
    images=IMAGES,
    post_ids="/".join((DATA, IDS, POST_IDS)),
    video_ids="/".join((DATA, IDS, VIDEO_IDS)),
    data_file=DATA_FILE,
    downloads=[],
)

# Scraper options
PARAMETERS = dict(scraper_attempts=3, sleep=8)

View File

@@ -1,99 +0,0 @@
"""Analyze the frequency of hashtags appearing in the set of given posts.
- The "hashtag" positional argument specifies the hashtag of scraped posts to analyze
- The "n" positional argument specifies how many hashtags the user wants to analyze
- Specifying the "-d" flag prints the hashtag frequencies on the shell
- Specifying the "-p" flag plots the hashtag frequencies and saves as a png file
"""
import json
from datetime import datetime
import warnings
import logging
from typing import List, Tuple, Dict, Any
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
# Suppress warnings whose message reports a glyph missing from the current
# font (presumably raised while rendering hashtags with unsupported characters).
warnings.filterwarnings("ignore", message="Glyph (.*) missing from current font")
# Apply seaborn's "darkgrid" theme at import time.
sns.set_theme(style="darkgrid")
def get_hashtags(obj: List[Dict]) -> List[Tuple[str, int]]:
    """Count in how many posts each hashtag appears.

    Args:
        obj: Posts, each a mapping whose "hashtags" entry is a list of
            mappings with a "name" key.

    Returns:
        `(hashtag, number_of_posts)` pairs sorted by decreasing count. A
        hashtag repeated within one post is counted once for that post.

    Raises:
        ValueError: If `obj` is empty.
    """
    if not obj:
        raise ValueError("Empty item, no hashtags could be extracted.")

    hashtags = {}
    for post in obj:
        # A set removes duplicates so each post contributes at most one count.
        for name in {tag["name"] for tag in post["hashtags"]}:
            hashtags[name] = hashtags.get(name, 0) + 1
    return sorted(hashtags.items(), key=lambda item: item[1], reverse=True)
def get_occurrences(filename: str, n: int = 1) -> Dict[str, Any]:
    """Aggregate hashtag frequency information for a specified JSON file.

    Args:
        filename: Path of a JSON file holding the scraped posts.
        n: Number of top hashtags to report.

    Returns:
        A dictionary of the form

            {
                "total": total posts in the file,
                "top_n": [[top n hashtags], [frequencies of corresponding hashtags]]
            }
    """
    # Read explicitly as UTF-8, the encoding the JSON file helpers use, rather
    # than relying on the platform default.
    with open(filename, encoding="utf-8") as f:
        posts = json.load(f)

    total_posts = len(posts)
    tags = get_hashtags(posts)
    # NOTE(review): the cut-off is capped by the number of posts, not by the
    # number of distinct hashtags -- kept as is, confirm this is intended.
    top = tags[0 : min(total_posts, n)]
    names = [name for name, _ in top]
    counts = [count for _, count in top]
    return {"total": total_posts, "top_n": [names, counts]}
def plot(occs: dict, img_folder: str):
    """Draw the common hashtags as a horizontal bar chart and save it to file.

    The first entry of `occs["top_n"]` provides the hashtag named in the title
    and the reference count; every other hashtag is drawn as a percentage of
    that count. The figure is written through `save_plot`.
    """
    tag_names = occs["top_n"][0]
    tag_counts = occs["top_n"][1]

    # One bar per hashtag after the first, laid out from top to bottom.
    bar_positions = list(range(len(tag_names) - 2, -1, -1))
    reference_count = tag_counts[0]
    percentages = [count / reference_count * 100 for count in tag_counts[1:]]
    bar_labels = tag_names[1:]
    hashtag = tag_names[0]

    _, ax = plt.subplots(figsize=(5, 6.66))
    ax.barh(bar_positions, percentages)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(bar_labels)
    ax.grid(axis="y")
    ax.set_xlabel("Percent of posts with common hashtag")
    ax.set_ylim(min(bar_positions) - 1, max(bar_positions) + 1)
    ax.set_title(f"Common hashtags for #{hashtag} posts")
    ax.xaxis.set_major_formatter(mtick.PercentFormatter(decimals=0))

    save_plot(img_folder, hashtag)
def save_plot(img_folder, hashtag):
    """Save the current figure as a png file inside `img_folder`.

    The file is named `{hashtag}_{timestamp}.png`, where the timestamp has the
    form `YYYY_MM_DD_HH_MM_SS`.
    """
    current_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    filename = f"{img_folder}/{hashtag}_{current_time}.png"
    plt.savefig(filename, bbox_inches="tight", facecolor="white", dpi=300)
    # Report success only once the file has actually been written.
    logging.info(f"Plot saved to file: {filename}")
def print_occurrences(occs):
    """Print a table with the top n hashtags, their counts and their frequencies.

    Ranks start at 0 and the frequency is the count divided by
    `occs["total"]`; a final line states the total number of posts.
    """
    total_posts = occs["total"]
    hashtags, counts = occs["top_n"][0], occs["top_n"][1]

    print(f"{'Rank':<8} {'Hashtag':<30} {'Occurrences':<15} {'Frequency':<15}")
    for rank, (hashtag, count) in enumerate(zip(hashtags, counts)):
        print(f"{rank:<8} {hashtag:<30} {count:<15} {count / total_posts:.4f}")
    print(f"Total posts: {total_posts}")

View File

@@ -1,5 +0,0 @@
# Enter a hashtag per line. Each line should contain only one word.
london
paris
tokyo
newyork

View File

@@ -1,36 +0,0 @@
[loggers]
keys=root,Logger
[handlers]
keys=consoleHandler,fileHandler
[formatters]
keys=consoleFormatter,fileFormatter
[logger_root]
level=DEBUG
handlers=consoleHandler
[logger_Logger]
level=DEBUG
handlers=consoleHandler,fileHandler
qualname=Logger
propagate=0
[handler_consoleHandler]
class=StreamHandler
level=INFO
formatter=consoleFormatter
args=(sys.stdout,)
[handler_fileHandler]
class=FileHandler
level=DEBUG
formatter=fileFormatter
args=("../logfile.log",)
[formatter_consoleFormatter]
format=%(message)s
[formatter_fileFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s

View File

@@ -1,150 +0,0 @@
"""Download post data or videos from TikToks containing one or more specified hashtags.
- The "-p" flag specifies that only data from posts is downloaded, no video files
- The "-v" flag specifies that only video files are downloaded, no post data
- Specifying both "-p" and "-v" flags downloads both post data and video files
- The "-t" flag allows the user to specify a list of space-separated hashtags as an argument
- The "-f" flag allows the user to specify the filename of a text file containing a list of newline-separated hashtags as an argument
"""
import os
import time
from typing import List, Tuple, Dict, Any, Optional
from tempfile import TemporaryDirectory
from tiktok_hashtag_analysis import global_data
import tiktok_hashtag_analysis.file_methods as file_methods
from tiktok_hashtag_analysis import data_methods
def get_hashtag_list(file_name: str) -> List[str]:
    """Extract the list of newline-separated hashtags from a text file.

    Blank lines and comment lines (first non-blank character is "#") are
    skipped; surrounding whitespace is removed from every hashtag.

    Raises:
        OSError: If `file_name` does not exist.
    """
    if not file_methods.check_existence(file_name, "file"):
        raise OSError(f"{file_name} does not exist")

    # Read as UTF-8 so non-ASCII hashtags do not depend on the platform's
    # default encoding.
    with open(file_name, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        # Strip before testing for "#" so indented comment lines are skipped too.
        return [
            line for line in stripped_lines if line and not line.startswith("#")
        ]
def set_download_settings(download_data_type: Dict[str, bool]) -> Dict[str, Any]:
    """Build the `settings` dict from the constants of the global_data module.

    The data/ids directory is checked through `file_methods.check_file`. The
    post-related entries are added only when `download_data_type["posts"]` is
    true, the video-related ones only when `download_data_type["videos"]` is.
    """
    files = global_data.FILES
    parameters = global_data.PARAMETERS

    settings = {
        "data": files["data"],
        "ids": files["ids"],
        "sleep": parameters["sleep"],
        "scraper": parameters["scraper_attempts"],
    }
    file_methods.check_file(f"{settings['data']}/{settings['ids']}", "dir")

    if download_data_type["posts"]:
        for key in ("posts", "post_ids", "data_file"):
            settings[key] = files[key]
    if download_data_type["videos"]:
        for key in ("videos", "video_ids"):
            settings[key] = files[key]
    return settings
def get_posts(settings: dict, tag: str) -> Optional[Tuple[str, int]]:
    """Scrape trending TikTok post data for the specified hashtag.

    1. `file_methods.download_posts` scrapes the post data into a temporary
       directory.
    2. `data_methods.extract_posts` determines which posts, if any, have not
       been downloaded before.
    3. `data_methods.update_posts` is called twice: first to update the data
       file of the hashtag, then to update the post ID list.

    Returns the result of the ID-list update, or None when nothing was
    downloaded or nothing new was found.
    """
    with TemporaryDirectory() as temp_dir:
        file_path = file_methods.download_posts(settings, tag, temp_dir)
        if not file_path:
            return None

        new_data = data_methods.extract_posts(settings, file_path, tag)
        if not new_data:
            return None

        data_file = os.path.join(
            settings["data"], tag, settings["posts"], settings["data_file"]
        )
        # presumably new_data is (new ids, new posts) -- verify in data_methods
        data_methods.update_posts(data_file, "file", new_data[1])
        return data_methods.update_posts(
            settings["post_ids"], "file", new_data[0], tag
        )
def get_videos(settings: dict, tag: str) -> Optional[Tuple[str, int]]:
    """Scrape trending TikTok video files for the specified hashtag.

    1. `file_methods.download_videos` downloads the video files.
    2. `data_methods.extract_videos` determines which videos, if any, have not
       been downloaded before.
    3. `data_methods.update_videos` updates the ID list with the IDs of the
       newly downloaded videos.
    4. When no video is new, `file_methods.clean_video_files` deletes the
       residual video folder instead.

    Returns the result of `update_videos`, or None when nothing was downloaded
    or nothing new was found.
    """
    download_list = file_methods.download_videos(settings, tag)
    if not download_list:
        return None

    new_data = data_methods.extract_videos(settings, tag, download_list)
    if not new_data:
        file_methods.clean_video_files(settings, tag)
        return None

    return data_methods.update_videos(settings, new_data, tag)
def get_data(
    hashtags: list, download_data_type: Dict[str, bool]
) -> List[Tuple[str, Tuple[str, int]]]:
    """Scrape posts and/or videos for every specified hashtag.

    Args:
        hashtags: Hashtags to scrape.
        download_data_type: Flags under the keys "posts" and "videos" that
            select what is downloaded.

    Returns:
        One `(name, (data_type, count))` entry per hashtag and data type for
        which the scrape returned a result, where `data_type` is "posts" or
        "videos".
    """
    scraped_summary_list = []
    last_index = len(hashtags) - 1

    # Each data type walks the hashtag list with its own loop. Sharing one
    # counter between both loops left the video loop with nothing to iterate
    # whenever posts had been processed first.
    if download_data_type["posts"]:
        settings = set_download_settings(download_data_type)
        for index, tag in enumerate(hashtags):
            posts_dir = os.path.join(settings["data"], tag, settings["posts"])
            file_methods.check_file(posts_dir, "dir")
            file_methods.check_file(
                os.path.join(posts_dir, settings["data_file"]), "file"
            )
            res = get_posts(settings, tag)
            if res:
                scraped_summary_list.append((res[0], ("posts", res[1])))
            data_methods.print_total(settings["post_ids"], tag, "posts")
            # Pause between consecutive hashtags, but not after the last one.
            if index < last_index:
                time.sleep(settings["sleep"])

    if download_data_type["videos"]:
        settings = set_download_settings(download_data_type)
        for index, tag in enumerate(hashtags):
            file_methods.check_file(
                os.path.join(settings["data"], tag, settings["videos"]), "dir"
            )
            settings["videos_delete"] = settings["data"] + f"/{tag}/videos/#{tag}"
            settings["videos_to"] = settings["data"] + f"/{tag}/videos"
            _res = get_videos(settings, tag)
            if _res:
                scraped_summary_list.append((_res[0], ("videos", _res[1])))
            data_methods.print_total(settings["video_ids"], tag, "videos")
            # Pause between consecutive hashtags, but not after the last one.
            if index < last_index:
                time.sleep(settings["sleep"])

    return scraped_summary_list

View File

@@ -1,12 +1,11 @@
_MAJOR = "1"
_MAJOR = "2"
_MINOR = "0"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "4"
_PATCH = "0"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""
VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
__version__ = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX)
VERSION = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX)