updated README and added authorization

This commit is contained in:
Tristan Lee
2023-09-01 18:33:32 -05:00
parent a7bd023c21
commit cf575e6cf6
5 changed files with 107 additions and 40 deletions

2
.gitignore vendored
View File

@@ -1,5 +1,7 @@
# Data directory
data/
build/
*.egg-info/
# Miscellaneous files
**/.DS_Store

View File

@@ -59,40 +59,38 @@ The `data` folder contains all the downloaded data as shown in the tree diagram
## How to use
### Post downloading
Running the `tiktok-hashtag-analysis` command with the following options will scrape posts containing the hashtags `#london`, `#paris`, or `#newyork`:
Running the `tiktok-hashtag-analysis` command with the following options will scrape posts that contain the hashtags `#london`, `#paris`, or `#newyork`:
tiktok-hashtag-analysis london paris newyork
and will produce an output similar to the following log:
$ tiktok-hashtag-analysis download -t london paris newyork -p
$ tiktok-hashtag-analysis london paris newyork
Hashtags to scrape: ['london', 'paris', 'newyork']
Scraped 963 posts containing the hashtag 'london'
Scraped 961 posts containing the hashtag 'paris'
Scraped 940 posts containing the hashtag 'newyork'
Successfully scraped 2864 total entries
- The `-t` flag allows a space-separated list of hashtags to be specified as a command line argument
- The `-p` flag specifies that posts, not videos, will be downloaded
- The list of hashtags to scrape is specified as a positional argument
### Video downloading
Running the `tiktok-hashtag-analysis download` script with the following options will scrape trending videos containing the hashtag `#london`:
`tiktok-hashtag-analysis download -t london -v`
Running the `tiktok-hashtag-analysis` script with the following options will scrape trending videos containing the hashtag `#london`:
`tiktok-hashtag-analysis london --download`
- The `-t` flag allows a space-separated list of hashtags to be specified as a command line argument
- The `-v` flag specifies that videos, not posts, will be downloaded
- The `--download` flag specifies that video files for scraped posts should be downloaded
Note that video downloading is a time and data rate consuming task, as a result we recommend using one hashtag at a time when using the `-v` flag to avoid complications.
Note that video downloading is time-consuming and bandwidth-intensive; as a result, we recommend using one hashtag at a time when using the `--download` flag to avoid complications.
## Analyzing results
### Top n hashtag occurrences
The script `tiktok-hashtag-analysis frequencies` analyzes the frequencies of top occurring hashtags in a given set of posts.
### Most common co-occurring hashtags
In addition to scraping data and downloading videos, the `tiktok-hashtag-analysis` script can also analyze the frequencies of the most common co-occurring hashtags in a given set of posts.
Assume we want to analyze the 20 most frequently occurring hashtags in the downloaded posts of the `#london` hashtag.
Assume we want to analyze the 20 most frequently co-occurring hashtags in the downloaded posts of the `#london` hashtag.
- The results can be plotted and saved as a PNG file by executing the following command:
`tiktok-hashtag-analysis frequencies --hashtag london --number 20 --plot`
`tiktok-hashtag-analysis london --number 20 --plot`
which will produce a figure similar to that shown below:
<p align="center">
@@ -103,32 +101,33 @@ Assume we want to analyze the 20 most frequently occurring hashtags in the downl
- The results can be displayed in tabular form by executing the following command:
`tiktok-hashtag-analysis frequencies --hashtag london --number 20 --print`
`tiktok-hashtag-analysis london --number 20 --table`
which will produce a terminal output similar to the following:
```
Rank Hashtag Occurrences Frequency
0 london 960 1.0000
1 fyp 494 0.5146
2 uk 238 0.2479
3 foryou 221 0.2302
4 foryoupage 184 0.1917
5 viral 179 0.1865
6 fypシ 84 0.0875
7 funny 56 0.0583
8 xyzbca 51 0.0531
9 british 45 0.0469
10 england 44 0.0458
11 trending 40 0.0417
12 fy 33 0.0344
13 comedy 32 0.0333
14 roadman 28 0.0292
15 4u 27 0.0281
16 usa 26 0.0271
17 tiktok 26 0.0271
18 travel 21 0.0219
19 america 20 0.0208
Total posts: 960
Co-occurring hashtags for #london posts
Rank Hashtag Occurrences Frequency
0 london 881 1.0000
1 fyp 399 0.4529
2 uk 174 0.1975
3 foryou 168 0.1907
4 viral 152 0.1725
5 foryoupage 137 0.1555
6 fypシ 73 0.0829
7 funny 54 0.0613
8 tiktok 43 0.0488
9 trending 43 0.0488
10 british 41 0.0465
11 england 38 0.0431
12 xyzbca 34 0.0386
13 fy 33 0.0375
14 usa 33 0.0375
15 love 29 0.0329
16 comedy 25 0.0284
17 royalfamily 23 0.0261
18 queen 23 0.0261
19 queenelizabeth 22 0.0250
Total posts: 881
```
The `Frequency` column shows the ratio of each hashtag's number of occurrences to the total number of downloaded posts.

View File

@@ -1,7 +1,6 @@
import logging
import argparse
from pathlib import Path
import sys
from .base import TikTokDownloader, load_hashtags_from_file

View File

@@ -0,0 +1,67 @@
import os
import configparser
from pathlib import Path
import logging
class Authorization:
    """Handle authorization for TikTok, using the `msToken`.

    The token is looked up, in order, from the ``MS_TOKEN`` environment
    variable, from the config file in the user's home directory, and finally
    from an interactive prompt (whose answer is saved to the config file).
    """

    def __init__(self):
        # INI-style config file that stores the token between runs
        self.config_file = Path.home() / ".tiktok"
        # Name of the config file section that holds the `MS_TOKEN` option
        self.section = "TikTok"
        # Populated by `get_token`
        self.ms_token = None

    def get_token(self):
        """Load the "msToken" cookie taken from TikTok, which the scraper requires.

        Each source is tried in turn and the first non-empty token wins; a
        source that exists but holds no token falls through to the next one
        instead of leaving the token unset.

        Returns:
            The token, which is also stored in `self.ms_token`.
        """
        # Step 1: check if MS_TOKEN is defined as environment variable
        if ms_token := os.environ.get("MS_TOKEN"):
            self.ms_token = ms_token
            logging.info("Loaded token from environment variable")
            return self.ms_token

        # Step 2: check if MS_TOKEN is defined in config file
        if self.config_file.is_file():
            if ms_token := self.load_token():
                self.ms_token = ms_token
                logging.info(f"Loaded token from config file: {self.config_file}")
                return self.ms_token

        # Step 3: have user enter MS_TOKEN via terminal. This is also reached
        # when the config file exists but contains no (or an empty) token.
        ms_token = self.input_token()
        self.dump_token(ms_token=ms_token)
        self.ms_token = ms_token
        logging.info(
            f"Loaded token from user input and saved to config file: {self.config_file}"
        )
        return self.ms_token

    def load_token(self):
        """Parse a config file and extract the token.

        Returns:
            The stored token, or `None` if the file, section or option is
            missing.
        """
        # Interpolation is disabled so that a token containing "%" (e.g. a
        # URL-encoded cookie value) is read back verbatim.
        config = configparser.ConfigParser(interpolation=None)
        config.read(self.config_file)
        return config.get(section=self.section, option="MS_TOKEN", fallback=None)

    def dump_token(self, ms_token):
        """Write the token to a config file.

        Existing content of the config file is preserved; only the `MS_TOKEN`
        option of the token section is added or overwritten.

        Args:
            ms_token: The token string to store.
        """
        config = configparser.ConfigParser(interpolation=None)
        config.read(self.config_file)
        # `add_section` raises `DuplicateSectionError` if the section was
        # already present in the file that was just read.
        if not config.has_section(self.section):
            config.add_section(self.section)
        config.set(section=self.section, option="MS_TOKEN", value=ms_token)
        with open(self.config_file, "w") as f:
            config.write(f)

    def input_token(self):
        """Allow user to manually enter the token in the terminal.

        Returns:
            The entered token with surrounding whitespace removed.
        """
        print(
            "\nPlease copy and paste your `msToken` cookie taken from your web browser when visiting the TikTok website. See [THIS VIDEO] for more information.\n"
        )
        # Pasted cookie values often carry stray spaces or newlines
        ms_token = input("msToken: ").strip()
        return ms_token

View File

@@ -167,7 +167,7 @@ class TikTokDownloader:
f"No new videos to be downloaded for the hashtag: {hashtag}"
)
# Populate list of URLs to download using yt-dlp, and list of image
# Populate list of URLs to download using yt-dlp, and list of image
# galleries to download using the `download_gallery` function
urls_to_download = []
galleries_to_download = []
@@ -233,7 +233,7 @@ class TikTokDownloader:
ax.set_ylim(min(y_pos) - 1, max(y_pos) + 1)
ax.set_title(f"Co-occurring hashtags for #{hashtag} posts")
ax.xaxis.set_major_formatter(mtick.PercentFormatter(decimals=0))
# Write image of plot to file
current_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
plot_file = self.data_dir / hashtag / "plots" / f"{hashtag}__{current_time}.png"
@@ -244,7 +244,7 @@ class TikTokDownloader:
def run(self, download: bool, plot: bool, table: bool, number: int):
"""Execute the specified operations on all specified hashtags."""
# Scrape all specified hashtags and perform analyses, depending on if
# Scrape all specified hashtags and perform analyses, depending on if
# `--table` and `--plot` flags are used in the command
for hashtag in self.hashtags:
self.get_hashtag_posts(hashtag=hashtag)