diff --git a/.gitignore b/.gitignore index eca42b1..525e540 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ # Data directory data/ +build/ +*.egg-info/ # Miscellaneous files **/.DS_Store diff --git a/README.md b/README.md index b0e3f25..75e5e26 100644 --- a/README.md +++ b/README.md @@ -59,40 +59,38 @@ The `data` folder contains all the downloaded data as shown in the tree diagram ## How to use ### Post downloading -Running the `tiktok-hashtag-analysis` command with the following options will scrape posts containing the hashtags `#london`, `#paris`, or `#newyork`: +Running the `tiktok-hashtag-analysis` command with the following options will scrape posts that contain the hashtags `#london`, `#paris`, or `#newyork`: tiktok-hashtag-analysis london paris newyork and will produce an output similar to the following log: - $ tiktok-hashtag-analysis download -t london paris newyork -p + $ tiktok-hashtag-analysis london paris newyork Hashtags to scrape: ['london', 'paris', 'newyork'] Scraped 963 posts containing the hashtag 'london' Scraped 961 posts containing the hashtag 'paris' Scraped 940 posts containing the hashtag 'newyork' Successfully scraped 2864 total entries -- The `-t` flag allows a space-separated list of hashtags to be specified as a command line argument -- The `-p` flag specifies that posts, not videos, will be downloaded +- The list of hashtags to scrape is specified as a positional argument ### Video downloading -Running the `tiktok-hashtag-analysis download` script with the following options will scrape trending videos containing the hashtag `#london`: -`tiktok-hashtag-analysis download -t london -v` +Running the `tiktok-hashtag-analysis` script with the following options will scrape trending videos containing the hashtag `#london`: +`tiktok-hashtag-analysis london --download` -- The `-t` flag allows a space-separated list of hashtags to be specified as a command line argument -- The `-v` flag specifies that videos, not posts, 
will be downloaded +- The `--download` flag specifies that video files for scraped posts should be downloaded -Note that video downloading is a time and data rate consuming task, as a result we recommend using one hashtag at a time when using the `-v` flag to avoid complications. +Note that video downloading is time-consuming and bandwidth-intensive; as a result, we recommend using one hashtag at a time when using the `--download` flag to avoid complications. ## Analyzing results -### Top n hashtag occurrences -The script `tiktok-hashtag-analysis frequencies` analyzes the frequencies of top occurring hashtags in a given set of posts. +### Most common co-occurring hashtags +In addition to scraping data and downloading videos, the `tiktok-hashtag-analysis` script can also analyze the frequencies of the most common co-occurring hashtags in a given set of posts. -Assume we want to analyze the 20 most frequently occurring hashtags in the downloaded posts of the `#london` hashtag. +Assume we want to analyze the 20 most frequently co-occurring hashtags in the downloaded posts of the `#london` hashtag. - The results can be plotted and saved as a PNG file by executing the following command: - `tiktok-hashtag-analysis frequencies --hashtag london --number 20 --plot` + `tiktok-hashtag-analysis london --number 20 --plot` which will produce a figure similar to that shown below:

@@ -103,32 +101,33 @@ Assume we want to analyze the 20 most frequently occurring hashtags in the downl - The results can be displayed in tabular form by executing the following command: - `tiktok-hashtag-analysis frequencies --hashtag london --number 20 --print` + `tiktok-hashtag-analysis london --number 20 --table` which will produce a terminal output similar to the following: ``` - Rank Hashtag Occurrences Frequency - 0 london 960 1.0000 - 1 fyp 494 0.5146 - 2 uk 238 0.2479 - 3 foryou 221 0.2302 - 4 foryoupage 184 0.1917 - 5 viral 179 0.1865 - 6 fypシ 84 0.0875 - 7 funny 56 0.0583 - 8 xyzbca 51 0.0531 - 9 british 45 0.0469 - 10 england 44 0.0458 - 11 trending 40 0.0417 - 12 fy 33 0.0344 - 13 comedy 32 0.0333 - 14 roadman 28 0.0292 - 15 4u 27 0.0281 - 16 usa 26 0.0271 - 17 tiktok 26 0.0271 - 18 travel 21 0.0219 - 19 america 20 0.0208 - Total posts: 960 + Co-occurring hashtags for #london posts + Rank Hashtag Occurrences Frequency + 0 london 881 1.0000 + 1 fyp 399 0.4529 + 2 uk 174 0.1975 + 3 foryou 168 0.1907 + 4 viral 152 0.1725 + 5 foryoupage 137 0.1555 + 6 fypシ 73 0.0829 + 7 funny 54 0.0613 + 8 tiktok 43 0.0488 + 9 trending 43 0.0488 + 10 british 41 0.0465 + 11 england 38 0.0431 + 12 xyzbca 34 0.0386 + 13 fy 33 0.0375 + 14 usa 33 0.0375 + 15 love 29 0.0329 + 16 comedy 25 0.0284 + 17 royalfamily 23 0.0261 + 18 queen 23 0.0261 + 19 queenelizabeth 22 0.0250 + Total posts: 881 ``` The `Frequency` column shows the ratio of the occurrence to the total number of downloaded posts. 
import configparser
import logging
import os
from pathlib import Path


class Authorization:
    """Handle authorization for TikTok, using the `msToken` cookie.

    The token is looked up in three places, in order of priority:

    1. the ``MS_TOKEN`` environment variable,
    2. the ``MS_TOKEN`` option of the ``[TikTok]`` section in ``~/.tiktok``,
    3. interactive input on the terminal (the entered value is then saved
       to ``~/.tiktok`` so later runs can reuse it).
    """

    def __init__(self):
        self.config_file = Path.home() / ".tiktok"
        self.section = "TikTok"
        self.ms_token = None

    def get_token(self):
        """Load the "msToken" cookie taken from TikTok, which the scraper requires.

        Returns:
            The token string, which is also stored on ``self.ms_token``.
        """

        # Step 1: check if MS_TOKEN is defined as environment variable
        if ms_token := os.environ.get("MS_TOKEN"):
            self.ms_token = ms_token
            logging.info("Loaded token from environment variable")
            return self.ms_token

        # Step 2: check if MS_TOKEN is defined in config file
        if self.config_file.is_file() and (ms_token := self.load_token()):
            self.ms_token = ms_token
            logging.info("Loaded token from config file: %s", self.config_file)
            return self.ms_token

        # Step 3: have user enter MS_TOKEN via terminal. This is reached
        # both when there is no config file and when the file exists but
        # holds no (or an empty) token, so the caller never silently gets
        # `None` back just because a config file happens to exist.
        ms_token = self.input_token()
        self.dump_token(ms_token=ms_token)
        self.ms_token = ms_token
        logging.info(
            "Loaded token from user input and saved to config file: %s",
            self.config_file,
        )
        return self.ms_token

    def _read_config(self):
        """Return a parser populated from the config file (empty if absent)."""

        # Interpolation is disabled because a token may legitimately
        # contain "%" (e.g. URL-encoded characters), which the default
        # interpolation would reject on both `set` and `get`.
        config = configparser.ConfigParser(interpolation=None)
        # `read` silently skips files that do not exist.
        config.read(self.config_file)
        return config

    def load_token(self):
        """Parse the config file and extract the token.

        Returns:
            The stored token, or ``None`` if the section or option is missing.
        """

        config = self._read_config()
        return config.get(section=self.section, option="MS_TOKEN", fallback=None)

    def dump_token(self, ms_token):
        """Write the token to the config file, keeping its other contents.

        Args:
            ms_token: The token string to store under ``MS_TOKEN``.
        """

        config = self._read_config()
        # The section may already exist (e.g. file present but token
        # missing); `add_section` would raise DuplicateSectionError then.
        if not config.has_section(self.section):
            config.add_section(self.section)
        config.set(section=self.section, option="MS_TOKEN", value=ms_token)

        with open(self.config_file, "w") as f:
            config.write(f)

    def input_token(self):
        """Allow user to manually enter the token in the terminal.

        Returns:
            The entered token with surrounding whitespace removed, since
            copy-pasted cookie values often carry stray spaces or newlines.
        """

        print(
            "\nPlease copy and paste your `msToken` cookie taken from your web browser when visiting the TikTok website. See [THIS VIDEO] for more information.\n"
        )

        return input("msToken: ").strip()