From 6a56c354e1e5d9f8ac59d79a0afac765c09094ee Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Wed, 6 Sep 2023 13:17:27 -0500 Subject: [PATCH 1/7] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5f891ac..05aaf4e 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ The tool helps to download posts and videos from TikTok for a given set of hasht You should now be ready to start using it. +The scraper this tool uses requires an `msToken` taken from the TikTok website on your browser. The first time you run the tool, it will ask for this token. You can see how to retrieve the token by accessing your browser's "Developer Tools", and how to input its value into the tool's command-line interface in [this video](https://github.com/bellingcat/tiktok-hashtag-analysis/assets/18430739/b9d40957-c59e-4b6d-a843-13d210f89055). ## About the tool ### Command-line arguments @@ -146,4 +147,4 @@ and then run the tests using the following command: pytest ``` -This repo uses [black](https://github.com/psf/black) to format source code, please run the `black` command before submitting a PR. \ No newline at end of file +This repo uses [black](https://github.com/psf/black) to format source code, please run the `black` command before submitting a PR. From 91a8aaef385a16e93f0db09a3128684986fe96cd Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Wed, 6 Sep 2023 19:51:16 -0500 Subject: [PATCH 2/7] added video link to msToken input, improved handling of output directories without write permission (and added relevant unit test), removed unused requirements.txt things --- requirements.txt | 5 --- setup.py | 26 --------------- tests/cli.py | 58 +++++++++++++++++++++++++++++++-- tiktok_hashtag_analysis/auth.py | 3 +- tiktok_hashtag_analysis/base.py | 9 +++-- tiktok_hashtag_analysis/cli.py | 34 +++++++++++++++++-- 6 files changed, 93 insertions(+), 42 deletions(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index e4144ef..0000000 --- a/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -seaborn==0.12.2 -matplotlib==3.7.2 -yt-dlp==2023.7.6 -TikTokApi==6.1.1 -requests==2.31.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 5760f41..52d7d35 100644 --- a/setup.py +++ b/setup.py @@ -1,30 +1,6 @@ from setuptools import setup -def read_requirements(filename: str): - with open(filename) as requirements_file: - import re - - def fix_url_dependencies(req: str) -> str: - """Pip and setuptools disagree about how URL dependencies should be handled.""" - m = re.match( - r"^(git\+)?(https|ssh)://(git@)?github\.com/([\w-]+)/(?P[\w-]+)\.git", - req, - ) - if m is None: - return req - else: - return f"{m.group('name')} @ {req}" - - requirements = [] - for line in requirements_file: - line = line.strip() - if line.startswith("#") or len(line) <= 0: - continue - requirements.append(fix_url_dependencies(line)) - return requirements - - with open("README.md", "r", encoding="utf-8") as file: long_description = file.read() @@ -45,8 +21,6 @@ setup( long_description_content_type="text/markdown", url="https://github.com/bellingcat/tiktok-hashtag-analysis", license="MIT License", - # install_requires=read_requirements("requirements.txt"), - # extras_require={"dev": read_requirements("dev-requirements.txt")}, install_requires=["seaborn", "matplotlib", "TikTokApi", "requests", "yt_dlp"], extras_require={"test": ["pytest", "pytest-cov", "pytest-html", "pytest-metadata"]}, classifiers=[ diff --git a/tests/cli.py b/tests/cli.py index dd58f5e..ea7f399 100644 --- a/tests/cli.py +++ b/tests/cli.py @@ -1,8 +1,15 @@ +import os +from pathlib import Path + import pytest -from tiktok_hashtag_analysis.cli import create_parser +from tiktok_hashtag_analysis.cli import ( + create_parser, + process_output_dir, + DEFAULT_OUTPUT_DIR, +) -ARGUMENTS = [ +PARSER_ARGUMENTS = [ ("file", "hashtags.txt", "--file"), ("download", True, "--download"), ("download", True, "-d"), @@ -17,7 +24,7 @@ ARGUMENTS = [ ] -@pytest.mark.parametrize("attribute,value,flag", ARGUMENTS) +@pytest.mark.parametrize("attribute,value,flag", PARSER_ARGUMENTS) def test_parser(hashtags, attribute, value, flag): argument_list = [*hashtags, flag] @@ -29,3 +36,48 @@ def test_parser(hashtags, attribute, value, flag): assert args.get(attribute) == value assert args.get("hashtags") == hashtags + + +def test_process_output_dir(monkeypatch, tmp_path): + + home_dir = Path.home().resolve() + + # Specified nonexistent output directory without write permissions + parser = create_parser() + specified_output_dir = home_dir.parent / "test" + with pytest.raises(SystemExit) as system_exit: + result = process_output_dir( + specified_output_dir=specified_output_dir, parser=parser + ) + assert system_exit.type == SystemExit + + # Specified existing output directory without write permissions + parser = create_parser() + specified_output_dir = home_dir.parent + with pytest.raises(SystemExit) as system_exit: + result = process_output_dir( + specified_output_dir=specified_output_dir, parser=parser + ) + assert system_exit.type == SystemExit + + # Unspecified, in current directory without write permissions + cwd = os.getcwd() + monkeypatch.chdir(specified_output_dir) + result = process_output_dir(specified_output_dir=None, parser=parser) + monkeypatch.chdir(cwd) + assert result == DEFAULT_OUTPUT_DIR + + # Specified nonexisting output directory with write permissions + parser = create_parser() + specified_output_dir = tmp_path / "test" / "tiktok" + result = process_output_dir( + specified_output_dir=specified_output_dir, parser=parser + ) + assert result == specified_output_dir + + # Unspecified, in current directory with write permissions + cwd = os.getcwd() + monkeypatch.chdir(specified_output_dir) + result = process_output_dir(specified_output_dir=None, parser=parser) + monkeypatch.chdir(cwd) + assert result == DEFAULT_OUTPUT_DIR diff --git a/tiktok_hashtag_analysis/auth.py b/tiktok_hashtag_analysis/auth.py index 545e2ce..3255ad9 100644 --- a/tiktok_hashtag_analysis/auth.py +++ b/tiktok_hashtag_analysis/auth.py @@ -15,7 +15,6 @@ class Authorization: self.config_file = Path.home() / ".tiktok" self.section = "TikTok" - self.ms_token = None def get_token(self) -> str: """Load the "msToken" cookie taken from TikTok, which the scraper requires.""" @@ -64,7 +63,7 @@ class Authorization: """Allow user to manually enter the token in the terminal.""" print( - "\nPlease copy and paste your `msToken` cookie taken from your web browser when visiting the TikTok website. See [THIS VIDEO] for more information.\n" + "\nPlease copy and paste your `msToken` cookie taken from your web browser when visiting the TikTok website. For more information, watch the video: https://tinyurl.com/tiktok-mstoken\n" ) ms_token = input("msToken: ") diff --git a/tiktok_hashtag_analysis/base.py b/tiktok_hashtag_analysis/base.py index d7a9e9e..694b82a 100644 --- a/tiktok_hashtag_analysis/base.py +++ b/tiktok_hashtag_analysis/base.py @@ -7,7 +7,7 @@ import warnings import asyncio import logging import re -from typing import List, Dict +from typing import List, Dict, Optional import yt_dlp import requests @@ -101,7 +101,9 @@ def aggregate_cooccurring_hashtags(hashtag_file: Path) -> Counter: class TikTokDownloader: """Main class for scraping data from TikTok.""" - def __init__(self, hashtags: List[str], data_dir: str, config_file: str = None): + def __init__( + self, hashtags: List[str], data_dir: Path, config_file: Optional[str] = None + ): self.hashtags = process_hashtag_list(hashtags) logging.info(f"Hashtags to scrape: {hashtags}") @@ -146,7 +148,8 @@ class TikTokDownloader: json_dump(file_path=hashtag_file, data=all_fetched_data) logging.info( f"Scraped {len(new_fetched_data)} new posts containing the hashtag " - f"'{hashtag}', with {len(already_fetched_data)} posts previously scraped" + f"'{hashtag}' to output directory {self.data_dir}, with " + f"{len(already_fetched_data)} posts previously scraped" ) def get_hashtag_videos(self, hashtag: str): diff --git a/tiktok_hashtag_analysis/cli.py b/tiktok_hashtag_analysis/cli.py index 3c3bbfd..333ed49 100644 --- a/tiktok_hashtag_analysis/cli.py +++ b/tiktok_hashtag_analysis/cli.py @@ -1,9 +1,12 @@ +import os import logging import argparse from pathlib import Path - +from typing import Optional from .base import TikTokDownloader, load_hashtags_from_file +DEFAULT_OUTPUT_DIR = Path.home() / "tiktok_hashtag_data" + def create_parser(): """Create parser tp parse input command-line arguments.""" @@ -51,7 +54,7 @@ def create_parser(): "--output-dir", type=str, help="Directory to save scraped data and visualizations to", - default=Path(".").resolve().parent / "data", + default=None, ) parser.add_argument( "--config", @@ -64,6 +67,29 @@ def create_parser(): return parser +def process_output_dir( + specified_output_dir: Optional[str], parser: argparse.ArgumentParser +) -> Path: + """Make sure the output directory can be created or has write permissions.""" + + error_message = ( + lambda _output_dir: f"You don't have write permissions for the specified output directory (`{_output_dir}`). Please specify an output directory that you have write access to." + ) + + if specified_output_dir is None: + return DEFAULT_OUTPUT_DIR + else: + _output_dir = Path(specified_output_dir).resolve() + try: + os.makedirs(_output_dir, exist_ok=True) + if not os.access(path=_output_dir, mode=os.W_OK): + parser.error(error_message(_output_dir)) + else: + return _output_dir + except PermissionError: + parser.error(error_message(_output_dir)) + + def main(): """Parse and process command-line arguments, scrape specified hashtags, and perform specified analyses.""" @@ -89,8 +115,10 @@ def main(): else: hashtags = args.hashtags + output_dir = process_output_dir(specified_output_dir=args.output_dir, parser=parser) + downloader = TikTokDownloader( - hashtags=hashtags, data_dir=args.output_dir, config_file=args.config + hashtags=hashtags, data_dir=output_dir, config_file=args.config ) downloader.run( From 1f4b956ce901796dca8667e197ea05068fc935b7 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Thu, 7 Sep 2023 11:18:22 -0500 Subject: [PATCH 3/7] made scraping more robust against transient playwright exceptions, set order of hashtags to scrape based on file modified time --- README.md | 2 +- setup.py | 2 +- tiktok_hashtag_analysis/base.py | 31 ++++++++++++++++++++++++------- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 05aaf4e..01cd57a 100644 --- a/README.md +++ b/README.md @@ -147,4 +147,4 @@ and then run the tests using the following command: pytest ``` -This repo uses [black](https://github.com/psf/black) to format source code, please run the `black` command before submitting a PR. +This repo uses [black](https://github.com/psf/black) to format source code, please run the `black` command before submitting a PR. \ No newline at end of file diff --git a/setup.py b/setup.py index 52d7d35..c53600d 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ setup( long_description_content_type="text/markdown", url="https://github.com/bellingcat/tiktok-hashtag-analysis", license="MIT License", - install_requires=["seaborn", "matplotlib", "TikTokApi", "requests", "yt_dlp"], + install_requires=["seaborn", "matplotlib", "TikTokApi", "requests", "yt_dlp", "tenacity"], extras_require={"test": ["pytest", "pytest-cov", "pytest-html", "pytest-metadata"]}, classifiers=[ "Development Status :: 5 - Production/Stable", diff --git a/tiktok_hashtag_analysis/base.py b/tiktok_hashtag_analysis/base.py index 694b82a..71884d5 100644 --- a/tiktok_hashtag_analysis/base.py +++ b/tiktok_hashtag_analysis/base.py @@ -14,7 +14,8 @@ import requests import matplotlib.pyplot as plt import matplotlib.ticker as mtick import seaborn as sns - +from tenacity import retry, retry_if_exception_type, stop_after_attempt +from playwright._impl._api_types import Error from TikTokApi import TikTokApi from .auth import Authorization @@ -40,6 +41,8 @@ def load_hashtags_from_file(file: str) -> List[str]: return process_hashtag_list(hashtags=hashtags) +# Retry upon encountering transient playwright errors +@retry(retry=retry_if_exception_type(Error), stop=stop_after_attempt(3)) async def _fetch_hashtag_data(hashtag: str, ms_token: str) -> List[Dict]: """Fetch data for videos containing a specified hashtag, asynchronously.""" data = [] @@ -105,14 +108,28 @@ class TikTokDownloader: self, hashtags: List[str], data_dir: Path, config_file: Optional[str] = None ): self.hashtags = process_hashtag_list(hashtags) - logging.info(f"Hashtags to scrape: {hashtags}") self.data_dir = Path(data_dir) os.makedirs(self.data_dir, exist_ok=True) + self.prioritize_hashtags() + logging.info(f"Hashtags to scrape: {self.hashtags}") + logging.info(f"Writing data to directory: {self.data_dir}") + self.auth = Authorization(config_file=config_file) self.ms_token = self.auth.get_token() + def prioritize_hashtags(self): + """Order hashtags basd on whether they've been scraped before, and + the time they were most recently scraped""" + + previously_scraped_hashtags = set(os.listdir(self.data_dir)) + last_edited = { + hashtag: (self.data_dir / hashtag / "posts.json").lstat().st_mtime + for hashtag in previously_scraped_hashtags + } + self.hashtags.sort(key=lambda h: last_edited.get(h, 0)) + def get_hashtag_posts(self, hashtag: str): """Fetch data about posts that used a specified hashtag and merge with existing data, if it exists.""" @@ -148,8 +165,7 @@ class TikTokDownloader: json_dump(file_path=hashtag_file, data=all_fetched_data) logging.info( f"Scraped {len(new_fetched_data)} new posts containing the hashtag " - f"'{hashtag}' to output directory {self.data_dir}, with " - f"{len(already_fetched_data)} posts previously scraped" + f"'{hashtag}', with {len(already_fetched_data)} posts previously scraped" ) def get_hashtag_videos(self, hashtag: str): @@ -229,10 +245,11 @@ class TikTokDownloader: # Define labels and other fields used in plot total_posts = max(frequencies.values()) + frequencies.pop(hashtag) sorted_frequencices = frequencies.most_common(number) - labels = [label for label, _ in sorted_frequencices[1:]] - ratios = [freq / total_posts * 100 for _, freq in sorted_frequencices[1:]] - y_pos = list(reversed(range(len(sorted_frequencices) - 1))) + labels = [label for label, _ in sorted_frequencices] + ratios = [freq / total_posts * 100 for _, freq in sorted_frequencices] + y_pos = list(reversed(range(len(sorted_frequencices)))) # Visualize data in bar chart fig, ax = plt.subplots(figsize=(5, 6.66)) From 6fa1e5026c5667b6d14ec1bc515f01d70ff3fc6e Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Sat, 9 Sep 2023 00:42:56 -0500 Subject: [PATCH 4/7] made downloading more robust against transient and permanent errors, fixed issue where media file URLs weren't being updated after scraping --- README.md | 11 +++- setup.py | 21 +++++- tests/base.py | 2 +- tests/cli.py | 3 + tiktok_hashtag_analysis/auth.py | 6 +- tiktok_hashtag_analysis/base.py | 111 +++++++++++++++++++++----------- tiktok_hashtag_analysis/cli.py | 20 +++++- 7 files changed, 124 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index 01cd57a..166550c 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ The scraper this tool uses requires an `msToken` taken from the TikTok website o ## About the tool ### Command-line arguments ``` -usage: tiktok-hashtag-analysis [-h] [--file FILE] [-d] [--number NUMBER] [-p] [-t] [--output-dir OUTPUT_DIR] [--config CONFIG] [--log LOG] [hashtags ...] +usage: tiktok-hashtag-analysis [-h] [--file FILE] [-d] [--number NUMBER] [-p] [-t] [--output-dir OUTPUT_DIR] [--config CONFIG] [--log LOG] [--limit LIMIT] [-v] [hashtags ...] Analyze hashtags within posts scraped from TikTok. @@ -34,6 +34,8 @@ optional arguments: Directory to save scraped data and visualizations to --config CONFIG File name of configuration file to store TikTok credentials to --log LOG File to write logs to + --limit LIMIT Maximum number of videos to download for each hashtag + -v, --verbose Increase output verbosity ``` ### Structure of output data @@ -138,7 +140,7 @@ Assume we want to analyze the 20 most frequently co-occurring hashtags in the do To run the build-in tests in the `tests/` directory, first install the test dependency packages: ``` -pip install .[test] +pip install .[dev] ``` and then run the tests using the following command: @@ -147,4 +149,7 @@ and then run the tests using the following command: pytest ``` -This repo uses [black](https://github.com/psf/black) to format source code, please run the `black` command before submitting a PR. \ No newline at end of file +This repo uses [black](https://github.com/psf/black) to format source code and [mypy](https://mypy.readthedocs.io/en/stable/) for static type checking. Before submitting a pull request, please run both tools on the source code. + +- yt-dlp warning: (unable to find video in feed) +https://www.tiktok.com/@sa_diya_34/video/7261180335763754242 diff --git a/setup.py b/setup.py index c53600d..13d1599 100644 --- a/setup.py +++ b/setup.py @@ -21,8 +21,25 @@ setup( long_description_content_type="text/markdown", url="https://github.com/bellingcat/tiktok-hashtag-analysis", license="MIT License", - install_requires=["seaborn", "matplotlib", "TikTokApi", "requests", "yt_dlp", "tenacity"], - extras_require={"test": ["pytest", "pytest-cov", "pytest-html", "pytest-metadata"]}, + install_requires=[ + "seaborn", + "matplotlib", + "TikTokApi", + "requests", + "yt_dlp", + "tenacity", + "msvc-runtime; os_name=='nt'", + ], + extras_require={ + "dev": [ + "pytest", + "pytest-cov", + "pytest-html", + "pytest-metadata", + "black", + "mypy", + ] + }, classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Information Technology", diff --git a/tests/base.py b/tests/base.py index c0d2a07..dbc139d 100644 --- a/tests/base.py +++ b/tests/base.py @@ -3,7 +3,7 @@ from tiktok_hashtag_analysis.base import TikTokDownloader, load_hashtags_from_fi def test_scrape(tmp_path, hashtags): downloader = TikTokDownloader(hashtags=hashtags[:1], data_dir=tmp_path) - downloader.run(download=True, plot=True, table=True, number=20) + downloader.run(limit=1000, download=True, plot=True, table=True, number=20) def test_load_hashtags_from_file(tmp_path, hashtags): diff --git a/tests/cli.py b/tests/cli.py index ea7f399..58999cb 100644 --- a/tests/cli.py +++ b/tests/cli.py @@ -13,11 +13,14 @@ PARSER_ARGUMENTS = [ ("file", "hashtags.txt", "--file"), ("download", True, "--download"), ("download", True, "-d"), + ("limit", 1000, "--limit"), ("number", 20, "--number"), ("plot", True, "--plot"), ("plot", True, "-p"), ("table", True, "--table"), ("table", True, "-t"), + ("verbose", True, "--verbose"), + ("verbose", True, "-v"), ("output_dir", "/tmp/tiktok_download", "--output-dir"), ("config", "~/.tiktok", "--config"), ("log", "../logfile.log", "--log"), diff --git a/tiktok_hashtag_analysis/auth.py b/tiktok_hashtag_analysis/auth.py index 3255ad9..16252ab 100644 --- a/tiktok_hashtag_analysis/auth.py +++ b/tiktok_hashtag_analysis/auth.py @@ -22,20 +22,20 @@ class Authorization: # Step 1: check if MS_TOKEN is defined as environment variable if ms_token := os.environ.get("MS_TOKEN"): self.ms_token = ms_token - logging.info("Loaded token from environment variable") + logging.debug("Loaded token from environment variable") # Step 2: check if MS_TOKEN is defined in config file elif self.config_file.is_file(): if ms_token := self.load_token(): self.ms_token = ms_token - logging.info(f"Loaded token from config file: {self.config_file}") + logging.debug(f"Loaded token from config file: {self.config_file}") # Step 3: have user enter MS_TOKEN via terminal else: ms_token = self.input_token() self.dump_token(ms_token=ms_token) self.ms_token = ms_token - logging.info( + logging.debug( f"Loaded token from user input and saved to config file: {self.config_file}" ) diff --git a/tiktok_hashtag_analysis/base.py b/tiktok_hashtag_analysis/base.py index 71884d5..92d6d1e 100644 --- a/tiktok_hashtag_analysis/base.py +++ b/tiktok_hashtag_analysis/base.py @@ -7,14 +7,22 @@ import warnings import asyncio import logging import re +from urllib.error import HTTPError from typing import List, Dict, Optional import yt_dlp +from yt_dlp.utils import ExtractorError, DownloadError import requests import matplotlib.pyplot as plt import matplotlib.ticker as mtick import seaborn as sns -from tenacity import retry, retry_if_exception_type, stop_after_attempt +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + TryAgain, + wait_exponential, +) from playwright._impl._api_types import Error from TikTokApi import TikTokApi @@ -43,12 +51,12 @@ def load_hashtags_from_file(file: str) -> List[str]: # Retry upon encountering transient playwright errors @retry(retry=retry_if_exception_type(Error), stop=stop_after_attempt(3)) -async def _fetch_hashtag_data(hashtag: str, ms_token: str) -> List[Dict]: +async def _fetch_hashtag_data(hashtag: str, ms_token: str, limit: int) -> List[Dict]: """Fetch data for videos containing a specified hashtag, asynchronously.""" data = [] async with TikTokApi() as api: - await api.create_sessions(ms_tokens=[ms_token], num_sessions=1, sleep_after=3) - async for video in api.hashtag(name=hashtag).videos(count=1000): + await api.create_sessions(ms_tokens=[], num_sessions=1, sleep_after=3) + async for video in api.hashtag(name=hashtag).videos(count=limit): data.append(video.as_dict) return data @@ -66,22 +74,44 @@ def json_dump(file_path: Path, data: List): json.dump(obj=data, fp=f) +@retry(wait=wait_exponential(multiplier=1, max=10)) +def _get(url: str) -> requests.Response: + """Safe version of requests.get that can handle timeouts and retries""" + + r = requests.get(url=url, timeout=30) + if r.status_code not in {200, 403}: + raise TryAgain + else: + return r + + +def download_file_and_save(url: str, filepath: Path): + """Download a file from a specified URL and write its contents to a file""" + + r = _get(url=url) + if r.status_code == 403: + return + ext = r.headers["Content-Type"].split("/")[-1] + path_with_ext = filepath.with_suffix(f".{ext}") + with open(path_with_ext, "wb") as f: + f.write(r.content) + logging.debug(f"Saved file to: {path_with_ext}") + + def download_gallery(video_data: Dict, video_dir: Path): - """yt-dlp doesn't seem to support downloading images from an image gallery, - so this is a quick fix that likely will fail on edge cases.""" + """yt-dlp doesn't support downloading images from an image gallery, + so this downloads all images and audio files from image galleries.""" video_id = video_data["id"] + # A small percentage of image galleries don't have an associated audio file if play_url := video_data["music"]["playUrl"]: - r = requests.get(play_url) - with open(video_dir / f"{video_id}.mp3", "wb") as f: - f.write(r.content) + filepath = video_dir / f"{video_id}" + download_file_and_save(url=play_url, filepath=filepath) for i, image in enumerate(video_data["imagePost"]["images"]): image_url = image["imageURL"]["urlList"][0] - r = requests.get(image_url) - ext = r.headers["Content-Type"].split("/")[-1] - with open(video_dir / f"{video_id}_{i:02d}.{ext}", "wb") as f: - f.write(r.content) + filepath = video_dir / f"{video_id}_{i:02d}" + download_file_and_save(url=image_url, filepath=filepath) def aggregate_cooccurring_hashtags(hashtag_file: Path) -> Counter: @@ -120,17 +150,16 @@ class TikTokDownloader: self.ms_token = self.auth.get_token() def prioritize_hashtags(self): - """Order hashtags basd on whether they've been scraped before, and + """Order hashtags based on whether they've been scraped before, and the time they were most recently scraped""" - previously_scraped_hashtags = set(os.listdir(self.data_dir)) last_edited = { - hashtag: (self.data_dir / hashtag / "posts.json").lstat().st_mtime - for hashtag in previously_scraped_hashtags + file.parts[-2]: file.lstat().st_mtime + for file in self.data_dir.glob("*/posts.json") } self.hashtags.sort(key=lambda h: last_edited.get(h, 0)) - def get_hashtag_posts(self, hashtag: str): + def get_hashtag_posts(self, hashtag: str, limit: int): """Fetch data about posts that used a specified hashtag and merge with existing data, if it exists.""" @@ -141,31 +170,32 @@ class TikTokDownloader: # If there are previously scraped posts, load them if hashtag_file.is_file(): already_fetched_data = json_load(file_path=hashtag_file) - already_fetched_ids = set(video["id"] for video in already_fetched_data) else: - already_fetched_ids = set() already_fetched_data = [] + already_fetched_ids = set(video["id"] for video in already_fetched_data) # Scrape posts that use the specified hashtag fetched_data = asyncio.run( - _fetch_hashtag_data(hashtag=hashtag, ms_token=self.ms_token) + _fetch_hashtag_data(hashtag=hashtag, ms_token=self.ms_token, limit=limit) ) + fetched_ids = set(video["id"] for video in fetched_data) + if len(fetched_data) == 0: logging.warning(f"No posts were found for the hashtag: {hashtag}") # Determine which newly scraped posts haven't been scraped before - new_fetched_data = [ - video for video in fetched_data if video["id"] not in already_fetched_ids + old_fetched_data = [ + video for video in already_fetched_data if video["id"] not in fetched_ids ] - if len(new_fetched_data) == 0: - logging.warning(f"No new posts were found for the hashtag: {hashtag}") + new_post_count = len(fetched_ids - already_fetched_ids) + old_post_count = len(already_fetched_ids) # Merge new and old data and write to file - all_fetched_data = already_fetched_data + new_fetched_data + all_fetched_data = old_fetched_data + fetched_data json_dump(file_path=hashtag_file, data=all_fetched_data) logging.info( - f"Scraped {len(new_fetched_data)} new posts containing the hashtag " - f"'{hashtag}', with {len(already_fetched_data)} posts previously scraped" + f"Scraped {new_post_count} new posts containing the hashtag " + f"'{hashtag}', with {old_post_count} posts previously scraped" ) def get_hashtag_videos(self, hashtag: str): @@ -186,10 +216,6 @@ class TikTokDownloader: new_video_list = [ video for video in video_list if video["id"] not in already_downloaded_ids ] - if len(new_video_list) == 0: - logging.warning( - f"No new videos to be downloaded for the hashtag: {hashtag}" - ) # Populate list of URLs to download using yt-dlp, and list of image # galleries to download using the `download_gallery` function @@ -197,6 +223,8 @@ class TikTokDownloader: galleries_to_download = [] for video in new_video_list: if video.get("imagePost") is None: + if video.get("author") is None: + continue url = f"https://www.tiktok.com/@{video['author']['uniqueId']}/video/{video['id']}" urls_to_download.append(url) else: @@ -206,6 +234,7 @@ class TikTokDownloader: if len(galleries_to_download) > 0: logging.info(f"Downloading image galleries for hashtag {hashtag}") for video in galleries_to_download: + logging.debug(f"Downloading image gallery for video: {video['id']}") download_gallery(video_data=video, video_dir=video_dir) # Download video files for all video posts @@ -216,7 +245,14 @@ class TikTokDownloader: "ignore_errors": True, } with yt_dlp.YoutubeDL(ydl_opts) as ydl: - ydl.download(urls_to_download) + for url in urls_to_download: + try: + ydl.download([url]) + except (HTTPError, TypeError, ExtractorError, DownloadError) as e: + # catch urllib and yt-dlp errors when video not found + logging.warning( + f"Encountered error {e} when attempting to download url: {url}" + ) def frequency_table(self, hashtag: str, number: int): """Print `number`-most commonly co-occurring hashtags for a specified @@ -269,19 +305,16 @@ class TikTokDownloader: plt.savefig(plot_file, bbox_inches="tight", facecolor="white", dpi=300) logging.info(f"Plot saved to file: {plot_file}") - def run(self, download: bool, plot: bool, table: bool, number: int): + def run(self, limit: int, download: bool, plot: bool, table: bool, number: int): """Execute the specified operations on all specified hashtags.""" # Scrape all specified hashtags and perform analyses, depending on if - # `--table` and `--plot` flags are used in the command + # `--table`, `--plot`, and `--download` flags are used in the command for hashtag in self.hashtags: - self.get_hashtag_posts(hashtag=hashtag) + self.get_hashtag_posts(hashtag=hashtag, limit=limit) if plot: self.plot(hashtag=hashtag, number=number) if table: self.frequency_table(hashtag=hashtag, number=number) - - # Download media for all hashtags if `--download` flag is used in the command - for hashtag in self.hashtags: if download: self.get_hashtag_videos(hashtag=hashtag) diff --git a/tiktok_hashtag_analysis/cli.py b/tiktok_hashtag_analysis/cli.py index 333ed49..10818fb 100644 --- a/tiktok_hashtag_analysis/cli.py +++ b/tiktok_hashtag_analysis/cli.py @@ -63,6 +63,18 @@ def create_parser(): default=None, ) parser.add_argument("--log", type=str, help="File to write logs to", default=None) + parser.add_argument( + "--limit", + type=int, + help="Maximum number of videos to download for each hashtag", + default=1000, + ) + parser.add_argument( + "-v", + "--verbose", + help="Increase output verbosity", + action="store_true", + ) return parser @@ -97,7 +109,7 @@ def main(): args = parser.parse_args() logging.basicConfig( - level=logging.INFO, + level=logging.DEBUG if args.verbose else logging.INFO, filename=args.log, format="%(asctime)s %(levelname)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", @@ -122,7 +134,11 @@ def main(): ) downloader.run( - download=args.download, plot=args.plot, table=args.table, number=args.number + limit=args.limit, + download=args.download, + plot=args.plot, + table=args.table, + number=args.number, ) From 92861e0e5d86fe569e76496b5d8efd0b8a61f19f Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Mon, 11 Sep 2023 21:29:37 -0500 Subject: [PATCH 5/7] configured verbosity argument with logging level --- README.md | 5 +---- tests/cli.py | 1 - tiktok_hashtag_analysis/base.py | 26 +++++++++++++++----------- tiktok_hashtag_analysis/cli.py | 2 ++ 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 166550c..db0d6f3 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ The tool helps to download posts and videos from TikTok for a given set of hasht ## Pre-requisites 1. Make sure you have Python 3.9 or a later version installed 2. Install the tool with pip: `pip install tiktok-hashtag-analysis` - 1. or directly from the repo version: `pip install git+https://github.com/bellingcat/tiktok-hashtag-analysis` + - Alternatively you can install directly from the latest version on GitHub: `pip install git+https://github.com/bellingcat/tiktok-hashtag-analysis` You should now be ready to start using it. @@ -150,6 +150,3 @@ pytest ``` This repo uses [black](https://github.com/psf/black) to format source code and [mypy](https://mypy.readthedocs.io/en/stable/) for static type checking. Before submitting a pull request, please run both tools on the source code. - -- yt-dlp warning: (unable to find video in feed) -https://www.tiktok.com/@sa_diya_34/video/7261180335763754242 diff --git a/tests/cli.py b/tests/cli.py index 58999cb..cf40f5a 100644 --- a/tests/cli.py +++ b/tests/cli.py @@ -42,7 +42,6 @@ def test_parser(hashtags, attribute, value, flag): def test_process_output_dir(monkeypatch, tmp_path): - home_dir = Path.home().resolve() # Specified nonexistent output directory without write permissions diff --git a/tiktok_hashtag_analysis/base.py b/tiktok_hashtag_analysis/base.py index 92d6d1e..71b01d4 100644 --- a/tiktok_hashtag_analysis/base.py +++ b/tiktok_hashtag_analysis/base.py @@ -31,6 +31,8 @@ from .auth import Authorization warnings.filterwarnings("ignore", message="Glyph (.*) missing from current font") sns.set_theme(style="darkgrid") +logger = logging.getLogger(__name__) + def process_hashtag_list(hashtags: List[str]) -> List[str]: """Convert a list of hashtags to a standard form (remove whitespace, make @@ -95,7 +97,7 @@ def download_file_and_save(url: str, filepath: Path): path_with_ext = filepath.with_suffix(f".{ext}") with open(path_with_ext, "wb") as f: f.write(r.content) - logging.debug(f"Saved file to: {path_with_ext}") + logger.debug(f"Saved file to: {path_with_ext}") def download_gallery(video_data: Dict, video_dir: Path): @@ -143,8 +145,8 @@ class TikTokDownloader: os.makedirs(self.data_dir, exist_ok=True) self.prioritize_hashtags() - logging.info(f"Hashtags to scrape: {self.hashtags}") - logging.info(f"Writing data to directory: {self.data_dir}") + logger.info(f"Hashtags to scrape: {self.hashtags}") + logger.info(f"Writing data to directory: {self.data_dir}") self.auth = Authorization(config_file=config_file) self.ms_token = self.auth.get_token() @@ -181,7 +183,7 @@ class TikTokDownloader: fetched_ids = set(video["id"] for video in fetched_data) if len(fetched_data) == 0: - logging.warning(f"No posts were found for the hashtag: {hashtag}") + logger.warning(f"No posts were found for the hashtag: {hashtag}") # Determine which newly scraped posts haven't been scraped before old_fetched_data = [ @@ -193,7 +195,7 @@ class TikTokDownloader: # Merge new and old data and write to file all_fetched_data = old_fetched_data + fetched_data json_dump(file_path=hashtag_file, data=all_fetched_data) - logging.info( + logger.info( f"Scraped {new_post_count} new posts containing the hashtag " f"'{hashtag}', with {old_post_count} posts previously scraped" ) @@ -232,25 +234,27 @@ class TikTokDownloader: # Download audio and image files for all image gallery posts if len(galleries_to_download) > 0: - logging.info(f"Downloading image galleries for hashtag {hashtag}") + logger.info(f"Downloading image galleries for hashtag {hashtag}") for video in galleries_to_download: - logging.debug(f"Downloading image gallery for video: {video['id']}") + logger.debug(f"Downloading image gallery for video: {video['id']}") download_gallery(video_data=video, video_dir=video_dir) # Download video files for all video posts if len(urls_to_download) > 0: - logging.info(f"Downloading media for hashtag {hashtag}") + logger.info(f"Downloading media for hashtag {hashtag}") + ydl_opts = { "outtmpl": os.path.join(video_dir, "%(id)s.%(ext)s"), "ignore_errors": True, + "quiet": logger.getEffectiveLevel() > logging.DEBUG, } with yt_dlp.YoutubeDL(ydl_opts) as ydl: for url in urls_to_download: try: ydl.download([url]) except (HTTPError, TypeError, ExtractorError, DownloadError) as e: - # catch urllib and yt-dlp errors when video not found - logging.warning( + # Catch urllib and yt-dlp errors when video not found + logger.warning( f"Encountered error {e} when attempting to download url: {url}" ) @@ -303,7 +307,7 @@ class TikTokDownloader: plot_file = self.data_dir / hashtag / "plots" / f"{hashtag}__{current_time}.png" plot_file.parent.mkdir(exist_ok=True, parents=True) plt.savefig(plot_file, bbox_inches="tight", facecolor="white", dpi=300) - logging.info(f"Plot saved to file: {plot_file}") + logger.info(f"Plot saved to file: {plot_file}") def run(self, limit: int, download: bool, plot: bool, table: bool, number: int): """Execute the specified operations on all specified hashtags.""" diff --git a/tiktok_hashtag_analysis/cli.py b/tiktok_hashtag_analysis/cli.py index 10818fb..37141d5 100644 --- a/tiktok_hashtag_analysis/cli.py +++ b/tiktok_hashtag_analysis/cli.py @@ -7,6 +7,8 @@ from .base import TikTokDownloader, load_hashtags_from_file DEFAULT_OUTPUT_DIR = Path.home() / "tiktok_hashtag_data" +logger = logging.getLogger(__name__) + def create_parser(): """Create parser tp parse input command-line arguments.""" From b916512bde5f68072703eeb1813c667c261ec6a7 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Mon, 11 Sep 2023 21:43:33 -0500 Subject: [PATCH 6/7] removed auth module and authorization, since msToken isnt actually required to run scraper --- README.md | 2 - tests/auth.py | 24 ----------- tiktok_hashtag_analysis/auth.py | 71 --------------------------------- tiktok_hashtag_analysis/base.py | 10 +---- 4 files changed, 2 insertions(+), 105 deletions(-) delete mode 100644 tests/auth.py delete mode 100644 tiktok_hashtag_analysis/auth.py diff --git a/README.md b/README.md index db0d6f3..fa210ec 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,6 @@ The tool helps to download posts and videos from TikTok for a given set of hasht You should now be ready to start using it. -The scraper this tool uses requires an `msToken` taken from the TikTok website on your browser. The first time you run the tool, it will ask for this token. You can see how to retrieve the token by accessing your browser's "Developer Tools", and how to input its value into the tool's command-line interface in [this video](https://github.com/bellingcat/tiktok-hashtag-analysis/assets/18430739/b9d40957-c59e-4b6d-a843-13d210f89055). - ## About the tool ### Command-line arguments ``` diff --git a/tests/auth.py b/tests/auth.py deleted file mode 100644 index 6d0c078..0000000 --- a/tests/auth.py +++ /dev/null @@ -1,24 +0,0 @@ -import pytest - -from tiktok_hashtag_analysis.auth import Authorization - -MS_TOKEN = "thisisafakemstokenfortiktok" - - -def test_auth_input(tmp_path, monkeypatch): - config_file = tmp_path / ".tiktok" - monkeypatch.setattr("builtins.input", lambda _: MS_TOKEN) - auth = Authorization(config_file=config_file) - auth.get_token() - - assert auth.ms_token == MS_TOKEN - - -def test_auth(tmp_path): - config_file = tmp_path / ".tiktok" - auth = Authorization(config_file=config_file) - - auth.dump_token(ms_token=MS_TOKEN) - auth.get_token() - - assert auth.ms_token == MS_TOKEN diff --git a/tiktok_hashtag_analysis/auth.py b/tiktok_hashtag_analysis/auth.py deleted file mode 100644 index 16252ab..0000000 --- a/tiktok_hashtag_analysis/auth.py +++ /dev/null @@ -1,71 +0,0 @@ -import os -import configparser -from pathlib import Path -import logging -from typing import Optional - - -class Authorization: - """Handle authorization for TikTok, using the `msToken`.""" - - def __init__(self, config_file: Optional[str] = None): - if config_file: - self.config_file = Path(config_file) - else: - self.config_file = Path.home() / ".tiktok" - - self.section = "TikTok" - - def get_token(self) -> str: - """Load the "msToken" cookie taken from TikTok, which the scraper requires.""" - - # Step 1: check if MS_TOKEN is defined as environment variable - if ms_token := os.environ.get("MS_TOKEN"): - self.ms_token = ms_token - logging.debug("Loaded token from environment variable") - - # Step 2: check if MS_TOKEN is defined in config file - elif self.config_file.is_file(): - if ms_token := self.load_token(): - self.ms_token = ms_token - logging.debug(f"Loaded token from config file: {self.config_file}") - - # Step 3: have user enter MS_TOKEN via terminal - else: - ms_token = self.input_token() - self.dump_token(ms_token=ms_token) - self.ms_token = ms_token - logging.debug( - f"Loaded token from user input and saved to config file: {self.config_file}" - ) - - return self.ms_token - - def load_token(self) -> Optional[str]: - """Parse a config file and extract the token.""" - - config = configparser.ConfigParser() - config.read(self.config_file) - return config.get(section=self.section, option="MS_TOKEN", fallback=None) - - def dump_token(self, ms_token: str): - """Write the token to a config file.""" - - config = configparser.ConfigParser() - config.read(self.config_file) - config.add_section(self.section) - config.set(section=self.section, option="MS_TOKEN", value=ms_token) - - with open(self.config_file, "w", encoding="utf-8") as f: - config.write(f) - - def input_token(self) -> str: - """Allow user to manually enter the token in the terminal.""" - - print( - "\nPlease copy and paste your `msToken` cookie taken from your web browser when visiting the TikTok website. For more information, watch the video: https://tinyurl.com/tiktok-mstoken\n" - ) - - ms_token = input("msToken: ") - - return ms_token diff --git a/tiktok_hashtag_analysis/base.py b/tiktok_hashtag_analysis/base.py index 71b01d4..71d7e63 100644 --- a/tiktok_hashtag_analysis/base.py +++ b/tiktok_hashtag_analysis/base.py @@ -26,7 +26,6 @@ from tenacity import ( from playwright._impl._api_types import Error from TikTokApi import TikTokApi -from .auth import Authorization warnings.filterwarnings("ignore", message="Glyph (.*) missing from current font") sns.set_theme(style="darkgrid") @@ -53,7 +52,7 @@ def load_hashtags_from_file(file: str) -> List[str]: # Retry upon encountering transient playwright errors @retry(retry=retry_if_exception_type(Error), stop=stop_after_attempt(3)) -async def _fetch_hashtag_data(hashtag: str, ms_token: str, limit: int) -> List[Dict]: +async def _fetch_hashtag_data(hashtag: str, limit: int) -> List[Dict]: """Fetch data for videos containing a specified hashtag, asynchronously.""" data = [] async with TikTokApi() as api: @@ -148,9 +147,6 @@ class TikTokDownloader: logger.info(f"Hashtags to scrape: {self.hashtags}") logger.info(f"Writing data to directory: {self.data_dir}") - self.auth = Authorization(config_file=config_file) - self.ms_token = self.auth.get_token() - def prioritize_hashtags(self): """Order hashtags based on whether they've been scraped before, and the time they were most recently scraped""" @@ -177,9 +173,7 @@ class TikTokDownloader: already_fetched_ids = set(video["id"] for video in already_fetched_data) # Scrape posts that use the specified hashtag - fetched_data = asyncio.run( - _fetch_hashtag_data(hashtag=hashtag, ms_token=self.ms_token, limit=limit) - ) + fetched_data = asyncio.run(_fetch_hashtag_data(hashtag=hashtag, limit=limit)) fetched_ids = set(video["id"] for video in fetched_data) if len(fetched_data) == 0: From 92ae29c72227b92f2fba079b8bbb3b6d815ce3f2 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Tue, 12 Sep 2023 11:26:07 -0500 Subject: [PATCH 7/7] updated version --- tiktok_hashtag_analysis/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tiktok_hashtag_analysis/version.py b/tiktok_hashtag_analysis/version.py index aba80f2..99da557 100644 --- a/tiktok_hashtag_analysis/version.py +++ b/tiktok_hashtag_analysis/version.py @@ -2,7 +2,7 @@ _MAJOR = "2" _MINOR = "0" # On main and in a nightly release the patch should be one ahead of the last # released build. -_PATCH = "0" +_PATCH = "1" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = ""