From 1f4b956ce901796dca8667e197ea05068fc935b7 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Thu, 7 Sep 2023 11:18:22 -0500 Subject: [PATCH] made scraping more robust against transient playwright exceptions, set order of hashtags to scrape based on file modified time --- README.md | 2 +- setup.py | 2 +- tiktok_hashtag_analysis/base.py | 31 ++++++++++++++++++++++++------- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 05aaf4e..01cd57a 100644 --- a/README.md +++ b/README.md @@ -147,4 +147,4 @@ and then run the tests using the following command: pytest ``` -This repo uses [black](https://github.com/psf/black) to format source code, please run the `black` command before submitting a PR. +This repo uses [black](https://github.com/psf/black) to format source code, please run the `black` command before submitting a PR. \ No newline at end of file diff --git a/setup.py b/setup.py index 52d7d35..c53600d 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ setup( long_description_content_type="text/markdown", url="https://github.com/bellingcat/tiktok-hashtag-analysis", license="MIT License", - install_requires=["seaborn", "matplotlib", "TikTokApi", "requests", "yt_dlp"], + install_requires=["seaborn", "matplotlib", "TikTokApi", "requests", "yt_dlp", "tenacity"], extras_require={"test": ["pytest", "pytest-cov", "pytest-html", "pytest-metadata"]}, classifiers=[ "Development Status :: 5 - Production/Stable", diff --git a/tiktok_hashtag_analysis/base.py b/tiktok_hashtag_analysis/base.py index 694b82a..71884d5 100644 --- a/tiktok_hashtag_analysis/base.py +++ b/tiktok_hashtag_analysis/base.py @@ -14,7 +14,8 @@ import requests import matplotlib.pyplot as plt import matplotlib.ticker as mtick import seaborn as sns - +from tenacity import retry, retry_if_exception_type, stop_after_attempt +from playwright._impl._api_types import Error from TikTokApi import TikTokApi from .auth import Authorization @@ -40,6 +41,8 @@ def 
load_hashtags_from_file(file: str) -> List[str]: return process_hashtag_list(hashtags=hashtags) +# Retry upon encountering transient playwright errors +@retry(retry=retry_if_exception_type(Error), stop=stop_after_attempt(3)) async def _fetch_hashtag_data(hashtag: str, ms_token: str) -> List[Dict]: """Fetch data for videos containing a specified hashtag, asynchronously.""" data = [] @@ -105,14 +108,28 @@ class TikTokDownloader: self, hashtags: List[str], data_dir: Path, config_file: Optional[str] = None ): self.hashtags = process_hashtag_list(hashtags) - logging.info(f"Hashtags to scrape: {hashtags}") self.data_dir = Path(data_dir) os.makedirs(self.data_dir, exist_ok=True) + self.prioritize_hashtags() + logging.info(f"Hashtags to scrape: {self.hashtags}") + logging.info(f"Writing data to directory: {self.data_dir}") + self.auth = Authorization(config_file=config_file) self.ms_token = self.auth.get_token() + def prioritize_hashtags(self): + """Order hashtags based on whether they've been scraped before, and + the time they were most recently scraped""" + + previously_scraped_hashtags = set(os.listdir(self.data_dir)) + last_edited = { + hashtag: (self.data_dir / hashtag / "posts.json").lstat().st_mtime + for hashtag in previously_scraped_hashtags + } + self.hashtags.sort(key=lambda h: last_edited.get(h, 0)) + def get_hashtag_posts(self, hashtag: str): """Fetch data about posts that used a specified hashtag and merge with existing data, if it exists.""" @@ -148,8 +165,7 @@ class TikTokDownloader: json_dump(file_path=hashtag_file, data=all_fetched_data) logging.info( f"Scraped {len(new_fetched_data)} new posts containing the hashtag " - f"'{hashtag}' to output directory {self.data_dir}, with " - f"{len(already_fetched_data)} posts previously scraped" + f"'{hashtag}', with {len(already_fetched_data)} posts previously scraped" ) def get_hashtag_videos(self, hashtag: str): @@ -229,10 +245,11 @@ class TikTokDownloader: # Define labels and other fields used in plot
total_posts = max(frequencies.values()) + frequencies.pop(hashtag) sorted_frequencices = frequencies.most_common(number) - labels = [label for label, _ in sorted_frequencices[1:]] - ratios = [freq / total_posts * 100 for _, freq in sorted_frequencices[1:]] - y_pos = list(reversed(range(len(sorted_frequencices) - 1))) + labels = [label for label, _ in sorted_frequencices] + ratios = [freq / total_posts * 100 for _, freq in sorted_frequencices] + y_pos = list(reversed(range(len(sorted_frequencices)))) # Visualize data in bar chart fig, ax = plt.subplots(figsize=(5, 6.66))