Made scraping more robust against transient Playwright exceptions; hashtags are now scraped in order of their data file's last-modified time

2026-06-08 03:18:31 +03:00 · 2023-09-07 11:18:22 -05:00
parent 91a8aaef38
commit 1f4b956ce9
3 changed files with 26 additions and 9 deletions
--- a/README.md
+++ b/README.md
@@ -147,4 +147,4 @@ and then run the tests using the following command:
 pytest
 ```

-This repo uses [black](https://github.com/psf/black) to format source code, please run the `black` command before submitting a PR. 
+This repo uses [black](https://github.com/psf/black) to format source code, please run the `black` command before submitting a PR. 
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,7 @@ setup(
    long_description_content_type="text/markdown",
    url="https://github.com/bellingcat/tiktok-hashtag-analysis",
    license="MIT License",
-    install_requires=["seaborn", "matplotlib", "TikTokApi", "requests", "yt_dlp"],
+    install_requires=["seaborn", "matplotlib", "TikTokApi", "requests", "yt_dlp", "tenacity"],
    extras_require={"test": ["pytest", "pytest-cov", "pytest-html", "pytest-metadata"]},
    classifiers=[
        "Development Status :: 5 - Production/Stable",
--- a/tiktok_hashtag_analysis/base.py
+++ b/tiktok_hashtag_analysis/base.py
@@ -14,7 +14,8 @@ import requests
 import matplotlib.pyplot as plt
 import matplotlib.ticker as mtick
 import seaborn as sns
-
+from tenacity import retry, retry_if_exception_type, stop_after_attempt
+from playwright._impl._api_types import Error
 from TikTokApi import TikTokApi

 from .auth import Authorization
@@ -40,6 +41,8 @@ def load_hashtags_from_file(file: str) -> List[str]:
    return process_hashtag_list(hashtags=hashtags)


+# Retry upon encountering transient playwright errors
+@retry(retry=retry_if_exception_type(Error), stop=stop_after_attempt(3))
 async def _fetch_hashtag_data(hashtag: str, ms_token: str) -> List[Dict]:
    """Fetch data for videos containing a specified hashtag, asynchronously."""
    data = []
@@ -105,14 +108,28 @@ class TikTokDownloader:
        self, hashtags: List[str], data_dir: Path, config_file: Optional[str] = None
    ):
        self.hashtags = process_hashtag_list(hashtags)
-        logging.info(f"Hashtags to scrape: {hashtags}")

        self.data_dir = Path(data_dir)
        os.makedirs(self.data_dir, exist_ok=True)

+        self.prioritize_hashtags()
+        logging.info(f"Hashtags to scrape: {self.hashtags}")
+        logging.info(f"Writing data to directory: {self.data_dir}")
+
        self.auth = Authorization(config_file=config_file)
        self.ms_token = self.auth.get_token()

+    def prioritize_hashtags(self):
+        """Order hashtags based on whether they've been scraped before, and
+        the time they were most recently scraped"""
+
+        previously_scraped_hashtags = set(os.listdir(self.data_dir))
+        last_edited = {
+            hashtag: (self.data_dir / hashtag / "posts.json").lstat().st_mtime
+            for hashtag in previously_scraped_hashtags
+        }
+        self.hashtags.sort(key=lambda h: last_edited.get(h, 0))
+
    def get_hashtag_posts(self, hashtag: str):
        """Fetch data about posts that used a specified hashtag and merge with
        existing data, if it exists."""
@@ -148,8 +165,7 @@ class TikTokDownloader:
        json_dump(file_path=hashtag_file, data=all_fetched_data)
        logging.info(
            f"Scraped {len(new_fetched_data)} new posts containing the hashtag "
-            f"'{hashtag}' to output directory {self.data_dir}, with "
-            f"{len(already_fetched_data)} posts previously scraped"
+            f"'{hashtag}', with {len(already_fetched_data)} posts previously scraped"
        )

    def get_hashtag_videos(self, hashtag: str):
@@ -229,10 +245,11 @@ class TikTokDownloader:

        # Define labels and other fields used in plot
        total_posts = max(frequencies.values())
+        frequencies.pop(hashtag)
        sorted_frequencices = frequencies.most_common(number)
-        labels = [label for label, _ in sorted_frequencices[1:]]
-        ratios = [freq / total_posts * 100 for _, freq in sorted_frequencices[1:]]
-        y_pos = list(reversed(range(len(sorted_frequencices) - 1)))
+        labels = [label for label, _ in sorted_frequencices]
+        ratios = [freq / total_posts * 100 for _, freq in sorted_frequencices]
+        y_pos = list(reversed(range(len(sorted_frequencices))))

        # Visualize data in bar chart
        fig, ax = plt.subplots(figsize=(5, 6.66))