mirror of
https://github.com/bellingcat/tiktok-hashtag-analysis.git
synced 2026-06-08 03:18:31 +03:00
made scraping more robust against transient playwright exceptions, set order of hashtags to scrape based on file modified time
This commit is contained in:
@@ -147,4 +147,4 @@ and then run the tests using the following command:
|
||||
pytest
|
||||
```
|
||||
|
||||
This repo uses [black](https://github.com/psf/black) to format source code, please run the `black` command before submitting a PR.
|
||||
This repo uses [black](https://github.com/psf/black) to format source code, please run the `black` command before submitting a PR.
|
||||
2
setup.py
2
setup.py
@@ -21,7 +21,7 @@ setup(
|
||||
long_description_content_type="text/markdown",
|
||||
url="https://github.com/bellingcat/tiktok-hashtag-analysis",
|
||||
license="MIT License",
|
||||
install_requires=["seaborn", "matplotlib", "TikTokApi", "requests", "yt_dlp"],
|
||||
install_requires=["seaborn", "matplotlib", "TikTokApi", "requests", "yt_dlp", "tenacity"],
|
||||
extras_require={"test": ["pytest", "pytest-cov", "pytest-html", "pytest-metadata"]},
|
||||
classifiers=[
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
|
||||
@@ -14,7 +14,8 @@ import requests
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.ticker as mtick
|
||||
import seaborn as sns
|
||||
|
||||
from tenacity import retry, retry_if_exception_type, stop_after_attempt
|
||||
from playwright._impl._api_types import Error
|
||||
from TikTokApi import TikTokApi
|
||||
|
||||
from .auth import Authorization
|
||||
@@ -40,6 +41,8 @@ def load_hashtags_from_file(file: str) -> List[str]:
|
||||
return process_hashtag_list(hashtags=hashtags)
|
||||
|
||||
|
||||
# Retry upon encountering transient playwright errors
|
||||
@retry(retry=retry_if_exception_type(Error), stop=stop_after_attempt(3))
|
||||
async def _fetch_hashtag_data(hashtag: str, ms_token: str) -> List[Dict]:
|
||||
"""Fetch data for videos containing a specified hashtag, asynchronously."""
|
||||
data = []
|
||||
@@ -105,14 +108,28 @@ class TikTokDownloader:
|
||||
self, hashtags: List[str], data_dir: Path, config_file: Optional[str] = None
|
||||
):
|
||||
self.hashtags = process_hashtag_list(hashtags)
|
||||
logging.info(f"Hashtags to scrape: {hashtags}")
|
||||
|
||||
self.data_dir = Path(data_dir)
|
||||
os.makedirs(self.data_dir, exist_ok=True)
|
||||
|
||||
self.prioritize_hashtags()
|
||||
logging.info(f"Hashtags to scrape: {self.hashtags}")
|
||||
logging.info(f"Writing data to directory: {self.data_dir}")
|
||||
|
||||
self.auth = Authorization(config_file=config_file)
|
||||
self.ms_token = self.auth.get_token()
|
||||
|
||||
def prioritize_hashtags(self):
|
||||
"""Order hashtags based on whether they've been scraped before, and
|
||||
the time they were most recently scraped"""
|
||||
|
||||
previously_scraped_hashtags = set(os.listdir(self.data_dir))
|
||||
last_edited = {
|
||||
hashtag: (self.data_dir / hashtag / "posts.json").lstat().st_mtime
|
||||
for hashtag in previously_scraped_hashtags
|
||||
}
|
||||
self.hashtags.sort(key=lambda h: last_edited.get(h, 0))
|
||||
|
||||
def get_hashtag_posts(self, hashtag: str):
|
||||
"""Fetch data about posts that used a specified hashtag and merge with
|
||||
existing data, if it exists."""
|
||||
@@ -148,8 +165,7 @@ class TikTokDownloader:
|
||||
json_dump(file_path=hashtag_file, data=all_fetched_data)
|
||||
logging.info(
|
||||
f"Scraped {len(new_fetched_data)} new posts containing the hashtag "
|
||||
f"'{hashtag}' to output directory {self.data_dir}, with "
|
||||
f"{len(already_fetched_data)} posts previously scraped"
|
||||
f"'{hashtag}', with {len(already_fetched_data)} posts previously scraped"
|
||||
)
|
||||
|
||||
def get_hashtag_videos(self, hashtag: str):
|
||||
@@ -229,10 +245,11 @@ class TikTokDownloader:
|
||||
|
||||
# Define labels and other fields used in plot
|
||||
total_posts = max(frequencies.values())
|
||||
frequencies.pop(hashtag)
|
||||
sorted_frequencices = frequencies.most_common(number)
|
||||
labels = [label for label, _ in sorted_frequencices[1:]]
|
||||
ratios = [freq / total_posts * 100 for _, freq in sorted_frequencices[1:]]
|
||||
y_pos = list(reversed(range(len(sorted_frequencices) - 1)))
|
||||
labels = [label for label, _ in sorted_frequencices]
|
||||
ratios = [freq / total_posts * 100 for _, freq in sorted_frequencices]
|
||||
y_pos = list(reversed(range(len(sorted_frequencices))))
|
||||
|
||||
# Visualize data in bar chart
|
||||
fig, ax = plt.subplots(figsize=(5, 6.66))
|
||||
|
||||
Reference in New Issue
Block a user