mirror of
https://github.com/bellingcat/tiktok-hashtag-analysis.git
synced 2026-06-07 19:08:32 +03:00
added headed argument to more robustly handle issues with the scraper's headless mode
This commit is contained in:
@@ -15,7 +15,7 @@ You should now be ready to start using it.
|
||||
## About the tool
|
||||
### Command-line arguments
|
||||
```
|
||||
usage: tiktok-hashtag-analysis [-h] [--file FILE] [-d] [--number NUMBER] [-p] [-t] [--output-dir OUTPUT_DIR] [--config CONFIG] [--log LOG] [--limit LIMIT] [-v] [hashtags ...]
|
||||
usage: tiktok-hashtag-analysis [-h] [--file FILE] [-d] [--number NUMBER] [-p] [-t] [--output-dir OUTPUT_DIR] [--config CONFIG] [--log LOG] [--limit LIMIT] [-v] [--headed] [hashtags ...]
|
||||
|
||||
Analyze hashtags within posts scraped from TikTok.
|
||||
|
||||
@@ -35,6 +35,7 @@ optional arguments:
|
||||
--log LOG File to write logs to
|
||||
--limit LIMIT Maximum number of videos to download for each hashtag
|
||||
-v, --verbose Increase output verbosity
|
||||
--headed Don't use headless version of TikTok scraper
|
||||
```
|
||||
|
||||
### Structure of output data
|
||||
|
||||
@@ -3,7 +3,16 @@ from tiktok_hashtag_analysis.base import TikTokDownloader, load_hashtags_from_fi
|
||||
|
||||
def test_scrape(tmp_path, hashtags):
|
||||
downloader = TikTokDownloader(hashtags=hashtags[:1], data_dir=tmp_path)
|
||||
downloader.run(limit=1000, download=True, plot=True, table=True, number=20)
|
||||
downloader.run(
|
||||
limit=10, download=True, plot=True, table=True, number=5, headed=True
|
||||
)
|
||||
|
||||
|
||||
def test_scrape_headless(tmp_path, hashtags):
|
||||
downloader = TikTokDownloader(hashtags=hashtags[:1], data_dir=tmp_path)
|
||||
downloader.run(
|
||||
limit=10, download=True, plot=True, table=True, number=5, headed=False
|
||||
)
|
||||
|
||||
|
||||
def test_load_hashtags_from_file(tmp_path, hashtags):
|
||||
|
||||
@@ -20,6 +20,7 @@ PARSER_ARGUMENTS = [
|
||||
("table", True, "--table"),
|
||||
("table", True, "-t"),
|
||||
("verbose", True, "--verbose"),
|
||||
("headed", True, "--headed"),
|
||||
("verbose", True, "-v"),
|
||||
("output_dir", "/tmp/tiktok_download", "--output-dir"),
|
||||
("config", "~/.tiktok", "--config"),
|
||||
@@ -51,6 +52,7 @@ def test_output_dir_spec_noexist_nowrite(tmp_path):
|
||||
specified_output_dir=specified_output_dir, parser=parser
|
||||
)
|
||||
assert system_exit.type == SystemExit
|
||||
os.chmod(tmp_path, 0o666)
|
||||
|
||||
|
||||
def test_output_dir_spec_exist_nowrite(tmp_path):
|
||||
@@ -63,6 +65,7 @@ def test_output_dir_spec_exist_nowrite(tmp_path):
|
||||
specified_output_dir=specified_output_dir, parser=parser
|
||||
)
|
||||
assert system_exit.type == SystemExit
|
||||
os.chmod(tmp_path, 0o666)
|
||||
|
||||
|
||||
def test_output_dir_unspec_nowrite(monkeypatch, tmp_path):
|
||||
@@ -75,6 +78,7 @@ def test_output_dir_unspec_nowrite(monkeypatch, tmp_path):
|
||||
result = process_output_dir(specified_output_dir=None, parser=parser)
|
||||
monkeypatch.chdir(cwd)
|
||||
assert result == DEFAULT_OUTPUT_DIR
|
||||
os.chmod(tmp_path, 0o666)
|
||||
|
||||
|
||||
def test_output_dir_spec_noexist_write(tmp_path):
|
||||
|
||||
@@ -52,11 +52,15 @@ def load_hashtags_from_file(file: str) -> List[str]:
|
||||
|
||||
# Retry upon encountering transient playwright errors
|
||||
@retry(retry=retry_if_exception_type(Error), stop=stop_after_attempt(3))
|
||||
async def _fetch_hashtag_data(hashtag: str, limit: int) -> List[Dict]:
|
||||
async def _fetch_hashtag_data(
|
||||
hashtag: str, limit: int, headed: bool = False
|
||||
) -> List[Dict]:
|
||||
"""Fetch data for videos containing a specified hashtag, asynchronously."""
|
||||
data = []
|
||||
async with TikTokApi() as api:
|
||||
await api.create_sessions(ms_tokens=[], num_sessions=1, sleep_after=3)
|
||||
await api.create_sessions(
|
||||
ms_tokens=[], num_sessions=1, sleep_after=3, headless=not headed
|
||||
)
|
||||
async for video in api.hashtag(name=hashtag).videos(count=limit):
|
||||
data.append(video.as_dict)
|
||||
return data
|
||||
@@ -157,7 +161,7 @@ class TikTokDownloader:
|
||||
}
|
||||
self.hashtags.sort(key=lambda h: last_edited.get(h, 0))
|
||||
|
||||
def get_hashtag_posts(self, hashtag: str, limit: int):
|
||||
def get_hashtag_posts(self, hashtag: str, limit: int, headed: bool):
|
||||
"""Fetch data about posts that used a specified hashtag and merge with
|
||||
existing data, if it exists."""
|
||||
|
||||
@@ -172,8 +176,20 @@ class TikTokDownloader:
|
||||
already_fetched_data = []
|
||||
already_fetched_ids = set(video["id"] for video in already_fetched_data)
|
||||
|
||||
# Scrape posts that use the specified hashtag
|
||||
fetched_data = asyncio.run(_fetch_hashtag_data(hashtag=hashtag, limit=limit))
|
||||
# Scrape posts that use the specified hashtag
|
||||
# Attempt to be robust against TikTok's countermeasures for headless browsing
|
||||
try:
|
||||
fetched_data = asyncio.run(
|
||||
_fetch_hashtag_data(hashtag=hashtag, limit=limit, headed=headed)
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"Encountered error {e} when fetching data, retrying in headed mode"
|
||||
)
|
||||
fetched_data = asyncio.run(
|
||||
_fetch_hashtag_data(hashtag=hashtag, limit=limit, headed=True)
|
||||
)
|
||||
|
||||
fetched_ids = set(video["id"] for video in fetched_data)
|
||||
|
||||
if len(fetched_data) == 0:
|
||||
@@ -303,13 +319,21 @@ class TikTokDownloader:
|
||||
plt.savefig(plot_file, bbox_inches="tight", facecolor="white", dpi=300)
|
||||
logger.info(f"Plot saved to file: {plot_file}")
|
||||
|
||||
def run(self, limit: int, download: bool, plot: bool, table: bool, number: int):
|
||||
def run(
|
||||
self,
|
||||
limit: int,
|
||||
download: bool,
|
||||
plot: bool,
|
||||
table: bool,
|
||||
number: int,
|
||||
headed: bool,
|
||||
):
|
||||
"""Execute the specified operations on all specified hashtags."""
|
||||
|
||||
# Scrape all specified hashtags and perform analyses, depending on if
|
||||
# `--table`, `--plot`, and `--download` flags are used in the command
|
||||
for hashtag in self.hashtags:
|
||||
self.get_hashtag_posts(hashtag=hashtag, limit=limit)
|
||||
self.get_hashtag_posts(hashtag=hashtag, limit=limit, headed=headed)
|
||||
if plot:
|
||||
self.plot(hashtag=hashtag, number=number)
|
||||
if table:
|
||||
|
||||
@@ -77,7 +77,11 @@ def create_parser():
|
||||
help="Increase output verbosity",
|
||||
action="store_true",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--headed",
|
||||
help="Don't use headless version of TikTok scraper",
|
||||
action="store_true",
|
||||
)
|
||||
return parser
|
||||
|
||||
|
||||
@@ -146,6 +150,7 @@ def main():
|
||||
plot=args.plot,
|
||||
table=args.table,
|
||||
number=args.number,
|
||||
headed=args.headed,
|
||||
)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user