Added `--headed` argument to more robustly handle issues with the scraper's headless mode

This commit is contained in:
Tristan Lee
2023-09-19 16:30:13 -05:00
parent 0bd87f944e
commit 89d89521fa
5 changed files with 53 additions and 10 deletions

View File

@@ -15,7 +15,7 @@ You should now be ready to start using it.
## About the tool
### Command-line arguments
```
usage: tiktok-hashtag-analysis [-h] [--file FILE] [-d] [--number NUMBER] [-p] [-t] [--output-dir OUTPUT_DIR] [--config CONFIG] [--log LOG] [--limit LIMIT] [-v] [hashtags ...]
usage: tiktok-hashtag-analysis [-h] [--file FILE] [-d] [--number NUMBER] [-p] [-t] [--output-dir OUTPUT_DIR] [--config CONFIG] [--log LOG] [--limit LIMIT] [-v] [--headed] [hashtags ...]
Analyze hashtags within posts scraped from TikTok.
@@ -35,6 +35,7 @@ optional arguments:
--log LOG File to write logs to
--limit LIMIT Maximum number of videos to download for each hashtag
-v, --verbose Increase output verbosity
--headed Don't use headless version of TikTok scraper
```
### Structure of output data

View File

@@ -3,7 +3,16 @@ from tiktok_hashtag_analysis.base import TikTokDownloader, load_hashtags_from_fi
def test_scrape(tmp_path, hashtags):
downloader = TikTokDownloader(hashtags=hashtags[:1], data_dir=tmp_path)
downloader.run(limit=1000, download=True, plot=True, table=True, number=20)
downloader.run(
limit=10, download=True, plot=True, table=True, number=5, headed=True
)
def test_scrape_headless(tmp_path, hashtags):
downloader = TikTokDownloader(hashtags=hashtags[:1], data_dir=tmp_path)
downloader.run(
limit=10, download=True, plot=True, table=True, number=5, headed=False
)
def test_load_hashtags_from_file(tmp_path, hashtags):

View File

@@ -20,6 +20,7 @@ PARSER_ARGUMENTS = [
("table", True, "--table"),
("table", True, "-t"),
("verbose", True, "--verbose"),
("headed", True, "--headed"),
("verbose", True, "-v"),
("output_dir", "/tmp/tiktok_download", "--output-dir"),
("config", "~/.tiktok", "--config"),
@@ -51,6 +52,7 @@ def test_output_dir_spec_noexist_nowrite(tmp_path):
specified_output_dir=specified_output_dir, parser=parser
)
assert system_exit.type == SystemExit
os.chmod(tmp_path, 0o666)
def test_output_dir_spec_exist_nowrite(tmp_path):
@@ -63,6 +65,7 @@ def test_output_dir_spec_exist_nowrite(tmp_path):
specified_output_dir=specified_output_dir, parser=parser
)
assert system_exit.type == SystemExit
os.chmod(tmp_path, 0o666)
def test_output_dir_unspec_nowrite(monkeypatch, tmp_path):
@@ -75,6 +78,7 @@ def test_output_dir_unspec_nowrite(monkeypatch, tmp_path):
result = process_output_dir(specified_output_dir=None, parser=parser)
monkeypatch.chdir(cwd)
assert result == DEFAULT_OUTPUT_DIR
os.chmod(tmp_path, 0o666)
def test_output_dir_spec_noexist_write(tmp_path):

View File

@@ -52,11 +52,15 @@ def load_hashtags_from_file(file: str) -> List[str]:
# Retry upon encountering transient playwright errors
@retry(retry=retry_if_exception_type(Error), stop=stop_after_attempt(3))
async def _fetch_hashtag_data(hashtag: str, limit: int) -> List[Dict]:
async def _fetch_hashtag_data(
hashtag: str, limit: int, headed: bool = False
) -> List[Dict]:
"""Fetch data for videos containing a specified hashtag, asynchronously."""
data = []
async with TikTokApi() as api:
await api.create_sessions(ms_tokens=[], num_sessions=1, sleep_after=3)
await api.create_sessions(
ms_tokens=[], num_sessions=1, sleep_after=3, headless=not headed
)
async for video in api.hashtag(name=hashtag).videos(count=limit):
data.append(video.as_dict)
return data
@@ -157,7 +161,7 @@ class TikTokDownloader:
}
self.hashtags.sort(key=lambda h: last_edited.get(h, 0))
def get_hashtag_posts(self, hashtag: str, limit: int):
def get_hashtag_posts(self, hashtag: str, limit: int, headed: bool):
"""Fetch data about posts that used a specified hashtag and merge with
existing data, if it exists."""
@@ -172,8 +176,20 @@ class TikTokDownloader:
already_fetched_data = []
already_fetched_ids = set(video["id"] for video in already_fetched_data)
# Scrape posts that use the specified hashtag
fetched_data = asyncio.run(_fetch_hashtag_data(hashtag=hashtag, limit=limit))
# Scrape posts that use the specified hashtag
# Attempt to be robust against TikTok's countermeasures for headless browsing
try:
    # First attempt honours the caller's `headed` choice.
    fetched_data = asyncio.run(
        _fetch_hashtag_data(hashtag=hashtag, limit=limit, headed=headed)
    )
except Exception as e:
    # Fall back to a visible browser once before giving up. The message must
    # be an f-string, otherwise the literal text "{e}" is logged instead of
    # the actual error.
    logger.warning(
        f"Encountered error {e} when fetching data, retrying in headed mode"
    )
    fetched_data = asyncio.run(
        _fetch_hashtag_data(hashtag=hashtag, limit=limit, headed=True)
    )
fetched_ids = set(video["id"] for video in fetched_data)
if len(fetched_data) == 0:
@@ -303,13 +319,21 @@ class TikTokDownloader:
plt.savefig(plot_file, bbox_inches="tight", facecolor="white", dpi=300)
logger.info(f"Plot saved to file: {plot_file}")
def run(self, limit: int, download: bool, plot: bool, table: bool, number: int):
def run(
self,
limit: int,
download: bool,
plot: bool,
table: bool,
number: int,
headed: bool,
):
"""Execute the specified operations on all specified hashtags."""
# Scrape all specified hashtags and perform analyses, depending on if
# `--table`, `--plot`, and `--download` flags are used in the command
for hashtag in self.hashtags:
self.get_hashtag_posts(hashtag=hashtag, limit=limit)
self.get_hashtag_posts(hashtag=hashtag, limit=limit, headed=headed)
if plot:
self.plot(hashtag=hashtag, number=number)
if table:

View File

@@ -77,7 +77,11 @@ def create_parser():
help="Increase output verbosity",
action="store_true",
)
parser.add_argument(
"--headed",
help="Don't use headless version of TikTok scraper",
action="store_true",
)
return parser
@@ -146,6 +150,7 @@ def main():
plot=args.plot,
table=args.table,
number=args.number,
headed=args.headed,
)