Added a video link to the msToken input prompt, improved handling of output directories without write permission (and added a relevant unit test), and removed unused entries from requirements.txt

This commit is contained in:
Tristan Lee
2023-09-06 19:51:16 -05:00
parent 6a56c354e1
commit 91a8aaef38
6 changed files with 93 additions and 42 deletions

View File

@@ -15,7 +15,6 @@ class Authorization:
self.config_file = Path.home() / ".tiktok"
self.section = "TikTok"
self.ms_token = None
def get_token(self) -> str:
"""Load the "msToken" cookie taken from TikTok, which the scraper requires."""
@@ -64,7 +63,7 @@ class Authorization:
"""Allow user to manually enter the token in the terminal."""
print(
"\nPlease copy and paste your `msToken` cookie taken from your web browser when visiting the TikTok website. See [THIS VIDEO] for more information.\n"
"\nPlease copy and paste your `msToken` cookie taken from your web browser when visiting the TikTok website. For more information, watch the video: https://tinyurl.com/tiktok-mstoken\n"
)
ms_token = input("msToken: ")

View File

@@ -7,7 +7,7 @@ import warnings
import asyncio
import logging
import re
from typing import List, Dict
from typing import List, Dict, Optional
import yt_dlp
import requests
@@ -101,7 +101,9 @@ def aggregate_cooccurring_hashtags(hashtag_file: Path) -> Counter:
class TikTokDownloader:
"""Main class for scraping data from TikTok."""
def __init__(self, hashtags: List[str], data_dir: str, config_file: str = None):
def __init__(
self, hashtags: List[str], data_dir: Path, config_file: Optional[str] = None
):
self.hashtags = process_hashtag_list(hashtags)
logging.info(f"Hashtags to scrape: {hashtags}")
@@ -146,7 +148,8 @@ class TikTokDownloader:
json_dump(file_path=hashtag_file, data=all_fetched_data)
logging.info(
f"Scraped {len(new_fetched_data)} new posts containing the hashtag "
f"'{hashtag}', with {len(already_fetched_data)} posts previously scraped"
f"'{hashtag}' to output directory {self.data_dir}, with "
f"{len(already_fetched_data)} posts previously scraped"
)
def get_hashtag_videos(self, hashtag: str):

View File

@@ -1,9 +1,12 @@
import os
import logging
import argparse
from pathlib import Path
from typing import Optional
from .base import TikTokDownloader, load_hashtags_from_file
DEFAULT_OUTPUT_DIR = Path.home() / "tiktok_hashtag_data"
def create_parser():
"""Create parser to parse input command-line arguments."""
@@ -51,7 +54,7 @@ def create_parser():
"--output-dir",
type=str,
help="Directory to save scraped data and visualizations to",
default=Path(".").resolve().parent / "data",
default=None,
)
parser.add_argument(
"--config",
@@ -64,6 +67,29 @@ def create_parser():
return parser
def process_output_dir(
    specified_output_dir: Optional[str], parser: argparse.ArgumentParser
) -> Path:
    """Resolve the output directory and make sure it exists and is writable.

    Args:
        specified_output_dir: Directory given on the command line, or ``None``
            when the user did not pass one.
        parser: Parser used to report problems; ``parser.error`` prints the
            message and exits the program.

    Returns:
        ``DEFAULT_OUTPUT_DIR`` (returned as-is, without creating or checking
        it) when no directory was specified; otherwise the resolved, existing
        and writable directory.
    """
    if specified_output_dir is None:
        return DEFAULT_OUTPUT_DIR

    output_dir = Path(specified_output_dir).resolve()
    permission_message = f"You don't have write permissions for the specified output directory (`{output_dir}`). Please specify an output directory that you have write access to."

    # Keep the try body down to the one call that can raise.
    try:
        os.makedirs(output_dir, exist_ok=True)
    except PermissionError:
        parser.error(permission_message)
    except (FileExistsError, NotADirectoryError):
        # The path (or one of its parents) is an existing regular file, so it
        # can never become a directory; report it instead of crashing.
        parser.error(
            f"The specified output directory (`{output_dir}`) cannot be created because it, or one of its parent paths, is an existing file. Please specify a directory path."
        )

    # The directory may already have existed without being writable by us.
    if not os.access(output_dir, os.W_OK):
        parser.error(permission_message)

    return output_dir
def main():
"""Parse and process command-line arguments, scrape specified hashtags, and perform specified analyses."""
@@ -89,8 +115,10 @@ def main():
else:
hashtags = args.hashtags
output_dir = process_output_dir(specified_output_dir=args.output_dir, parser=parser)
downloader = TikTokDownloader(
hashtags=hashtags, data_dir=args.output_dir, config_file=args.config
hashtags=hashtags, data_dir=output_dir, config_file=args.config
)
downloader.run(