From 91a8aaef385a16e93f0db09a3128684986fe96cd Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Wed, 6 Sep 2023 19:51:16 -0500 Subject: [PATCH] added video link to msToken input, improved handling of output directories without write permission (and added relevant unit test), removed unused requirements.txt things --- requirements.txt | 5 --- setup.py | 26 --------------- tests/cli.py | 58 +++++++++++++++++++++++++++++++-- tiktok_hashtag_analysis/auth.py | 3 +- tiktok_hashtag_analysis/base.py | 9 +++-- tiktok_hashtag_analysis/cli.py | 34 +++++++++++++++++-- 6 files changed, 93 insertions(+), 42 deletions(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index e4144ef..0000000 --- a/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -seaborn==0.12.2 -matplotlib==3.7.2 -yt-dlp==2023.7.6 -TikTokApi==6.1.1 -requests==2.31.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 5760f41..52d7d35 100644 --- a/setup.py +++ b/setup.py @@ -1,30 +1,6 @@ from setuptools import setup -def read_requirements(filename: str): - with open(filename) as requirements_file: - import re - - def fix_url_dependencies(req: str) -> str: - """Pip and setuptools disagree about how URL dependencies should be handled.""" - m = re.match( - r"^(git\+)?(https|ssh)://(git@)?github\.com/([\w-]+)/(?P[\w-]+)\.git", - req, - ) - if m is None: - return req - else: - return f"{m.group('name')} @ {req}" - - requirements = [] - for line in requirements_file: - line = line.strip() - if line.startswith("#") or len(line) <= 0: - continue - requirements.append(fix_url_dependencies(line)) - return requirements - - with open("README.md", "r", encoding="utf-8") as file: long_description = file.read() @@ -45,8 +21,6 @@ setup( long_description_content_type="text/markdown", url="https://github.com/bellingcat/tiktok-hashtag-analysis", license="MIT License", - # install_requires=read_requirements("requirements.txt"), - # extras_require={"dev": read_requirements("dev-requirements.txt")}, install_requires=["seaborn", "matplotlib", "TikTokApi", "requests", "yt_dlp"], extras_require={"test": ["pytest", "pytest-cov", "pytest-html", "pytest-metadata"]}, classifiers=[ diff --git a/tests/cli.py b/tests/cli.py index dd58f5e..ea7f399 100644 --- a/tests/cli.py +++ b/tests/cli.py @@ -1,8 +1,15 @@ +import os +from pathlib import Path + import pytest -from tiktok_hashtag_analysis.cli import create_parser +from tiktok_hashtag_analysis.cli import ( + create_parser, + process_output_dir, + DEFAULT_OUTPUT_DIR, +) -ARGUMENTS = [ +PARSER_ARGUMENTS = [ ("file", "hashtags.txt", "--file"), ("download", True, "--download"), ("download", True, "-d"), @@ -17,7 +24,7 @@ ARGUMENTS = [ ] -@pytest.mark.parametrize("attribute,value,flag", ARGUMENTS) +@pytest.mark.parametrize("attribute,value,flag", PARSER_ARGUMENTS) def test_parser(hashtags, attribute, value, flag): argument_list = [*hashtags, flag] @@ -29,3 +36,48 @@ def test_parser(hashtags, attribute, value, flag): assert args.get(attribute) == value assert args.get("hashtags") == hashtags + + +def test_process_output_dir(monkeypatch, tmp_path): + + home_dir = Path.home().resolve() + + # Specified nonexistent output directory without write permissions + parser = create_parser() + specified_output_dir = home_dir.parent / "test" + with pytest.raises(SystemExit) as system_exit: + result = process_output_dir( + specified_output_dir=specified_output_dir, parser=parser + ) + assert system_exit.type == SystemExit + + # Specified existing output directory without write permissions + parser = create_parser() + specified_output_dir = home_dir.parent + with pytest.raises(SystemExit) as system_exit: + result = process_output_dir( + specified_output_dir=specified_output_dir, parser=parser + ) + assert system_exit.type == SystemExit + + # Unspecified, in current directory without write permissions + cwd = os.getcwd() + monkeypatch.chdir(specified_output_dir) + result = process_output_dir(specified_output_dir=None, parser=parser) + monkeypatch.chdir(cwd) + assert result == DEFAULT_OUTPUT_DIR + + # Specified nonexisting output directory with write permissions + parser = create_parser() + specified_output_dir = tmp_path / "test" / "tiktok" + result = process_output_dir( + specified_output_dir=specified_output_dir, parser=parser + ) + assert result == specified_output_dir + + # Unspecified, in current directory with write permissions + cwd = os.getcwd() + monkeypatch.chdir(specified_output_dir) + result = process_output_dir(specified_output_dir=None, parser=parser) + monkeypatch.chdir(cwd) + assert result == DEFAULT_OUTPUT_DIR diff --git a/tiktok_hashtag_analysis/auth.py b/tiktok_hashtag_analysis/auth.py index 545e2ce..3255ad9 100644 --- a/tiktok_hashtag_analysis/auth.py +++ b/tiktok_hashtag_analysis/auth.py @@ -15,7 +15,6 @@ class Authorization: self.config_file = Path.home() / ".tiktok" self.section = "TikTok" - self.ms_token = None def get_token(self) -> str: """Load the "msToken" cookie taken from TikTok, which the scraper requires.""" @@ -64,7 +63,7 @@ class Authorization: """Allow user to manually enter the token in the terminal.""" print( - "\nPlease copy and paste your `msToken` cookie taken from your web browser when visiting the TikTok website. See [THIS VIDEO] for more information.\n" + "\nPlease copy and paste your `msToken` cookie taken from your web browser when visiting the TikTok website. For more information, watch the video: https://tinyurl.com/tiktok-mstoken\n" ) ms_token = input("msToken: ") diff --git a/tiktok_hashtag_analysis/base.py b/tiktok_hashtag_analysis/base.py index d7a9e9e..694b82a 100644 --- a/tiktok_hashtag_analysis/base.py +++ b/tiktok_hashtag_analysis/base.py @@ -7,7 +7,7 @@ import warnings import asyncio import logging import re -from typing import List, Dict +from typing import List, Dict, Optional import yt_dlp import requests @@ -101,7 +101,9 @@ def aggregate_cooccurring_hashtags(hashtag_file: Path) -> Counter: class TikTokDownloader: """Main class for scraping data from TikTok.""" - def __init__(self, hashtags: List[str], data_dir: str, config_file: str = None): + def __init__( + self, hashtags: List[str], data_dir: Path, config_file: Optional[str] = None + ): self.hashtags = process_hashtag_list(hashtags) logging.info(f"Hashtags to scrape: {hashtags}") @@ -146,7 +148,8 @@ class TikTokDownloader: json_dump(file_path=hashtag_file, data=all_fetched_data) logging.info( f"Scraped {len(new_fetched_data)} new posts containing the hashtag " - f"'{hashtag}', with {len(already_fetched_data)} posts previously scraped" + f"'{hashtag}' to output directory {self.data_dir}, with " + f"{len(already_fetched_data)} posts previously scraped" ) def get_hashtag_videos(self, hashtag: str): diff --git a/tiktok_hashtag_analysis/cli.py b/tiktok_hashtag_analysis/cli.py index 3c3bbfd..333ed49 100644 --- a/tiktok_hashtag_analysis/cli.py +++ b/tiktok_hashtag_analysis/cli.py @@ -1,9 +1,12 @@ +import os import logging import argparse from pathlib import Path - +from typing import Optional from .base import TikTokDownloader, load_hashtags_from_file +DEFAULT_OUTPUT_DIR = Path.home() / "tiktok_hashtag_data" + def create_parser(): """Create parser tp parse input command-line arguments.""" @@ -51,7 +54,7 @@ def create_parser(): "--output-dir", type=str, help="Directory to save scraped data and visualizations to", - default=Path(".").resolve().parent / "data", + default=None, ) parser.add_argument( "--config", @@ -64,6 +67,29 @@ def create_parser(): return parser +def process_output_dir( + specified_output_dir: Optional[str], parser: argparse.ArgumentParser +) -> Path: + """Make sure the output directory can be created or has write permissions.""" + + error_message = ( + lambda _output_dir: f"You don't have write permissions for the specified output directory (`{_output_dir}`). Please specify an output directory that you have write access to." + ) + + if specified_output_dir is None: + return DEFAULT_OUTPUT_DIR + else: + _output_dir = Path(specified_output_dir).resolve() + try: + os.makedirs(_output_dir, exist_ok=True) + if not os.access(path=_output_dir, mode=os.W_OK): + parser.error(error_message(_output_dir)) + else: + return _output_dir + except PermissionError: + parser.error(error_message(_output_dir)) + + def main(): """Parse and process command-line arguments, scrape specified hashtags, and perform specified analyses.""" @@ -89,8 +115,10 @@ def main(): else: hashtags = args.hashtags + output_dir = process_output_dir(specified_output_dir=args.output_dir, parser=parser) + downloader = TikTokDownloader( - hashtags=hashtags, data_dir=args.output_dir, config_file=args.config + hashtags=hashtags, data_dir=output_dir, config_file=args.config ) downloader.run(