mirror of
https://github.com/bellingcat/tiktok-hashtag-analysis.git
synced 2026-06-11 21:08:31 +03:00
added video link to msToken input, improved handling of output directories without write permission (and added relevant unit test), removed unused requirements.txt things
This commit is contained in:
@@ -1,5 +0,0 @@
|
||||
seaborn==0.12.2
|
||||
matplotlib==3.7.2
|
||||
yt-dlp==2023.7.6
|
||||
TikTokApi==6.1.1
|
||||
requests==2.31.0
|
||||
26
setup.py
26
setup.py
@@ -1,30 +1,6 @@
|
||||
from setuptools import setup
|
||||
|
||||
|
||||
def read_requirements(filename: str):
    """Read a pip requirements file and return its entries as a list of strings.

    Blank lines and lines starting with ``#`` are skipped. Entries that are
    bare GitHub URLs are rewritten to the ``name @ url`` form.

    Args:
        filename: Path of the requirements file to read.

    Returns:
        The requirement strings, in file order.
    """
    # Imported locally (as the original did) so the function does not depend
    # on what the module imports at top level.
    import re

    github_url = re.compile(
        r"^(git\+)?(https|ssh)://(git@)?github\.com/([\w-]+)/(?P<name>[\w-]+)\.git"
    )

    def fix_url_dependencies(req: str) -> str:
        """Pip and setuptools disagree about how URL dependencies should be handled."""
        m = github_url.match(req)
        return req if m is None else f"{m.group('name')} @ {req}"

    # Requirements files are UTF-8 by convention; do not rely on the locale default.
    with open(filename, encoding="utf-8") as requirements_file:
        stripped_lines = (line.strip() for line in requirements_file)
        return [
            fix_url_dependencies(line)
            for line in stripped_lines
            if line and not line.startswith("#")
        ]
|
||||
|
||||
|
||||
with open("README.md", "r", encoding="utf-8") as file:
|
||||
long_description = file.read()
|
||||
|
||||
@@ -45,8 +21,6 @@ setup(
|
||||
long_description_content_type="text/markdown",
|
||||
url="https://github.com/bellingcat/tiktok-hashtag-analysis",
|
||||
license="MIT License",
|
||||
# install_requires=read_requirements("requirements.txt"),
|
||||
# extras_require={"dev": read_requirements("dev-requirements.txt")},
|
||||
install_requires=["seaborn", "matplotlib", "TikTokApi", "requests", "yt_dlp"],
|
||||
extras_require={"test": ["pytest", "pytest-cov", "pytest-html", "pytest-metadata"]},
|
||||
classifiers=[
|
||||
|
||||
58
tests/cli.py
58
tests/cli.py
@@ -1,8 +1,15 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from tiktok_hashtag_analysis.cli import create_parser
|
||||
from tiktok_hashtag_analysis.cli import (
|
||||
create_parser,
|
||||
process_output_dir,
|
||||
DEFAULT_OUTPUT_DIR,
|
||||
)
|
||||
|
||||
ARGUMENTS = [
|
||||
PARSER_ARGUMENTS = [
|
||||
("file", "hashtags.txt", "--file"),
|
||||
("download", True, "--download"),
|
||||
("download", True, "-d"),
|
||||
@@ -17,7 +24,7 @@ ARGUMENTS = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("attribute,value,flag", ARGUMENTS)
|
||||
@pytest.mark.parametrize("attribute,value,flag", PARSER_ARGUMENTS)
|
||||
def test_parser(hashtags, attribute, value, flag):
|
||||
argument_list = [*hashtags, flag]
|
||||
|
||||
@@ -29,3 +36,48 @@ def test_parser(hashtags, attribute, value, flag):
|
||||
|
||||
assert args.get(attribute) == value
|
||||
assert args.get("hashtags") == hashtags
|
||||
|
||||
|
||||
def test_process_output_dir(monkeypatch, tmp_path):
    """Check `process_output_dir` for writable and non-writable locations.

    NOTE(review): the "without write permissions" cases assume the parent of
    the home directory is not writable by the user running the tests (i.e.
    the suite is not run as root) -- confirm for CI.
    """

    parser = create_parser()
    unwritable_dir = Path.home().resolve().parent

    # Specified nonexistent output directory without write permissions
    with pytest.raises(SystemExit):
        process_output_dir(
            specified_output_dir=unwritable_dir / "test", parser=parser
        )

    # Specified existing output directory without write permissions
    with pytest.raises(SystemExit):
        process_output_dir(specified_output_dir=unwritable_dir, parser=parser)

    # Unspecified, in current directory without write permissions
    with monkeypatch.context() as patch:
        patch.chdir(unwritable_dir)
        result = process_output_dir(specified_output_dir=None, parser=parser)
    assert result == DEFAULT_OUTPUT_DIR

    # Specified nonexistent output directory with write permissions
    writable_dir = tmp_path / "test" / "tiktok"
    result = process_output_dir(specified_output_dir=writable_dir, parser=parser)
    assert result == writable_dir

    # Unspecified, in current directory with write permissions
    # (the directory exists now: the previous call created it)
    with monkeypatch.context() as patch:
        patch.chdir(writable_dir)
        result = process_output_dir(specified_output_dir=None, parser=parser)
    assert result == DEFAULT_OUTPUT_DIR
|
||||
|
||||
@@ -15,7 +15,6 @@ class Authorization:
|
||||
self.config_file = Path.home() / ".tiktok"
|
||||
|
||||
self.section = "TikTok"
|
||||
self.ms_token = None
|
||||
|
||||
def get_token(self) -> str:
|
||||
"""Load the "msToken" cookie taken from TikTok, which the scraper requires."""
|
||||
@@ -64,7 +63,7 @@ class Authorization:
|
||||
"""Allow user to manually enter the token in the terminal."""
|
||||
|
||||
print(
|
||||
"\nPlease copy and paste your `msToken` cookie taken from your web browser when visiting the TikTok website. See [THIS VIDEO] for more information.\n"
|
||||
"\nPlease copy and paste your `msToken` cookie taken from your web browser when visiting the TikTok website. For more information, watch the video: https://tinyurl.com/tiktok-mstoken\n"
|
||||
)
|
||||
|
||||
ms_token = input("msToken: ")
|
||||
|
||||
@@ -7,7 +7,7 @@ import warnings
|
||||
import asyncio
|
||||
import logging
|
||||
import re
|
||||
from typing import List, Dict
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
import yt_dlp
|
||||
import requests
|
||||
@@ -101,7 +101,9 @@ def aggregate_cooccurring_hashtags(hashtag_file: Path) -> Counter:
|
||||
class TikTokDownloader:
|
||||
"""Main class for scraping data from TikTok."""
|
||||
|
||||
def __init__(self, hashtags: List[str], data_dir: str, config_file: str = None):
|
||||
def __init__(
|
||||
self, hashtags: List[str], data_dir: Path, config_file: Optional[str] = None
|
||||
):
|
||||
self.hashtags = process_hashtag_list(hashtags)
|
||||
logging.info(f"Hashtags to scrape: {hashtags}")
|
||||
|
||||
@@ -146,7 +148,8 @@ class TikTokDownloader:
|
||||
json_dump(file_path=hashtag_file, data=all_fetched_data)
|
||||
logging.info(
|
||||
f"Scraped {len(new_fetched_data)} new posts containing the hashtag "
|
||||
f"'{hashtag}', with {len(already_fetched_data)} posts previously scraped"
|
||||
f"'{hashtag}' to output directory {self.data_dir}, with "
|
||||
f"{len(already_fetched_data)} posts previously scraped"
|
||||
)
|
||||
|
||||
def get_hashtag_videos(self, hashtag: str):
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
import os
|
||||
import logging
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
from typing import Optional
|
||||
from .base import TikTokDownloader, load_hashtags_from_file
|
||||
|
||||
DEFAULT_OUTPUT_DIR = Path.home() / "tiktok_hashtag_data"
|
||||
|
||||
|
||||
def create_parser():
|
||||
"""Create parser to parse input command-line arguments."""
|
||||
@@ -51,7 +54,7 @@ def create_parser():
|
||||
"--output-dir",
|
||||
type=str,
|
||||
help="Directory to save scraped data and visualizations to",
|
||||
default=Path(".").resolve().parent / "data",
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
@@ -64,6 +67,29 @@ def create_parser():
|
||||
return parser
|
||||
|
||||
|
||||
def process_output_dir(
    specified_output_dir: Optional[str], parser: argparse.ArgumentParser
) -> Path:
    """Make sure the output directory can be created or has write permissions.

    Args:
        specified_output_dir: Directory requested by the user, or ``None`` if
            no directory was given.
        parser: Parser whose ``error`` method is used to report an unusable
            directory (argparse's ``error`` prints the message and exits).

    Returns:
        ``DEFAULT_OUTPUT_DIR`` when no directory was specified; otherwise the
        resolved directory, which has been created if it did not exist.
    """

    if specified_output_dir is None:
        return DEFAULT_OUTPUT_DIR

    _output_dir = Path(specified_output_dir).resolve()
    permission_message = f"You don't have write permissions for the specified output directory (`{_output_dir}`). Please specify an output directory that you have write access to."

    try:
        os.makedirs(_output_dir, exist_ok=True)
    except PermissionError:
        parser.error(permission_message)
    except OSError as error:
        # e.g. the path (or one of its parents) is a regular file, or the
        # filesystem is read-only: report it as a usage error, not a traceback.
        parser.error(
            f"The specified output directory (`{_output_dir}`) could not be created: {error}"
        )
    else:
        # The directory may already have existed without being writable.
        if os.access(path=_output_dir, mode=os.W_OK):
            return _output_dir
        parser.error(permission_message)
|
||||
|
||||
|
||||
def main():
|
||||
"""Parse and process command-line arguments, scrape specified hashtags, and perform specified analyses."""
|
||||
|
||||
@@ -89,8 +115,10 @@ def main():
|
||||
else:
|
||||
hashtags = args.hashtags
|
||||
|
||||
output_dir = process_output_dir(specified_output_dir=args.output_dir, parser=parser)
|
||||
|
||||
downloader = TikTokDownloader(
|
||||
hashtags=hashtags, data_dir=args.output_dir, config_file=args.config
|
||||
hashtags=hashtags, data_dir=output_dir, config_file=args.config
|
||||
)
|
||||
|
||||
downloader.run(
|
||||
|
||||
Reference in New Issue
Block a user