Removed the auth module and authorization, since the msToken isn't actually required to run the scraper

This commit is contained in:
Tristan Lee
2023-09-11 21:43:33 -05:00
parent 92861e0e5d
commit b916512bde
4 changed files with 2 additions and 105 deletions

View File

@@ -11,8 +11,6 @@ The tool helps to download posts and videos from TikTok for a given set of hasht
You should now be ready to start using it.
The scraper this tool uses requires an `msToken` taken from the TikTok website on your browser. The first time you run the tool, it will ask for this token. You can see how to retrieve the token by accessing your browser's "Developer Tools", and how to input its value into the tool's command-line interface in [this video](https://github.com/bellingcat/tiktok-hashtag-analysis/assets/18430739/b9d40957-c59e-4b6d-a843-13d210f89055).
## About the tool
### Command-line arguments
```

View File

@@ -1,24 +0,0 @@
import pytest
from tiktok_hashtag_analysis.auth import Authorization
MS_TOKEN = "thisisafakemstokenfortiktok"
def test_auth_input(tmp_path, monkeypatch):
    """Check that a token supplied through `input()` is stored on the instance."""
    # Replace the interactive prompt so every `input()` call yields the fake token.
    monkeypatch.setattr("builtins.input", lambda _: MS_TOKEN)
    authorization = Authorization(config_file=tmp_path / ".tiktok")
    authorization.get_token()
    assert authorization.ms_token == MS_TOKEN
def test_auth(tmp_path):
    """Check that a token stored with `dump_token` is exposed after `get_token`."""
    token_store = tmp_path / ".tiktok"
    authorization = Authorization(config_file=token_store)
    # Persist the fake token first, then resolve it through the normal entry point.
    authorization.dump_token(ms_token=MS_TOKEN)
    authorization.get_token()
    assert authorization.ms_token == MS_TOKEN

View File

@@ -1,71 +0,0 @@
import os
import configparser
from pathlib import Path
import logging
from typing import Optional
class Authorization:
    """Handle authorization for TikTok, using the `msToken`.

    The token is resolved, in order, from the ``MS_TOKEN`` environment
    variable, from the ``MS_TOKEN`` option in the ``[TikTok]`` section of a
    config file, and finally from an interactive prompt. A token entered at
    the prompt is written to the config file.
    """

    def __init__(self, config_file: Optional[str] = None):
        """Record where the token is stored.

        Args:
            config_file: Path of the config file holding the token. When
                omitted (or empty), ``.tiktok`` in the user's home directory
                is used.
        """
        if config_file:
            self.config_file = Path(config_file)
        else:
            self.config_file = Path.home() / ".tiktok"
        self.section = "TikTok"
        # Defined up front so the attribute exists before `get_token` is called.
        self.ms_token: Optional[str] = None

    def _read_config(self) -> configparser.ConfigParser:
        """Return a parser populated from the config file (empty if it is absent)."""
        # The token is an opaque string: disable interpolation so a "%" in it
        # is neither rejected on write nor expanded on read.
        config = configparser.ConfigParser(interpolation=None)
        # Read with the same encoding `dump_token` writes with.
        config.read(self.config_file, encoding="utf-8")
        return config

    def get_token(self) -> str:
        """Load the "msToken" cookie taken from TikTok, which the scraper requires.

        Returns:
            The token, which is also stored on ``self.ms_token``.
        """
        # Step 1: check if MS_TOKEN is defined as environment variable
        ms_token = os.environ.get("MS_TOKEN")
        if ms_token:
            logging.debug("Loaded token from environment variable")
        else:
            # Step 2: check if MS_TOKEN is defined in config file
            if self.config_file.is_file():
                ms_token = self.load_token()
            if ms_token:
                logging.debug(f"Loaded token from config file: {self.config_file}")
            else:
                # Step 3: have user enter MS_TOKEN via terminal. This is also
                # reached when the config file exists but holds no token, so
                # the method never returns without a token being set.
                ms_token = self.input_token()
                self.dump_token(ms_token=ms_token)
                logging.debug(
                    f"Loaded token from user input and saved to config file: {self.config_file}"
                )
        self.ms_token = ms_token
        return ms_token

    def load_token(self) -> Optional[str]:
        """Parse a config file and extract the token.

        Returns:
            The stored token, or ``None`` when the section or option is missing.
        """
        config = self._read_config()
        return config.get(section=self.section, option="MS_TOKEN", fallback=None)

    def dump_token(self, ms_token: str):
        """Write the token to a config file.

        Existing content of the config file is kept; only the ``MS_TOKEN``
        option of the token section is added or replaced.

        Args:
            ms_token: The token value to store.
        """
        config = self._read_config()
        # The section already exists when a token was saved before; adding it
        # again would raise `DuplicateSectionError`.
        if not config.has_section(self.section):
            config.add_section(self.section)
        config.set(section=self.section, option="MS_TOKEN", value=ms_token)
        with open(self.config_file, "w", encoding="utf-8") as f:
            config.write(f)

    def input_token(self) -> str:
        """Allow user to manually enter the token in the terminal.

        Returns:
            The text the user typed at the prompt.
        """
        print(
            "\nPlease copy and paste your `msToken` cookie taken from your web browser when visiting the TikTok website. For more information, watch the video: https://tinyurl.com/tiktok-mstoken\n"
        )
        ms_token = input("msToken: ")
        return ms_token

View File

@@ -26,7 +26,6 @@ from tenacity import (
from playwright._impl._api_types import Error
from TikTokApi import TikTokApi
from .auth import Authorization
warnings.filterwarnings("ignore", message="Glyph (.*) missing from current font")
sns.set_theme(style="darkgrid")
@@ -53,7 +52,7 @@ def load_hashtags_from_file(file: str) -> List[str]:
# Retry upon encountering transient playwright errors
@retry(retry=retry_if_exception_type(Error), stop=stop_after_attempt(3))
async def _fetch_hashtag_data(hashtag: str, ms_token: str, limit: int) -> List[Dict]:
async def _fetch_hashtag_data(hashtag: str, limit: int) -> List[Dict]:
"""Fetch data for videos containing a specified hashtag, asynchronously."""
data = []
async with TikTokApi() as api:
@@ -148,9 +147,6 @@ class TikTokDownloader:
logger.info(f"Hashtags to scrape: {self.hashtags}")
logger.info(f"Writing data to directory: {self.data_dir}")
self.auth = Authorization(config_file=config_file)
self.ms_token = self.auth.get_token()
def prioritize_hashtags(self):
"""Order hashtags based on whether they've been scraped before, and
the time they were most recently scraped"""
@@ -177,9 +173,7 @@ class TikTokDownloader:
already_fetched_ids = set(video["id"] for video in already_fetched_data)
# Scrape posts that use the specified hashtag
fetched_data = asyncio.run(
_fetch_hashtag_data(hashtag=hashtag, ms_token=self.ms_token, limit=limit)
)
fetched_data = asyncio.run(_fetch_hashtag_data(hashtag=hashtag, limit=limit))
fetched_ids = set(video["id"] for video in fetched_data)
if len(fetched_data) == 0: