mirror of
https://github.com/bellingcat/tiktok-hashtag-analysis.git
synced 2026-06-07 19:08:32 +03:00
Removed the auth module and authorization, since an msToken isn't actually required to run the scraper
This commit is contained in:
@@ -11,8 +11,6 @@ The tool helps to download posts and videos from TikTok for a given set of hasht
|
||||
|
||||
You should now be ready to start using it.
|
||||
|
||||
The scraper this tool uses requires an `msToken` taken from the TikTok website on your browser. The first time you run the tool, it will ask for this token. You can see how to retrieve the token by accessing your browser's "Developer Tools", and how to input its value into the tool's command-line interface in [this video](https://github.com/bellingcat/tiktok-hashtag-analysis/assets/18430739/b9d40957-c59e-4b6d-a843-13d210f89055).
|
||||
|
||||
## About the tool
|
||||
### Command-line arguments
|
||||
```
|
||||
|
||||
@@ -1,24 +0,0 @@
|
||||
import pytest
|
||||
|
||||
from tiktok_hashtag_analysis.auth import Authorization
|
||||
|
||||
MS_TOKEN = "thisisafakemstokenfortiktok"
|
||||
|
||||
|
||||
def test_auth_input(tmp_path, monkeypatch):
    """A token typed at the interactive prompt ends up on ``auth.ms_token``."""
    config_file = tmp_path / ".tiktok"

    # `get_token` consults the MS_TOKEN environment variable before anything
    # else, so an ambient value would bypass the mocked prompt and make this
    # test depend on the developer's shell. Clear it for the test's duration.
    monkeypatch.delenv("MS_TOKEN", raising=False)

    # Replace the terminal prompt with a stub that "types" the fake token.
    monkeypatch.setattr("builtins.input", lambda _: MS_TOKEN)

    auth = Authorization(config_file=config_file)
    auth.get_token()

    assert auth.ms_token == MS_TOKEN
|
||||
|
||||
|
||||
def test_auth(tmp_path, monkeypatch):
    """A token written to the config file is read back by ``get_token``."""
    config_file = tmp_path / ".tiktok"

    # `get_token` prefers the MS_TOKEN environment variable over the config
    # file; clear it so the test really exercises the file round-trip instead
    # of whatever happens to be exported in the caller's environment.
    monkeypatch.delenv("MS_TOKEN", raising=False)

    auth = Authorization(config_file=config_file)

    auth.dump_token(ms_token=MS_TOKEN)
    auth.get_token()

    assert auth.ms_token == MS_TOKEN
|
||||
@@ -1,71 +0,0 @@
|
||||
import os
|
||||
import configparser
|
||||
from pathlib import Path
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class Authorization:
    """Handle authorization for TikTok, using the `msToken`.

    The token is looked up, in order, from the ``MS_TOKEN`` environment
    variable, from the ``MS_TOKEN`` option in the ``[TikTok]`` section of a
    config file, and finally from an interactive terminal prompt (in which
    case the entered value is also saved to the config file).
    """

    def __init__(self, config_file: Optional[str] = None):
        """Record where the token is stored.

        Args:
            config_file: Path of the config file. When omitted (or empty),
                ``~/.tiktok`` is used.
        """
        if config_file:
            self.config_file = Path(config_file)
        else:
            self.config_file = Path.home() / ".tiktok"

        self.section = "TikTok"

    def get_token(self) -> str:
        """Load the "msToken" cookie taken from TikTok, which the scraper requires.

        Returns:
            The token, which is also stored on ``self.ms_token``.
        """
        # Step 1: check if MS_TOKEN is defined as environment variable
        if ms_token := os.environ.get("MS_TOKEN"):
            logging.debug("Loaded token from environment variable")

        # Step 2: check if MS_TOKEN is defined in config file. The token check
        # is part of the condition so that a config file which exists but
        # holds no token falls through to the prompt instead of leaving
        # `self.ms_token` unset.
        elif self.config_file.is_file() and (ms_token := self.load_token()):
            logging.debug(f"Loaded token from config file: {self.config_file}")

        # Step 3: have user enter MS_TOKEN via terminal
        else:
            ms_token = self.input_token()
            self.dump_token(ms_token=ms_token)
            logging.debug(
                f"Loaded token from user input and saved to config file: {self.config_file}"
            )

        self.ms_token = ms_token
        return self.ms_token

    def load_token(self) -> Optional[str]:
        """Parse a config file and extract the token.

        Returns:
            The stored token, or ``None`` when the file, the section or the
            option is missing.
        """
        config = configparser.ConfigParser()
        config.read(self.config_file)
        return config.get(section=self.section, option="MS_TOKEN", fallback=None)

    def dump_token(self, ms_token: str):
        """Write the token to a config file.

        Existing content of the config file is preserved; only the token
        option is added or overwritten.

        Args:
            ms_token: The token value to store.
        """
        config = configparser.ConfigParser()
        config.read(self.config_file)

        # The section may already exist in the file that was just read;
        # adding it unconditionally would raise DuplicateSectionError.
        if not config.has_section(self.section):
            config.add_section(self.section)
        config.set(section=self.section, option="MS_TOKEN", value=ms_token)

        with open(self.config_file, "w", encoding="utf-8") as f:
            config.write(f)

    def input_token(self) -> str:
        """Allow user to manually enter the token in the terminal.

        Returns:
            The entered token with surrounding whitespace removed.
        """
        print(
            "\nPlease copy and paste your `msToken` cookie taken from your web browser when visiting the TikTok website. For more information, watch the video: https://tinyurl.com/tiktok-mstoken\n"
        )

        # Pasted values often carry stray whitespace; configparser strips it
        # when the file is read back, so strip here too to keep the in-memory
        # token identical to the stored one.
        ms_token = input("msToken: ").strip()

        return ms_token
|
||||
@@ -26,7 +26,6 @@ from tenacity import (
|
||||
from playwright._impl._api_types import Error
|
||||
from TikTokApi import TikTokApi
|
||||
|
||||
from .auth import Authorization
|
||||
|
||||
warnings.filterwarnings("ignore", message="Glyph (.*) missing from current font")
|
||||
sns.set_theme(style="darkgrid")
|
||||
@@ -53,7 +52,7 @@ def load_hashtags_from_file(file: str) -> List[str]:
|
||||
|
||||
# Retry upon encountering transient playwright errors
|
||||
@retry(retry=retry_if_exception_type(Error), stop=stop_after_attempt(3))
|
||||
async def _fetch_hashtag_data(hashtag: str, ms_token: str, limit: int) -> List[Dict]:
|
||||
async def _fetch_hashtag_data(hashtag: str, limit: int) -> List[Dict]:
|
||||
"""Fetch data for videos containing a specified hashtag, asynchronously."""
|
||||
data = []
|
||||
async with TikTokApi() as api:
|
||||
@@ -148,9 +147,6 @@ class TikTokDownloader:
|
||||
logger.info(f"Hashtags to scrape: {self.hashtags}")
|
||||
logger.info(f"Writing data to directory: {self.data_dir}")
|
||||
|
||||
self.auth = Authorization(config_file=config_file)
|
||||
self.ms_token = self.auth.get_token()
|
||||
|
||||
def prioritize_hashtags(self):
|
||||
"""Order hashtags based on whether they've been scraped before, and
|
||||
the time they were most recently scraped"""
|
||||
@@ -177,9 +173,7 @@ class TikTokDownloader:
|
||||
already_fetched_ids = set(video["id"] for video in already_fetched_data)
|
||||
|
||||
# Scrape posts that use the specified hashtag
|
||||
fetched_data = asyncio.run(
|
||||
_fetch_hashtag_data(hashtag=hashtag, ms_token=self.ms_token, limit=limit)
|
||||
)
|
||||
fetched_data = asyncio.run(_fetch_hashtag_data(hashtag=hashtag, limit=limit))
|
||||
fetched_ids = set(video["id"] for video in fetched_data)
|
||||
|
||||
if len(fetched_data) == 0:
|
||||
|
||||
Reference in New Issue
Block a user