Adds new extractor for tiktok via unofficial API (#237)

* minor update to defaults in api_db

* readme typo

* adds and tests new tikwm tiktok downloader

* addresses PR comments
This commit is contained in:
Miguel Sozinho Ramalho
2025-03-10 17:56:45 +06:00
committed by GitHub
parent ce46a8a7ac
commit 58bd38e292
6 changed files with 256 additions and 3 deletions

View File

@@ -24,9 +24,9 @@
"help": "which group of users have access to the archive in case public=false as author",
},
"use_api_cache": {
"default": True,
"default": False,
"type": "bool",
"help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived",
"help": "if True then the API database will be queried prior to any archiving operations and stop if the link has already been archived",
},
"store_results": {
"default": True,

View File

@@ -0,0 +1 @@
from .tiktok_tikwm_extractor import TiktokTikwmExtractor

View File

@@ -0,0 +1,23 @@
{
"name": "Tiktok Tikwm Extractor",
"type": ["extractor"],
"requires_setup": False,
"dependencies": {
"python": ["loguru", "requests"],
"bin": []
},
"description": """
Uses an unofficial TikTok video download platform's API to download videos: https://tikwm.com/
This extractor complements the generic_extractor which can already get TikTok videos, but this one can extract special videos like those marked as sensitive.
### Features
- Downloads the video and, if possible, also the video cover.
- Stores extra metadata about the post like author information, and more as returned by tikwm.com.
### Notes
- If tikwm.com is down, this extractor will not work.
- If tikwm.com changes their API, this extractor may break.
- If no video is found, this extractor will consider the extraction failed.
"""
}

View File

@@ -0,0 +1,75 @@
import re
import requests
from loguru import logger
from datetime import datetime, timezone
from yt_dlp.extractor.tiktok import TikTokIE
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media
class TiktokTikwmExtractor(Extractor):
"""
Extractor for TikTok that uses an unofficial API and can capture content that requires a login, like sensitive content.
"""
TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"
def download(self, item: Metadata) -> Metadata:
url = item.get_url()
if not re.match(TikTokIE._VALID_URL, url):
return False
endpoint = TiktokTikwmExtractor.TIKWM_ENDPOINT.format(url=url)
r = requests.get(endpoint)
if r.status_code != 200:
logger.error(f"unexpected status code '{r.status_code}' from tikwm.com for {url=}:")
return False
try:
json_response = r.json()
except ValueError:
logger.error(f"failed to parse JSON response from tikwm.com for {url=}")
return False
if not json_response.get('msg') == 'success' or not (api_data := json_response.get('data', {})):
logger.error(f"failed to get a valid response from tikwm.com for {url=}: {json_response}")
return False
# tries to get the non-watermarked version first
video_url = api_data.pop("play", api_data.pop("wmplay", None))
if not video_url:
logger.error(f"no valid video URL found in response from tikwm.com for {url=}")
return False
# prepare result, start by downloading video
result = Metadata()
# get the cover if possible
cover_url = api_data.pop("origin_cover", api_data.pop("cover", api_data.pop("ai_dynamic_cover", None)))
if cover_url and (cover_downloaded := self.download_from_url(cover_url)):
result.add_media(Media(cover_downloaded))
# get the video or fail
video_downloaded = self.download_from_url(video_url, f"vid_{api_data.get('id', '')}")
if not video_downloaded:
logger.error(f"failed to download video from {video_url}")
return False
video_media = Media(video_downloaded)
if duration := api_data.pop("duration", None):
video_media.set("duration", duration)
result.add_media(video_media)
# add remaining metadata
result.set_title(api_data.pop("title", ""))
if created_at := api_data.pop("create_time", None):
result.set_timestamp(datetime.fromtimestamp(created_at, tz=timezone.utc))
if (author := api_data.pop("author", None)):
result.set("author", author)
result.set("api_data", api_data)
return result.success("tikwm")