mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 20:58:29 +03:00
Merge branch 'main' into linting_etc
This commit is contained in:
@@ -14,6 +14,10 @@ from auto_archiver.core.extractor import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
|
||||
class SkipYtdlp(Exception):
    """Signal that ytdlp should not be used to download files for the current URL."""
|
||||
|
||||
|
||||
class GenericExtractor(Extractor):
|
||||
_dropins = {}
|
||||
|
||||
@@ -336,7 +340,8 @@ class GenericExtractor(Extractor):
|
||||
|
||||
try:
|
||||
if dropin_submodule and dropin_submodule.skip_ytdlp_download(info_extractor, url):
|
||||
raise Exception(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
|
||||
logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
|
||||
raise SkipYtdlp()
|
||||
|
||||
# don't download since it can be a live stream
|
||||
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
|
||||
@@ -351,17 +356,21 @@ class GenericExtractor(Extractor):
|
||||
# don't clutter the logs with issues about the 'generic' extractor not having a dropin
|
||||
return False
|
||||
|
||||
logger.debug(
|
||||
f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead'
|
||||
)
|
||||
if not isinstance(e, SkipYtdlp):
|
||||
logger.debug(
|
||||
f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead'
|
||||
)
|
||||
|
||||
try:
|
||||
result = self.get_metadata_for_post(info_extractor, url, ydl)
|
||||
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
|
||||
logger.error(f"Error downloading metadata for post: {post_e}")
|
||||
logger.error("Error downloading metadata for post: {error}", error=str(post_e))
|
||||
return False
|
||||
except Exception as generic_e:
|
||||
logger.debug(
|
||||
f'Attempt to extract using ytdlp extractor "{info_extractor.IE_NAME}" failed: \n {repr(generic_e)}',
|
||||
'Attempt to extract using ytdlp extractor "{name}" failed: \n {error}',
|
||||
name=info_extractor.IE_NAME,
|
||||
error=str(generic_e),
|
||||
exc_info=True,
|
||||
)
|
||||
return False
|
||||
@@ -387,7 +396,7 @@ class GenericExtractor(Extractor):
|
||||
item.set("replaced_url", url)
|
||||
|
||||
ydl_options = {
|
||||
"outtmpl": os.path.join(self.tmp_dir, "%(id)s.%(ext)s"),
|
||||
"outtmpl": os.path.join(self.tmp_dir, f"%(id)s.%(ext)s"),
|
||||
"quiet": False,
|
||||
"noplaylist": not self.allow_playlist,
|
||||
"writesubtitles": self.subtitles,
|
||||
|
||||
72
src/auto_archiver/modules/generic_extractor/tiktok.py
Normal file
72
src/auto_archiver/modules/generic_extractor/tiktok.py
Normal file
@@ -0,0 +1,72 @@
|
||||
import requests
|
||||
from loguru import logger
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from datetime import datetime, timezone
|
||||
from .dropin import GenericDropin
|
||||
|
||||
|
||||
class Tiktok(GenericDropin):
    """
    TikTok dropin for the Generic Extractor that uses an unofficial API if/when ytdlp fails.

    It's useful for capturing content that requires a login, like sensitive content.
    """

    TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"
    # Seconds to wait for tikwm.com before giving up; without a timeout
    # `requests.get` can block forever on an unresponsive server.
    TIKWM_TIMEOUT = 30

    def extract_post(self, url: str, ie_instance):
        """
        Query the Tikwm API for `url` and return the post data it reports.

        :param url: the TikTok URL to look up.
        :param ie_instance: not used by this implementation.
        :return: the API's `data` dict with the `play`/`wmplay` entries removed
            and the chosen download link stored under `video_url`.
        :raises ValueError: on a non-200 status code, an unparsable or
            unsuccessful response, or when no video URL is present.
        """
        logger.debug(f"Using Tikwm API to attempt to download tiktok video from {url=}")

        # NOTE(review): `url` is interpolated unencoded, so a URL carrying its own
        # '&'-separated query parameters may be split by the API - confirm whether
        # percent-encoding is wanted here.
        endpoint = self.TIKWM_ENDPOINT.format(url=url)

        r = requests.get(endpoint, timeout=self.TIKWM_TIMEOUT)
        if r.status_code != 200:
            raise ValueError(f"unexpected status code '{r.status_code}' from tikwm.com for {url=}:")

        try:
            json_response = r.json()
        except ValueError as e:
            raise ValueError(f"failed to parse JSON response from tikwm.com for {url=}") from e

        # Guard against valid JSON that is not the expected object shape (e.g. a
        # list or null), which would otherwise fail with an AttributeError.
        is_success = isinstance(json_response, dict) and json_response.get("msg") == "success"
        api_data = json_response.get("data") if is_success else None
        if not api_data or not isinstance(api_data, dict):
            raise ValueError(f"failed to get a valid response from tikwm.com for {url=}: {repr(json_response)}")

        # tries to get the non-watermarked version first; both keys are always
        # removed from the data, and an empty/null `play` falls back to `wmplay`
        play_url = api_data.pop("play", None)
        watermarked_url = api_data.pop("wmplay", None)
        video_url = play_url or watermarked_url
        if not video_url:
            raise ValueError(f"no valid video URL found in response from tikwm.com for {url=}")

        api_data["video_url"] = video_url
        return api_data

    def create_metadata(self, post: dict, ie_instance, archiver, url):
        """
        Build a Metadata result from the post data returned by `extract_post`.

        Downloads the cover (best effort) and the video (required) through
        `archiver.download_from_url`, then records title, timestamp, author and
        whatever remains of `post` under `api_data`. `post` is consumed in place.

        :param post: post data dict; must contain `video_url`.
        :param ie_instance: not used by this implementation.
        :param archiver: object providing `download_from_url`.
        :param url: not used by this implementation.
        :return: the populated Metadata, or False if the video download failed.
        """
        # prepare result, start by downloading video
        result = Metadata()
        video_url = post.pop("video_url")

        # get the cover if possible: all cover keys are removed from `post`, and
        # the first non-empty one (in order of preference) is used
        cover_candidates = [post.pop(key, None) for key in ("origin_cover", "cover", "ai_dynamic_cover")]
        cover_url = next((candidate for candidate in cover_candidates if candidate), None)
        if cover_url and (cover_downloaded := archiver.download_from_url(cover_url)):
            result.add_media(Media(cover_downloaded))

        # get the video or fail
        video_downloaded = archiver.download_from_url(video_url, f"vid_{post.get('id', '')}")
        if not video_downloaded:
            logger.error(f"failed to download video from {video_url}")
            return False
        video_media = Media(video_downloaded)
        if duration := post.pop("duration", None):
            video_media.set("duration", duration)
        result.add_media(video_media)

        # add remaining metadata
        result.set_title(post.pop("title", ""))

        if created_at := post.pop("create_time", None):
            result.set_timestamp(datetime.fromtimestamp(created_at, tz=timezone.utc))

        if author := post.pop("author", None):
            result.set("author", author)

        result.set("api_data", post)

        return result
|
||||
Reference in New Issue
Block a user