From 8ec053ed1bddf6c09ffc9146335d5e91d25fe259 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 18 Mar 2025 08:56:22 +0000 Subject: [PATCH 1/4] Refactor the dropin 'is_suitable' method + fix tikwm implementation Makes it easier to maintain/understand. --- .../modules/generic_extractor/dropin.py | 17 +++++++++++++---- .../modules/generic_extractor/facebook.py | 2 +- .../generic_extractor/generic_extractor.py | 12 ++++-------- .../modules/generic_extractor/tiktok.py | 8 ++++++++ 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/auto_archiver/modules/generic_extractor/dropin.py b/src/auto_archiver/modules/generic_extractor/dropin.py index 8395f09..eb8cc71 100644 --- a/src/auto_archiver/modules/generic_extractor/dropin.py +++ b/src/auto_archiver/modules/generic_extractor/dropin.py @@ -59,9 +59,18 @@ class GenericDropin: """ return metadata - def is_suitable(self, url, info_extractor: InfoExtractor): + def suitable(self, url, info_extractor: InfoExtractor): """ - Used to override the InfoExtractor's 'is_suitable' method. Dropins should override this method to return True if the url is suitable for the extractor - (based on being able to parse other URLs) + A method to allow dropins to override their InfoExtractor's 'suitable' method. + Dropins should override this method and return True if the url is suitable for the extractor + (based on being able to parse other URLs). See the `suitable_extractors` method in the + `GenericExtractor` class for how this is implemented. + + The default behaviour of this method is to return the result of the InfoExtractor's 'suitable' method. + + ### Example: An example of where this is useful is for the FacebookIE extractor in yt-dlp. By default, + its 'suitable' method only returns True for video URLs. However, we can override this method in the + Facebook dropin to return True for all Facebook URLs (photo/post types). This way, the Facebook dropin + can be used for all Facebook URLs. 
""" - return False + return info_extractor.suitable(url) diff --git a/src/auto_archiver/modules/generic_extractor/facebook.py b/src/auto_archiver/modules/generic_extractor/facebook.py index e04a862..5b264c6 100644 --- a/src/auto_archiver/modules/generic_extractor/facebook.py +++ b/src/auto_archiver/modules/generic_extractor/facebook.py @@ -142,7 +142,7 @@ class Facebook(GenericDropin): result.set_url(url) return result - def is_suitable(self, url, info_extractor: FacebookIE): + def suitable(self, url, info_extractor: FacebookIE): regex = r"(?:https?://(?:[\w-]+\.)?(?:facebook\.com||facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd\.onion)/)" return re.match(regex, url) diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 2f44ba8..e7d6be0 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -13,6 +13,7 @@ from loguru import logger from auto_archiver.core.extractor import Extractor from auto_archiver.core import Metadata, Media +from .dropin import GenericDropin class SkipYtdlp(Exception): @@ -71,14 +72,9 @@ class GenericExtractor(Extractor): continue # check if there's a dropin and see if that declares whether it's suitable - dropin = self.dropin_for_name(info_extractor.ie_key()) - if dropin and dropin.is_suitable(url, info_extractor): + dropin: GenericDropin = self.dropin_for_name(info_extractor.ie_key()) + if dropin and dropin.suitable(url, info_extractor): yield info_extractor - continue - - if info_extractor.suitable(url): - yield info_extractor - continue def suitable(self, url: str) -> bool: """ @@ -300,7 +296,7 @@ class GenericExtractor(Extractor): return self.add_metadata(data, info_extractor, url, result) - def dropin_for_name(self, dropin_name: str, additional_paths=[], package=__package__) -> Type[InfoExtractor]: + def dropin_for_name(self, dropin_name: 
str, additional_paths=[], package=__package__) -> GenericDropin: dropin_name = dropin_name.lower() if dropin_name == "generic": diff --git a/src/auto_archiver/modules/generic_extractor/tiktok.py b/src/auto_archiver/modules/generic_extractor/tiktok.py index b25abca..e545ba9 100644 --- a/src/auto_archiver/modules/generic_extractor/tiktok.py +++ b/src/auto_archiver/modules/generic_extractor/tiktok.py @@ -1,5 +1,8 @@ import requests from loguru import logger + +from yt_dlp.extractor.tiktok import TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE + from auto_archiver.core import Metadata, Media from datetime import datetime, timezone from .dropin import GenericDropin @@ -13,6 +16,11 @@ class Tiktok(GenericDropin): TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}" + def suitable(self, url, info_extractor): + """This dropin (which uses Tikwm) is suitable for *all* Tiktok type URLs - videos, lives, VMs, and users. + Return the 'suitable' method from the TikTokIE class.""" + return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE)) + def extract_post(self, url: str, ie_instance): logger.debug(f"Using Tikwm API to attempt to download tiktok video from {url=}") From 89e387030dd861384298636f18ad668595d3e3fa Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 18 Mar 2025 09:59:59 +0000 Subject: [PATCH 2/4] Tests for suitable URLs for tikwm --- .../generic_extractor/generic_extractor.py | 2 ++ .../modules/generic_extractor/tiktok.py | 2 +- tests/conftest.py | 2 +- .../extractors/test_tiktok_tikwm_extractor.py | 28 ++++++++++++++++++- 4 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index e7d6be0..80556bf 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -75,6 +75,8 @@ class 
GenericExtractor(Extractor): dropin: GenericDropin = self.dropin_for_name(info_extractor.ie_key()) if dropin and dropin.suitable(url, info_extractor): yield info_extractor + elif info_extractor.suitable(url): + yield info_extractor def suitable(self, url: str) -> bool: """ diff --git a/src/auto_archiver/modules/generic_extractor/tiktok.py b/src/auto_archiver/modules/generic_extractor/tiktok.py index e545ba9..e44714e 100644 --- a/src/auto_archiver/modules/generic_extractor/tiktok.py +++ b/src/auto_archiver/modules/generic_extractor/tiktok.py @@ -16,7 +16,7 @@ class Tiktok(GenericDropin): TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}" - def suitable(self, url, info_extractor): + def suitable(self, url, info_extractor) -> bool: """This dropin (which uses Tikwm) is suitable for *all* Tiktok type URLs - videos, lives, VMs, and users. Return the 'suitable' method from the TikTokIE class.""" return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE)) diff --git a/tests/conftest.py b/tests/conftest.py index 379bfc2..6e87e26 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -118,7 +118,7 @@ def pytest_runtest_setup(item): pytest.xfail(f"previous test failed ({test_name})") -@pytest.fixture() +@pytest.fixture def unpickle(): """ Returns a helper function that unpickles a file diff --git a/tests/extractors/test_tiktok_tikwm_extractor.py b/tests/extractors/test_tiktok_tikwm_extractor.py index d04d7e4..81f29a5 100644 --- a/tests/extractors/test_tiktok_tikwm_extractor.py +++ b/tests/extractors/test_tiktok_tikwm_extractor.py @@ -4,6 +4,8 @@ import pytest import yt_dlp from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor +from auto_archiver.modules.generic_extractor.tiktok import Tiktok, TikTokIE + from .test_extractor_base import TestExtractorBase @@ -17,11 +19,16 @@ def skip_ytdlp_own_methods(mocker): ) -@pytest.fixture() +@pytest.fixture def mock_get(mocker): return 
mocker.patch("auto_archiver.modules.generic_extractor.tiktok.requests.get") +@pytest.fixture +def tiktok_dropin() -> Tiktok: + return Tiktok() + + class TestTiktokTikwmExtractor(TestExtractorBase): """ Test suite for TestTiktokTikwmExtractor. @@ -34,6 +41,25 @@ class TestTiktokTikwmExtractor(TestExtractorBase): VALID_EXAMPLE_URL = "https://www.tiktok.com/@example/video/1234" + @pytest.mark.parametrize( + "url, is_suitable", + [ + ("https://bellingcat.com", False), + ("https://youtube.com", False), + ("https://tiktok.co/", False), + ("https://tiktok.com/", False), + ("https://www.tiktok.com/", False), + ("https://api.cool.tiktok.com/", False), + (VALID_EXAMPLE_URL, True), + ("https://www.tiktok.com/@bbcnews/video/7478038212070411542", True), + ("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True), + ("https://www.tiktok.com/t/ZP8YQ8e5j/", True), + ("https://vt.tiktok.com/ZSMTJeqRP/", True), + ], + ) + def test_is_suitable(self, url, is_suitable, tiktok_dropin): + assert tiktok_dropin.suitable(url, TikTokIE()) == is_suitable + def test_invalid_json_responses(self, mock_get, make_item, caplog): mock_get.return_value.status_code = 200 mock_get.return_value.json.side_effect = ValueError From d03ecdb037174823aef53a193be2c25ad90a2866 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 18 Mar 2025 10:22:58 +0000 Subject: [PATCH 3/4] Standardise parse dates to get_datetime_from_str --- .../modules/generic_extractor/generic_extractor.py | 3 ++- src/auto_archiver/modules/generic_extractor/twitter.py | 5 ++--- .../modules/twitter_api_extractor/twitter_api_extractor.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 80556bf..e7b75d9 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ 
-13,6 +13,7 @@ from loguru import logger from auto_archiver.core.extractor import Extractor from auto_archiver.core import Metadata, Media +from auto_archiver.utils import get_datetime_from_str from .dropin import GenericDropin @@ -223,7 +224,7 @@ class GenericExtractor(Extractor): timestamp = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc).isoformat() result.set_timestamp(timestamp) if upload_date := video_data.pop("upload_date", None) and not result.get("upload_date"): - upload_date = datetime.datetime.strptime(upload_date, "%Y%m%d").replace(tzinfo=datetime.timezone.utc) + upload_date = get_datetime_from_str(upload_date, "%Y%m%d").replace(tzinfo=datetime.timezone.utc) result.set("upload_date", upload_date) # then clean away any keys we don't want diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py index e4cbe74..e27a0c1 100644 --- a/src/auto_archiver/modules/generic_extractor/twitter.py +++ b/src/auto_archiver/modules/generic_extractor/twitter.py @@ -1,13 +1,12 @@ import re import mimetypes import json -from datetime import datetime from loguru import logger from slugify import slugify from auto_archiver.core.metadata import Metadata, Media -from auto_archiver.utils import url as UrlUtil +from auto_archiver.utils import url as UrlUtil, get_datetime_from_str from auto_archiver.core.extractor import Extractor from .dropin import GenericDropin, InfoExtractor @@ -38,7 +37,7 @@ class Twitter(GenericDropin): try: if not tweet.get("user") or not tweet.get("created_at"): raise ValueError("Error retreiving post. 
Are you sure it exists?") - timestamp = datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y") + timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y") except (ValueError, KeyError) as ex: logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}") return False diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py index 1c08235..1b9eb75 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py +++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py @@ -2,7 +2,6 @@ import json import re import mimetypes import requests -from datetime import datetime from loguru import logger from pytwitter import Api @@ -10,6 +9,7 @@ from slugify import slugify from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media +from auto_archiver.utils import get_datetime_from_str class TwitterApiExtractor(Extractor): @@ -91,7 +91,7 @@ class TwitterApiExtractor(Extractor): result = Metadata() result.set_title(tweet.data.text) - result.set_timestamp(datetime.strptime(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ")) + result.set_timestamp(get_datetime_from_str(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ")) urls = [] if tweet.includes: From 23e74803eea3fe19f001cf4b7420727595d2e372 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 18 Mar 2025 10:50:48 +0000 Subject: [PATCH 4/4] Version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6896e6d..89bd4eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [project] name = "auto-archiver" -version = "0.13.6" +version = "0.13.7" description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)." 
requires-python = ">=3.10,<3.13"