From a8c1ef3912ff3728891b52d8565cf1e352b3a359 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Sat, 5 Jul 2025 16:54:58 +0100 Subject: [PATCH] generic_extractor config to use proxy only when needed to avoid overzealousness --- .../modules/generic_extractor/__manifest__.py | 4 ++++ .../generic_extractor/generic_extractor.py | 18 ++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/auto_archiver/modules/generic_extractor/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py index 62bd4c8..06fc3ac 100644 --- a/src/auto_archiver/modules/generic_extractor/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -60,6 +60,10 @@ If you are having issues with the extractor, you can review the version of `yt-d "default": "", "help": "http/https/socks proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port", }, + "proxy_on_failure_only": { + "default": True, + "help": "Applies only if a proxy is set. In that case if this setting is True, the extractor will only use the proxy if the initial request fails; if it is False, the extractor will always use the proxy.", + }, "end_means_success": { "default": True, "help": "if True, any archived content will mean a 'success', if False this extractor will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent extractors can retrieve.", diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index e0d3f04..e536391 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -502,6 +502,9 @@ class GenericExtractor(Extractor): try: result = self.get_metadata_for_post(info_extractor, url, ydl) except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e: + if "NSFW tweet requires authentication." in str(post_e): + logger.warning(str(post_e)) + return False logger.error("Error downloading metadata for post: {error}", error=str(post_e)) return False except Exception as generic_e: @@ -525,13 +528,24 @@ class GenericExtractor(Extractor): return result - def download(self, item: Metadata) -> Metadata: + def download(self, item: Metadata, skip_proxy: bool = False) -> Metadata: url = item.get_url() # TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025 if url.startswith("https://ya.ru"): url = url.replace("https://ya.ru", "https://yandex.ru") item.set("replaced_url", url) + logger.debug(f"{skip_proxy=}, {self.proxy_on_failure_only=}, {self.proxy=}") + + # proxy_on_failure_only logic + if self.proxy and self.proxy_on_failure_only and not skip_proxy: + # when proxy_on_failure_only is True, we first try to download without a proxy and only continue with execution if that fails + try: + if without_proxy := self.download(item, skip_proxy=True): + logger.info("Downloaded successfully without proxy.") + return without_proxy + except Exception: + logger.debug("Download without proxy failed, trying with proxy...") ydl_options = [ "-o", @@ -546,7 +560,7 @@ class GenericExtractor(Extractor): ] # proxy handling - if self.proxy: + if self.proxy and not skip_proxy: ydl_options.extend(["--proxy", self.proxy]) # max_downloads handling