From a8c1ef3912ff3728891b52d8565cf1e352b3a359 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Sat, 5 Jul 2025 16:54:58 +0100
Subject: [PATCH] generic_extractor config to use proxy only when needed to
 avoid overzealousness

---
 .../modules/generic_extractor/__manifest__.py  |  4 ++++
 .../generic_extractor/generic_extractor.py     | 18 ++++++++++++++++--
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/auto_archiver/modules/generic_extractor/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py
index 62bd4c8..06fc3ac 100644
--- a/src/auto_archiver/modules/generic_extractor/__manifest__.py
+++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py
@@ -60,6 +60,10 @@ If you are having issues with the extractor, you can review the version of `yt-d
             "default": "",
             "help": "http/https/socks proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port",
         },
+        "proxy_on_failure_only": {
+            "default": True,
+            "help": "Applies only if a proxy is set. In that case if this setting is True, the extractor will only use the proxy if the initial request fails; if it is False, the extractor will always use the proxy.",
+        },
         "end_means_success": {
             "default": True,
             "help": "if True, any archived content will mean a 'success', if False this extractor will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent extractors can retrieve.",
diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
index e0d3f04..e536391 100644
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -502,6 +502,9 @@ class GenericExtractor(Extractor):
             try:
                 result = self.get_metadata_for_post(info_extractor, url, ydl)
             except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
+                if "NSFW tweet requires authentication." in str(post_e):
+                    logger.warning(str(post_e))
+                    return False
                 logger.error("Error downloading metadata for post: {error}", error=str(post_e))
                 return False
             except Exception as generic_e:
@@ -525,13 +528,24 @@ class GenericExtractor(Extractor):
 
         return result
 
-    def download(self, item: Metadata) -> Metadata:
+    def download(self, item: Metadata, skip_proxy: bool = False) -> Metadata:
         url = item.get_url()
 
         # TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025
         if url.startswith("https://ya.ru"):
             url = url.replace("https://ya.ru", "https://yandex.ru")
             item.set("replaced_url", url)
+        logger.debug(f"{skip_proxy=}, {self.proxy_on_failure_only=}, {self.proxy=}")
+
+        # proxy_on_failure_only logic
+        if self.proxy and self.proxy_on_failure_only and not skip_proxy:
+            # when proxy_on_failure_only is True, first attempt the download without the proxy; fall through to the proxied attempt below only if that attempt raises or returns a falsy result
+            try:
+                if without_proxy := self.download(item, skip_proxy=True):
+                    logger.info("Downloaded successfully without proxy.")
+                    return without_proxy
+            except Exception:
+                logger.debug("Download without proxy failed, trying with proxy...")
 
         ydl_options = [
             "-o",
@@ -546,7 +560,7 @@ class GenericExtractor(Extractor):
         ]
 
         # proxy handling
-        if self.proxy:
+        if self.proxy and not skip_proxy:
             ydl_options.extend(["--proxy", self.proxy])
 
         # max_downloads handling