Merge pull request #246 from bellingcat/webdriver-cookies

Better checking of cookies to add to webdriver + generic extractor tweaks
2026-06-07 19:08:30 +03:00 · 2025-03-14 13:35:58 +00:00
parent a01a873f37 562d06916e
commit 733aef0b08
3 changed files with 55 additions and 32 deletions
--- a/src/auto_archiver/modules/generic_extractor/manifest.py
+++ b/src/auto_archiver/modules/generic_extractor/manifest.py
@@ -76,5 +76,12 @@ If you are having issues with the extractor, you can review the version of `yt-d
            "help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.",
            "type": "int",
        },
+        "ytdlp_args": {
+            "default": "",
+            "help": "Additional arguments to pass to yt-dlp, e.g. --no-check-certificate or --plugin-dirs.\
+ See yt-dlp documentation here for more information: https://github.com/yt-dlp/yt-dlp?tab=readme-ov-file#general-options\
+ Note: this is not to be confused with 'extractor_args' which are specific to the extractor itself.",
+            "type": "str",
+        },
    },
 }
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -2,6 +2,7 @@ import datetime
 import os
 import importlib
 import subprocess
+
 from typing import Generator, Type

 import yt_dlp
@@ -234,7 +235,7 @@ class GenericExtractor(Extractor):

        if not dropin:
            # TODO: add a proper link to 'how to create your own dropin'
-            logger.debug(f"""Could not find valid dropin for {info_extractor.IE_NAME}.
+            logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}.
                     Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
            return False

@@ -352,7 +353,7 @@ class GenericExtractor(Extractor):
            result = self.get_metadata_for_video(data, info_extractor, url, ydl)

        except Exception as e:
-            if info_extractor.ie_key() == "generic":
+            if info_extractor.IE_NAME == "generic":
                # don't clutter the logs with issues about the 'generic' extractor not having a dropin
                return False

@@ -395,17 +396,19 @@ class GenericExtractor(Extractor):
            url = url.replace("https://ya.ru", "https://yandex.ru")
            item.set("replaced_url", url)

-        ydl_options = {
-            "outtmpl": os.path.join(self.tmp_dir, "%(id)s.%(ext)s"),
-            "quiet": False,
-            "noplaylist": not self.allow_playlist,
-            "writesubtitles": self.subtitles,
-            "writeautomaticsub": self.subtitles,
-            "live_from_start": self.live_from_start,
-            "proxy": self.proxy,
-            "max_downloads": self.max_downloads,
-            "playlistend": self.max_downloads,
-        }
+        ydl_options = [
+            "-o",
+            os.path.join(self.tmp_dir, "%(id)s.%(ext)s"),
+            "--quiet",
+            "--no-playlist" if not self.allow_playlist else "--yes-playlist",
+            "--write-subs" if self.subtitles else "--no-write-subs",
+            "--write-auto-subs" if self.subtitles else "--no-write-auto-subs",
+            "--live-from-start" if self.live_from_start else "--no-live-from-start",
+            "--proxy",
+            self.proxy if self.proxy else "",
+            f"--max-downloads {self.max_downloads}" if self.max_downloads != "inf" else "",
+            f"--playlist-end {self.max_downloads}" if self.max_downloads != "inf" else "",
+        ]

        # set up auth
        auth = self.auth_for_site(url, extract_cookies=False)
@@ -414,20 +417,25 @@ class GenericExtractor(Extractor):
        if auth:
            if "username" in auth and "password" in auth:
                logger.debug(f"Using provided auth username and password for {url}")
-                ydl_options["username"] = auth["username"]
-                ydl_options["password"] = auth["password"]
+                ydl_options.extend(("--username", auth["username"]))
+                ydl_options.extend(("--password", auth["password"]))
            elif "cookie" in auth:
                logger.debug(f"Using provided auth cookie for {url}")
                yt_dlp.utils.std_headers["cookie"] = auth["cookie"]
            elif "cookies_from_browser" in auth:
                logger.debug(f"Using extracted cookies from browser {auth['cookies_from_browser']} for {url}")
-                ydl_options["cookiesfrombrowser"] = auth["cookies_from_browser"]
+                ydl_options.extend(("--cookies-from-browser", auth["cookies_from_browser"]))
            elif "cookies_file" in auth:
                logger.debug(f"Using cookies from file {auth['cookies_file']} for {url}")
-                ydl_options["cookiefile"] = auth["cookies_file"]
+                ydl_options.extend(("--cookies", auth["cookies_file"]))

+        if self.ytdlp_args:
+            logger.debug(f"Adding additional ytdlp arguments: {self.ytdlp_args}")
+            ydl_options += self.ytdlp_args.split(" ")
+
+        *_, validated_options = yt_dlp.parse_options(ydl_options)
        ydl = yt_dlp.YoutubeDL(
-            ydl_options
+            validated_options
        )  # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"

        for info_extractor in self.suitable_extractors(url):
--- a/src/auto_archiver/utils/webdriver.py
+++ b/src/auto_archiver/utils/webdriver.py
@@ -4,6 +4,7 @@ from __future__ import annotations

 import os
 import time
+import re

 # import domain_for_url
 from urllib.parse import urlparse, urlunparse
@@ -47,9 +48,10 @@ class CookieSettingDriver(webdriver.Firefox):
                    for name, value in cookie.split("="):
                        self.driver.add_cookie({"name": name, "value": value})
            elif self.cookiejar:
-                domain = urlparse(url).netloc.lstrip("www.")
+                domain = urlparse(url).netloc
+                regex = re.compile(f"(www)?\.?{domain}$")
                for cookie in self.cookiejar:
-                    if domain in cookie.domain:
+                    if regex.match(cookie.domain):
                        try:
                            self.add_cookie(
                                {
@@ -62,27 +64,33 @@ class CookieSettingDriver(webdriver.Firefox):
                                }
                            )
                        except Exception as e:
-                            logger.warning(f"Failed to add cookie to webdriver: {e}")
+                            logger.warning(f"Failed to add cookie ({cookie.domain}) to webdriver for url {domain}: {e}")

-        if self.facebook_accept_cookies:
+        super(CookieSettingDriver, self).get(url)
+        time.sleep(2)
+
+        # Try and use some common button text to reject/accept cookies
+        for text in [
+            "Refuse non-essential cookies",
+            "Decline optional cookies",
+            "Reject additional cookies",
+            "Reject all",
+            "Accept all cookies",
+        ]:
            try:
-                logger.debug("Trying fb click accept cookie popup.")
-                super(CookieSettingDriver, self).get("http://www.facebook.com")
-                essential_only = self.find_element(By.XPATH, "//span[contains(text(), 'Decline optional cookies')]")
-                essential_only.click()
-                logger.debug("fb click worked")
-                # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
+                xpath = f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text.lower()}')]"
+                self.find_element(By.XPATH, xpath).click()
                time.sleep(2)
-            except Exception as e:
-                logger.warning("Failed on fb accept cookies.", e)
+            except selenium_exceptions.NoSuchElementException:
+                pass

        # now get the actual URL
-        super(CookieSettingDriver, self).get(url)
        if self.facebook_accept_cookies:
            # try and click the 'close' button on the 'login' window to close it
            try:
                xpath = "//div[@role='dialog']//div[@aria-label='Close']"
-                WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click()
+                self.find_element(By.XPATH, xpath).click()
+                time.sleep(2)
            except selenium_exceptions.NoSuchElementException:
                logger.warning("Unable to find the 'close' button on the facebook login window")
                pass