diff --git a/src/auto_archiver/modules/generic_extractor/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py index 1d3b365..274a4ba 100644 --- a/src/auto_archiver/modules/generic_extractor/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -76,5 +76,12 @@ If you are having issues with the extractor, you can review the version of `yt-d "help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.", "type": "int", }, + "ytdlp_args": { + "default": "", + "help": "Additional arguments to pass to yt-dlp, e.g. --no-check-certificate or --plugin-dirs. \ +See yt-dlp documentation here for more information: https://github.com/yt-dlp/yt-dlp?tab=readme-ov-file#general-options \ +Note: this is not to be confused with 'extractor_args' which are specific to the extractor itself.", + "type": "str", + }, }, } diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index c06e622..534fb71 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -2,6 +2,7 @@ import datetime import os import importlib import subprocess + from typing import Generator, Type import yt_dlp @@ -234,7 +235,7 @@ class GenericExtractor(Extractor): if not dropin: # TODO: add a proper link to 'how to create your own dropin' - logger.debug(f"""Could not find valid dropin for {info_extractor.IE_NAME}. + logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}. Why not try creating your own, and make sure it has a valid function called 'create_metadata'. 
Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""") return False @@ -352,7 +353,7 @@ class GenericExtractor(Extractor): result = self.get_metadata_for_video(data, info_extractor, url, ydl) except Exception as e: - if info_extractor.ie_key() == "generic": + if info_extractor.IE_NAME == "generic": # don't clutter the logs with issues about the 'generic' extractor not having a dropin return False @@ -395,17 +396,19 @@ class GenericExtractor(Extractor): url = url.replace("https://ya.ru", "https://yandex.ru") item.set("replaced_url", url) - ydl_options = { - "outtmpl": os.path.join(self.tmp_dir, "%(id)s.%(ext)s"), - "quiet": False, - "noplaylist": not self.allow_playlist, - "writesubtitles": self.subtitles, - "writeautomaticsub": self.subtitles, - "live_from_start": self.live_from_start, - "proxy": self.proxy, - "max_downloads": self.max_downloads, - "playlistend": self.max_downloads, - } + ydl_options = [ + "-o", + os.path.join(self.tmp_dir, "%(id)s.%(ext)s"), + "--quiet", + "--no-playlist" if not self.allow_playlist else "--yes-playlist", + "--write-subs" if self.subtitles else "--no-write-subs", + "--write-auto-subs" if self.subtitles else "--no-write-auto-subs", + "--live-from-start" if self.live_from_start else "--no-live-from-start", + "--proxy", + self.proxy if self.proxy else "", + f"--max-downloads={self.max_downloads}" if self.max_downloads != "inf" else "", + f"--playlist-end={self.max_downloads}" if self.max_downloads != "inf" else "", + ] # set up auth auth = self.auth_for_site(url, extract_cookies=False) @@ -414,20 +417,25 @@ class GenericExtractor(Extractor): if auth: if "username" in auth and "password" in auth: logger.debug(f"Using provided auth username and password for {url}") - ydl_options["username"] = auth["username"] - ydl_options["password"] = auth["password"] + ydl_options.extend(("--username", auth["username"])) + ydl_options.extend(("--password", auth["password"])) elif "cookie" in auth: logger.debug(f"Using 
provided auth cookie for {url}") yt_dlp.utils.std_headers["cookie"] = auth["cookie"] elif "cookies_from_browser" in auth: logger.debug(f"Using extracted cookies from browser {auth['cookies_from_browser']} for {url}") - ydl_options["cookiesfrombrowser"] = auth["cookies_from_browser"] + ydl_options.extend(("--cookies-from-browser", auth["cookies_from_browser"])) elif "cookies_file" in auth: logger.debug(f"Using cookies from file {auth['cookies_file']} for {url}") - ydl_options["cookiefile"] = auth["cookies_file"] + ydl_options.extend(("--cookies", auth["cookies_file"])) + if self.ytdlp_args: + logger.debug(f"Adding additional ytdlp arguments: {self.ytdlp_args}") + ydl_options += self.ytdlp_args.split() + + *_, validated_options = yt_dlp.parse_options(ydl_options) ydl = yt_dlp.YoutubeDL( - ydl_options + validated_options ) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en" for info_extractor in self.suitable_extractors(url): diff --git a/src/auto_archiver/utils/webdriver.py b/src/auto_archiver/utils/webdriver.py index 2ba185e..c866c25 100644 --- a/src/auto_archiver/utils/webdriver.py +++ b/src/auto_archiver/utils/webdriver.py @@ -4,6 +4,7 @@ from __future__ import annotations import os import time +import re # import domain_for_url from urllib.parse import urlparse, urlunparse @@ -47,9 +48,10 @@ class CookieSettingDriver(webdriver.Firefox): for name, value in cookie.split("="): self.driver.add_cookie({"name": name, "value": value}) elif self.cookiejar: - domain = urlparse(url).netloc.lstrip("www.") + domain = urlparse(url).netloc + regex = re.compile(rf"(www)?\.?{re.escape(domain)}$") for cookie in self.cookiejar: - if domain in cookie.domain: + if regex.match(cookie.domain): try: self.add_cookie( { @@ -62,27 +64,33 @@ class CookieSettingDriver(webdriver.Firefox): } ) except Exception as e: - logger.warning(f"Failed to add cookie to webdriver: {e}") + logger.warning(f"Failed to add cookie ({cookie.domain}) to webdriver for url {domain}: 
{e}") - if self.facebook_accept_cookies: + super(CookieSettingDriver, self).get(url) + time.sleep(2) + + # Try and use some common button text to reject/accept cookies + for text in [ + "Refuse non-essential cookies", + "Decline optional cookies", + "Reject additional cookies", + "Reject all", + "Accept all cookies", + ]: try: - logger.debug("Trying fb click accept cookie popup.") - super(CookieSettingDriver, self).get("http://www.facebook.com") - essential_only = self.find_element(By.XPATH, "//span[contains(text(), 'Decline optional cookies')]") - essential_only.click() - logger.debug("fb click worked") - # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page + xpath = f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text.lower()}')]" + self.find_element(By.XPATH, xpath).click() time.sleep(2) - except Exception as e: - logger.warning("Failed on fb accept cookies.", e) + except selenium_exceptions.NoSuchElementException: + pass # now get the actual URL - super(CookieSettingDriver, self).get(url) if self.facebook_accept_cookies: # try and click the 'close' button on the 'login' window to close it try: xpath = "//div[@role='dialog']//div[@aria-label='Close']" - WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click() + self.find_element(By.XPATH, xpath).click() + time.sleep(2) except selenium_exceptions.NoSuchElementException: logger.warning("Unable to find the 'close' button on the facebook login window") pass