From 7a81ab617a665a7768e6d9984a16b9ee8b77baa2 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 11 Mar 2025 11:57:25 +0000 Subject: [PATCH 1/7] Better checking of cookies to add to webdriver --- src/auto_archiver/utils/webdriver.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/auto_archiver/utils/webdriver.py b/src/auto_archiver/utils/webdriver.py index cb4e2a9..af3b7dd 100644 --- a/src/auto_archiver/utils/webdriver.py +++ b/src/auto_archiver/utils/webdriver.py @@ -3,6 +3,7 @@ from __future__ import annotations import os import time +import re #import domain_for_url from urllib.parse import urlparse, urlunparse @@ -48,8 +49,9 @@ class CookieSettingDriver(webdriver.Firefox): self.driver.add_cookie({'name': name, 'value': value}) elif self.cookiejar: domain = urlparse(url).netloc.lstrip("www.") + regex = re.compile(f"(www)?\.?{domain}$") for cookie in self.cookiejar: - if domain in cookie.domain: + if regex.match(cookie.domain): try: self.add_cookie({ 'name': cookie.name, @@ -60,7 +62,7 @@ class CookieSettingDriver(webdriver.Firefox): 'expiry': cookie.expires }) except Exception as e: - logger.warning(f"Failed to add cookie to webdriver: {e}") + logger.warning(f"Failed to add cookie ({cookie.domain}) to webdriver for url {domain}: {e}") if self.facebook_accept_cookies: try: @@ -81,7 +83,7 @@ class CookieSettingDriver(webdriver.Firefox): # try and click the 'close' button on the 'login' window to close it try: xpath = "//div[@role='dialog']//div[@aria-label='Close']" - WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click() + WebDriverWait(self, 2).until(EC.element_to_be_clickable((By.XPATH, xpath))).click() except selenium_exceptions.NoSuchElementException: logger.warning("Unable to find the 'close' button on the facebook login window") pass From 0efeaaabb181f34e0a692443c4651b2f171a2eb5 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 11 Mar 2025 12:24:52 +0000 Subject: [PATCH 2/7] Revert to using time.sleep and .click() - since we only want to be waiting the first time (for the page to load) --- src/auto_archiver/utils/webdriver.py | 38 ++++++++++------------------ 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/src/auto_archiver/utils/webdriver.py b/src/auto_archiver/utils/webdriver.py index af3b7dd..ccfead5 100644 --- a/src/auto_archiver/utils/webdriver.py +++ b/src/auto_archiver/utils/webdriver.py @@ -64,41 +64,31 @@ class CookieSettingDriver(webdriver.Firefox): except Exception as e: logger.warning(f"Failed to add cookie ({cookie.domain}) to webdriver for url {domain}: {e}") - if self.facebook_accept_cookies: - try: - logger.debug(f'Trying fb click accept cookie popup.') - super(CookieSettingDriver, self).get("http://www.facebook.com") - essential_only = self.find_element(By.XPATH, "//span[contains(text(), 'Decline optional cookies')]") - essential_only.click() - logger.debug(f'fb click worked') - # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page - time.sleep(2) - except Exception as e: - logger.warning(f'Failed on fb accept cookies.', e) + + super(CookieSettingDriver, self).get(url) + time.sleep(2) + + # Try and use some common button text to reject/accept cookies + for text in ["Refuse non-essential cookies", "Decline optional cookies", "Reject additional cookies", "Reject all", "Accept all cookies"]: + try: + xpath = f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text.lower()}')]" + self.find_element(By.XPATH, xpath).click() + time.sleep(2) + except selenium_exceptions.NoSuchElementException: + pass # now get the actual URL - super(CookieSettingDriver, self).get(url) if self.facebook_accept_cookies: # try and click the 'close' button on the 'login' window to close it try: xpath = "//div[@role='dialog']//div[@aria-label='Close']" - WebDriverWait(self, 2).until(EC.element_to_be_clickable((By.XPATH, xpath))).click() + self.find_element(By.XPATH, xpath).click() + time.sleep(2) except selenium_exceptions.NoSuchElementException: logger.warning("Unable to find the 'close' button on the facebook login window") pass - else: - - # for all other sites, try and use some common button text to reject/accept cookies - for text in ["Refuse non-essential cookies", "Decline optional cookies", "Reject additional cookies", "Reject all", "Accept all cookies"]: - try: - xpath = f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text.lower()}')]" - WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click() - break - except selenium_exceptions.WebDriverException: - pass - class Webdriver: def __init__(self, width: int, height: int, timeout_seconds: int, From 589c834047b21fe804b81997208e01f666142ee7 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 11 Mar 2025 12:25:19 +0000 Subject: [PATCH 3/7] Fix parsing ytdlp args - we should first run them through the parse_options method --- .../modules/generic_extractor/__manifest__.py | 7 ++++ .../generic_extractor/generic_extractor.py | 38 ++++++++++++------- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/src/auto_archiver/modules/generic_extractor/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py index 1d3b365..274a4ba 100644 --- a/src/auto_archiver/modules/generic_extractor/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -76,5 +76,12 @@ If you are having issues with the extractor, you can review the version of `yt-d "help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.", "type": "int", }, + "ytdlp_args": { + "default": "", + "help": "Additional arguments to pass to yt-dlp, e.g. --no-check-certificate or --plugin-dirs.\ +See yt-dlp documentation here for more information: https://github.com/yt-dlp/yt-dlp?tab=readme-ov-file#general-options\ +Note: this is not to be confused with 'extractor_args' which are specific to the extractor itself.", + "type": "str", + }, }, } diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 5acce46..56164ff 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -1,6 +1,7 @@ import datetime, os import importlib import subprocess + from typing import Generator, Type import yt_dlp @@ -166,7 +167,7 @@ class GenericExtractor(Extractor): if not dropin: # TODO: add a proper link to 'how to create your own dropin' - logger.debug(f"""Could not find valid dropin for {info_extractor.IE_NAME}. + logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}. Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""") return False @@ -279,18 +280,18 @@ class GenericExtractor(Extractor): result = self.get_metadata_for_video(data, info_extractor, url, ydl) except Exception as e: - if info_extractor.ie_key() == "generic": + if info_extractor.IE_NAME == "generic": # don't clutter the logs with issues about the 'generic' extractor not having a dropin return False - logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead') + logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead') try: result = self.get_metadata_for_post(info_extractor, url, ydl) except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e: logger.error(f'Error downloading metadata for post: {post_e}') return False except Exception as generic_e: - logger.debug(f'Attempt to extract using ytdlp extractor "{info_extractor.IE_NAME}" failed: \n {repr(generic_e)}', exc_info=True) + logger.debug(f'Attempt to extract using ytdlp dropin "{info_extractor.IE_NAME}" failed: \n {repr(generic_e)}', exc_info=True) return False if result: @@ -314,11 +315,16 @@ class GenericExtractor(Extractor): item.set("replaced_url", url) - ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'), - 'quiet': False, 'noplaylist': not self.allow_playlist , - 'writesubtitles': self.subtitles,'writeautomaticsub': self.subtitles, - "live_from_start": self.live_from_start, "proxy": self.proxy, - "max_downloads": self.max_downloads, "playlistend": self.max_downloads} + ydl_options = ["-o", os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'), + "--quiet", + "--no-playlist" if not self.allow_playlist else "--yes-playlist", + "--write-subs" if self.subtitles else "--no-write-subs", + "--write-auto-subs" if self.subtitles else "--no-write-auto-subs", + "--live-from-start" if self.live_from_start else "--no-live-from-start", + "--proxy", self.proxy if self.proxy else '', + f"--max-downloads {self.max_downloads}" if self.max_downloads != "inf" else '', + f"--playlist-end {self.max_downloads}" if self.max_downloads != "inf" else '' + ] # set up auth auth = self.auth_for_site(url, extract_cookies=False) @@ -327,19 +333,23 @@ class GenericExtractor(Extractor): if auth: if 'username' in auth and 'password' in auth: logger.debug(f'Using provided auth username and password for {url}') - ydl_options['username'] = auth['username'] - ydl_options['password'] = auth['password'] + ydl_options.extend(('--username', auth['username'])) + ydl_options.extend(('--password', auth['password'])) elif 'cookie' in auth: logger.debug(f'Using provided auth cookie for {url}') yt_dlp.utils.std_headers['cookie'] = auth['cookie'] elif 'cookies_from_browser' in auth: logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}') - ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser'] + ydl_options.extend(('--cookies-from-browser', auth['cookies_from_browser'])) elif 'cookies_file' in auth: logger.debug(f'Using cookies from file {auth["cookies_file"]} for {url}') - ydl_options['cookiefile'] = auth['cookies_file'] + ydl_options.extend(('--cookies', auth['cookies_file'])) - ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en" + if self.ytdlp_args: + ydl_options += self.ytdlp_args.split(" ") + + _, _, _, validated_options = yt_dlp.parse_options(ydl_options) + ydl = yt_dlp.YoutubeDL(validated_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en" for info_extractor in self.suitable_extractors(url): result = self.download_for_extractor(info_extractor, url, ydl) From f6b13327f0329b771709a0550895810b91f6cf39 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 13 Mar 2025 16:03:52 +0000 Subject: [PATCH 4/7] Tweaks and additional debug logging --- .../modules/generic_extractor/generic_extractor.py | 3 ++- src/auto_archiver/utils/webdriver.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 56164ff..a75e874 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -346,9 +346,10 @@ class GenericExtractor(Extractor): ydl_options.extend(('--cookies', auth['cookies_file'])) if self.ytdlp_args: + logger.debug("Adding additional ytdlp arguments: {self.ytdlp_args}") ydl_options += self.ytdlp_args.split(" ") - _, _, _, validated_options = yt_dlp.parse_options(ydl_options) + *_, validated_options = yt_dlp.parse_options(ydl_options) ydl = yt_dlp.YoutubeDL(validated_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en" for info_extractor in self.suitable_extractors(url): diff --git a/src/auto_archiver/utils/webdriver.py b/src/auto_archiver/utils/webdriver.py index ccfead5..57f2cf1 100644 --- a/src/auto_archiver/utils/webdriver.py +++ b/src/auto_archiver/utils/webdriver.py @@ -48,7 +48,7 @@ class CookieSettingDriver(webdriver.Firefox): for name, value in cookie.split("="): self.driver.add_cookie({'name': name, 'value': value}) elif self.cookiejar: - domain = urlparse(url).netloc.lstrip("www.") + domain = urlparse(url).netloc regex = re.compile(f"(www)?\.?{domain}$") for cookie in self.cookiejar: if regex.match(cookie.domain): From abaeec0cc6342ee7e843b05b6cc2d029d2103465 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Fri, 14 Mar 2025 12:48:06 +0000 Subject: [PATCH 5/7] Add ruff check --- .pre-commit-config.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 78421d7..833a540 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,10 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.9.10 hooks: - - id: ruff-format + - id: ruff + args: [ --fix ] + - id: ruff-format + # Runs Ruff linting - just checks without fixing, but blocks commit if errors are found. # - id: ruff From a8e5585e6c40c5dad8fad32e591fb90d7c52217e Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Fri, 14 Mar 2025 12:51:47 +0000 Subject: [PATCH 6/7] github format --- .pre-commit-config.yaml | 2 +- .../modules/generic_extractor/generic_extractor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 833a540..0ec35a5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: rev: v0.9.10 hooks: - id: ruff - args: [ --fix ] + args: [ --fix, --output-format=github] - id: ruff-format diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 72b526d..534fb71 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -398,7 +398,7 @@ class GenericExtractor(Extractor): ydl_options = [ "-o", - os.path.join(self.tmp_dir, f"%(id)s.%(ext)s"), + os.path.join(self.tmp_dir, "%(id)s.%(ext)s"), "--quiet", "--no-playlist" if not self.allow_playlist else "--yes-playlist", "--write-subs" if self.subtitles else "--no-write-subs", From 562d06916ecd5bf8d1d17c17497e902f4e475a39 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 14 Mar 2025 13:08:57 +0000 Subject: [PATCH 7/7] Revert pre commit --- .pre-commit-config.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0ec35a5..78421d7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,10 +3,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.9.10 hooks: - - id: ruff - args: [ --fix, --output-format=github] - - id: ruff-format - + - id: ruff-format # Runs Ruff linting - just checks without fixing, but blocks commit if errors are found. # - id: ruff