Merge pull request #246 from bellingcat/webdriver-cookies

Better checking of cookies to add to webdriver + generic extractor tweaks
This commit is contained in:
Patrick Robertson
2025-03-14 13:35:58 +00:00
committed by GitHub
3 changed files with 55 additions and 32 deletions

View File

@@ -76,5 +76,12 @@ If you are having issues with the extractor, you can review the version of `yt-d
"help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.",
"type": "int",
},
"ytdlp_args": {
"default": "",
"help": "Additional arguments to pass to yt-dlp, e.g. --no-check-certificate or --plugin-dirs.\
See yt-dlp documentation here for more information: https://github.com/yt-dlp/yt-dlp?tab=readme-ov-file#general-options\
Note: this is not to be confused with 'extractor_args' which are specific to the extractor itself.",
"type": "str",
},
},
}

View File

@@ -2,6 +2,7 @@ import datetime
import os
import importlib
import subprocess
from typing import Generator, Type
import yt_dlp
@@ -234,7 +235,7 @@ class GenericExtractor(Extractor):
if not dropin:
# TODO: add a proper link to 'how to create your own dropin'
logger.debug(f"""Could not find valid dropin for {info_extractor.IE_NAME}.
logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}.
Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
return False
@@ -352,7 +353,7 @@ class GenericExtractor(Extractor):
result = self.get_metadata_for_video(data, info_extractor, url, ydl)
except Exception as e:
if info_extractor.ie_key() == "generic":
if info_extractor.IE_NAME == "generic":
# don't clutter the logs with issues about the 'generic' extractor not having a dropin
return False
@@ -395,17 +396,19 @@ class GenericExtractor(Extractor):
url = url.replace("https://ya.ru", "https://yandex.ru")
item.set("replaced_url", url)
ydl_options = {
"outtmpl": os.path.join(self.tmp_dir, "%(id)s.%(ext)s"),
"quiet": False,
"noplaylist": not self.allow_playlist,
"writesubtitles": self.subtitles,
"writeautomaticsub": self.subtitles,
"live_from_start": self.live_from_start,
"proxy": self.proxy,
"max_downloads": self.max_downloads,
"playlistend": self.max_downloads,
}
# Command-line style options for yt-dlp; the list is later handed to
# yt_dlp.parse_options, so every flag and every flag value must be its own
# list element (a combined token such as "--max-downloads 5" is rejected as
# an unknown option by the parser).
ydl_options = [
    "-o",
    os.path.join(self.tmp_dir, "%(id)s.%(ext)s"),
    "--quiet",
    "--no-playlist" if not self.allow_playlist else "--yes-playlist",
    "--write-subs" if self.subtitles else "--no-write-subs",
    "--write-auto-subs" if self.subtitles else "--no-write-auto-subs",
    "--live-from-start" if self.live_from_start else "--no-live-from-start",
    "--proxy",
    self.proxy if self.proxy else "",
]
# "inf" means no limit: only add the caps when a concrete number is configured,
# rather than appending empty-string placeholders to the argument list.
if self.max_downloads != "inf":
    ydl_options.extend(("--max-downloads", str(self.max_downloads)))
    ydl_options.extend(("--playlist-end", str(self.max_downloads)))
# set up auth
auth = self.auth_for_site(url, extract_cookies=False)
@@ -414,20 +417,25 @@ class GenericExtractor(Extractor):
if auth:
if "username" in auth and "password" in auth:
logger.debug(f"Using provided auth username and password for {url}")
ydl_options["username"] = auth["username"]
ydl_options["password"] = auth["password"]
ydl_options.extend(("--username", auth["username"]))
ydl_options.extend(("--password", auth["password"]))
elif "cookie" in auth:
logger.debug(f"Using provided auth cookie for {url}")
yt_dlp.utils.std_headers["cookie"] = auth["cookie"]
elif "cookies_from_browser" in auth:
logger.debug(f"Using extracted cookies from browser {auth['cookies_from_browser']} for {url}")
ydl_options["cookiesfrombrowser"] = auth["cookies_from_browser"]
ydl_options.extend(("--cookies-from-browser", auth["cookies_from_browser"]))
elif "cookies_file" in auth:
logger.debug(f"Using cookies from file {auth['cookies_file']} for {url}")
ydl_options["cookiefile"] = auth["cookies_file"]
ydl_options.extend(("--cookies", auth["cookies_file"]))
# Append any user-supplied extra yt-dlp arguments to the option list.
if self.ytdlp_args:
    import shlex

    # f-string so the actual arguments (not the literal placeholder) are logged
    logger.debug(f"Adding additional ytdlp arguments: {self.ytdlp_args}")
    # shell-style splitting keeps quoted values (e.g. paths containing spaces)
    # together and does not produce empty tokens for repeated spaces
    ydl_options += shlex.split(self.ytdlp_args)
*_, validated_options = yt_dlp.parse_options(ydl_options)
ydl = yt_dlp.YoutubeDL(
ydl_options
validated_options
) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
for info_extractor in self.suitable_extractors(url):

View File

@@ -4,6 +4,7 @@ from __future__ import annotations
import os
import time
import re
# import domain_for_url
from urllib.parse import urlparse, urlunparse
@@ -47,9 +48,10 @@ class CookieSettingDriver(webdriver.Firefox):
for name, value in cookie.split("="):
self.driver.add_cookie({"name": name, "value": value})
elif self.cookiejar:
domain = urlparse(url).netloc.lstrip("www.")
domain = urlparse(url).netloc
# Pattern for cookie domains equal to the URL's host, optionally prefixed with
# "www" and/or a leading dot. The host is escaped so its dots match literally,
# and a raw string is used so "\." is a regex escape, not a string escape.
regex = re.compile(rf"(www)?\.?{re.escape(domain)}$")
for cookie in self.cookiejar:
if domain in cookie.domain:
if regex.match(cookie.domain):
try:
self.add_cookie(
{
@@ -62,27 +64,33 @@ class CookieSettingDriver(webdriver.Firefox):
}
)
except Exception as e:
logger.warning(f"Failed to add cookie to webdriver: {e}")
logger.warning(f"Failed to add cookie ({cookie.domain}) to webdriver for url {domain}: {e}")
if self.facebook_accept_cookies:
super(CookieSettingDriver, self).get(url)
time.sleep(2)
# Try and use some common button text to reject/accept cookies
for text in [
"Refuse non-essential cookies",
"Decline optional cookies",
"Reject additional cookies",
"Reject all",
"Accept all cookies",
]:
try:
logger.debug("Trying fb click accept cookie popup.")
super(CookieSettingDriver, self).get("http://www.facebook.com")
essential_only = self.find_element(By.XPATH, "//span[contains(text(), 'Decline optional cookies')]")
essential_only.click()
logger.debug("fb click worked")
# linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
xpath = f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text.lower()}')]"
self.find_element(By.XPATH, xpath).click()
time.sleep(2)
except Exception as e:
logger.warning("Failed on fb accept cookies.", e)
except selenium_exceptions.NoSuchElementException:
pass
# now get the actual URL
super(CookieSettingDriver, self).get(url)
if self.facebook_accept_cookies:
# try and click the 'close' button on the 'login' window to close it
try:
xpath = "//div[@role='dialog']//div[@aria-label='Close']"
WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click()
self.find_element(By.XPATH, xpath).click()
time.sleep(2)
except selenium_exceptions.NoSuchElementException:
logger.warning("Unable to find the 'close' button on the facebook login window")
pass