From 7a2be5a0da13713980ced0a34aed37cc0b891979 Mon Sep 17 00:00:00 2001
From: Patrick Robertson
Date: Mon, 3 Feb 2025 16:03:07 +0100
Subject: [PATCH] Add cookie extraction to 'authentication' options, get generic_extractor working using this info

---
Reviewer notes (not part of the commit message):
 * auth_for_site: `extract_cookies` was accepted but ignored; the cookie jar is
   now only built when it is true. A bare domain ('twitter.com') is accepted as
   well as a full URL, because urlparse() leaves netloc empty without a scheme.
   The docstring lists the cookies_from_browser / cookies_file keys it returns.
 * download: the lookup used 'cookie_from_browser', which never matches the
   'cookies_from_browser' key set by auth_for_site. 'cookiesfrombrowser' is
   given a tuple and the cookie file goes in 'cookiefile', exactly as the code
   removed by this patch did. Log messages use the values from `auth` rather
   than the legacy self.cookies_from_browser / self.cookie_file attributes.
 * Line breaks, indentation and blank context lines were rebuilt from a
   whitespace-collapsed copy of this patch, and the index hashes are carried
   over unchanged. Check the context against the tree before applying.

 src/auto_archiver/core/base_module.py         | 65 +++++++++++++++++++----
 src/auto_archiver/core/orchestrator.py        |  3 +-
 .../generic_extractor/generic_extractor.py    | 29 ++++++----
 3 files changed, 73 insertions(+), 24 deletions(-)

diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py
index 2c1e8a3..d23643c 100644
--- a/src/auto_archiver/core/base_module.py
+++ b/src/auto_archiver/core/base_module.py
@@ -1,5 +1,4 @@
 
-
 from urllib.parse import urlparse
 from typing import Mapping, Any
 from abc import ABC
@@ -80,25 +79,69 @@ class BaseModule(ABC):
         for key, val in config.get(self.name, {}).items():
             setattr(self, key, val)
 
-    def auth_for_site(self, site: str) -> dict:
+    def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]:
+        """
+        Returns the authentication information for a given site. This is used to authenticate
+        with a site before extracting data. The site can be the full URL being archived, or just
+        the domain of the site, e.g. 'twitter.com'
+
+        extract_cookies: bool - whether or not to extract cookies from the given browser and return the
+        cookie jar (disabling can speed up processing if you don't actually need the cookies jar)
+
+        Currently, the dict can have keys of the following types:
+        - username: str - the username to use for login
+        - password: str - the password to use for login
+        - api_key: str - the API key to use for login
+        - api_secret: str - the API secret to use for login
+        - cookie: str - a cookie string to use for login (specific to this site)
+        - cookies_from_browser: the browser given to yt-dlp's --cookies-from-browser (top-level setting, not per-site)
+        - cookies_file: the file given to yt-dlp's --cookies (top-level setting, not per-site)
+        - cookies_jar: YoutubeDLCookieJar | http.cookiejar.MozillaCookieJar - a cookie jar compatible with requests (e.g. `requests.get(cookies=cookie_jar)`), only set when extract_cookies is True
+        """
         # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
-        # for now, just hard code those.
+        # for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
 
         # SECURITY: parse the domain using urllib
-        site = urlparse(site).netloc
+        # urlparse only fills in netloc when there is a scheme (or a leading '//'), so retry for a bare domain
+        site = urlparse(site).netloc or urlparse(f"//{site}").netloc
         # add the 'www' version of the site to the list of sites to check
+        authdict = {}
+
+
         for to_try in [site, f"www.{site}"]:
             if to_try in self.authentication:
-                return self.authentication[to_try]
+                authdict.update(self.authentication[to_try])
+                break
 
         # do a fuzzy string match just to print a warning - don't use it since it's insecure
-        for key in self.authentication.keys():
-            if key in site or site in key:
-                logger.warning(f"Could not find exact authentication information for site '{site}'. \
-                                did find information for '{key}' which is close, is this what you meant? \
-                                If so, edit your authentication settings to make sure it exactly matches.")
+        if not authdict:
+            for key in self.authentication.keys():
+                if key in site or site in key:
+                    logger.debug(f"Could not find exact authentication information for site '{site}'. \
+                                    did find information for '{key}' which is close, is this what you meant? \
+                                    If so, edit your authentication settings to make sure it exactly matches.")
 
-        return {}
+
+        def get_ytdlp_cookiejar(args):
+            import yt_dlp
+            from yt_dlp import parse_options
+
+            # parse_options returns a named tuple as follows, we only need the ydl_options part
+            # collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts'))
+            ytdlp_opts = getattr(parse_options(args), 'ydl_opts')
+            return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar
+
+        # get the cookies jar, prefer the browser cookies than the file
+        if 'cookies_from_browser' in self.authentication:
+            authdict['cookies_from_browser'] = self.authentication['cookies_from_browser']
+            if extract_cookies:
+                authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies-from-browser', self.authentication['cookies_from_browser']])
+        elif 'cookies_file' in self.authentication:
+            authdict['cookies_file'] = self.authentication['cookies_file']
+            if extract_cookies:
+                authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies', self.authentication['cookies_file']])
+
+        return authdict
 
     def repr(self):
         return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
\ No newline at end of file
diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py
index f046bfe..85b3d61 100644
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -259,8 +259,7 @@ class ArchivingOrchestrator:
                 if module == 'cli_feeder':
                     urls = self.config['urls']
                     if not urls:
-                        logger.error("No URLs provided. Please provide at least one URL to archive, or set up a feeder.")
-                        self.basic_parser.print_help()
+                        logger.error("No URLs provided. Please provide at least one URL to archive, or set up a feeder. Use --help for more information.")
                         exit()
                     # cli_feeder is a pseudo module, it just takes the command line args
                     def feed(self) -> Generator[Metadata]:
diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
index 4838489..bc884a6 100644
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -266,23 +266,30 @@ class GenericExtractor(Extractor):
     def download(self, item: Metadata) -> Metadata:
         url = item.get_url()
 
-        if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
-            logger.debug('Using Facebook cookie')
-            yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
 
         ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'),
                        'quiet': False, 'noplaylist': not self.allow_playlist ,
                        'writesubtitles': self.subtitles,'writeautomaticsub': self.subtitles,
                        "live_from_start": self.live_from_start, "proxy": self.proxy,
                        "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
-        
-        if item.netloc in ['youtube.com', 'www.youtube.com']:
-            if self.cookies_from_browser:
-                logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
-                ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
-            elif self.cookie_file:
-                logger.debug(f'Using cookies from file {self.cookie_file}')
-                ydl_options['cookiefile'] = self.cookie_file
+
+        # set up auth (only the cookie source names are needed here, yt-dlp loads the cookies itself)
+        auth = self.auth_for_site(url, extract_cookies=False)
+        # order of importance: username/password -> cookie -> cookies_from_browser -> cookies_file (api_key/api_secret are not handled here)
+        if auth:
+            if 'username' in auth and 'password' in auth:
+                logger.debug(f'Using provided auth username and password for {url}')
+                ydl_options['username'] = auth['username']
+                ydl_options['password'] = auth['password']
+            elif 'cookie' in auth:
+                logger.debug(f'Using provided auth cookie for {url}')
+                yt_dlp.utils.std_headers['cookie'] = auth['cookie']
+            elif 'cookies_from_browser' in auth:
+                logger.debug(f"Using extracted cookies from browser {auth['cookies_from_browser']} for {url}")
+                ydl_options['cookiesfrombrowser'] = (auth['cookies_from_browser'],)
+            elif 'cookies_file' in auth:
+                logger.debug(f"Using cookies from file {auth['cookies_file']} for {url}")
+                ydl_options['cookiefile'] = auth['cookies_file']
 
         ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
 