Add cookie extraction to 'authentication' options, get generic_extractor working using this info

2026-06-11 12:48:28 +03:00 · 2025-02-03 16:03:07 +01:00
parent 9c9e9b370e
commit 7a2be5a0da
3 changed files with 66 additions and 23 deletions
--- a/src/auto_archiver/core/base_module.py
+++ b/src/auto_archiver/core/base_module.py
@@ -1,5 +1,4 @@

-
 from urllib.parse import urlparse
 from typing import  Mapping, Any
 from abc import ABC
@@ -80,25 +79,63 @@ class BaseModule(ABC):
        for key, val in config.get(self.name, {}).items():
            setattr(self, key, val)
    
-    def auth_for_site(self, site: str) -> dict:
+    def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]:
+        """
+        Returns the authentication information for a given site. This is used to authenticate
+        with a site before extracting data. 'site' must be a full URL, e.g. 'https://twitter.com/user' (only its netloc is looked up)
+        
+        extract_cookies: bool - whether or not to extract the cookies (from the browser or the cookies file) and return the
+        cookies jar (disabling can speed up processing if you don't actually need the cookies jar)
+
+        Currently, the dict can have keys of the following types:
+        - username: str - the username to use for login
+        - password: str - the password to use for login
+        - api_key: str - the API key to use for login
+        - api_secret: str - the API secret to use for login
+        - cookie: str - a cookie string to use for login (specific to this site)
+        - cookies_from_browser: str - the browser to extract cookies from (top-level authentication setting, not per-site)
+        - cookies_file: str - the cookies file to load (top-level authentication setting, not per-site)
+        - cookies_jar: YoutubeDLCookieJar | http.cookiejar.MozillaCookieJar - a cookie jar compatible with requests (e.g. `requests.get(cookies=cookie_jar)`), only set when extract_cookies is True
+        """
        # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
-        # for now, just hard code those.
+        # for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?

        # SECURITY: parse the domain using urllib
        site = urlparse(site).netloc
        # add the 'www' version of the site to the list of sites to check
+        authdict = {}
        for to_try in [site, f"www.{site}"]:
            if to_try in self.authentication:
-                return self.authentication[to_try]
+                authdict.update(self.authentication[to_try])
+                break

        # do a fuzzy string match just to print a warning - don't use it since it's insecure
-        for key in self.authentication.keys():
-            if key in site or site in key:
-                logger.warning(f"Could not find exact authentication information for site '{site}'. \
-                                did find information for '{key}' which is close, is this what you meant? \
-                                If so, edit your authentication settings to make sure it exactly matches.")
+        if not authdict:
+            for key in self.authentication.keys():
+                if key in site or site in key:
+                    logger.debug(f"Could not find exact authentication information for site '{site}'. \
+                                    did find information for '{key}' which is close, is this what you meant? \
+                                    If so, edit your authentication settings to make sure it exactly matches.")
        
-        return {}
+        def get_ytdlp_cookiejar(args):
+            import yt_dlp
+            from yt_dlp import parse_options
+
+            # parse_options returns a named tuple as follows, we only need the ydl_options part
+            # collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts'))
+            ytdlp_opts = getattr(parse_options(args), 'ydl_opts')
+            return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar
+
+        # get the cookies jar, preferring the browser cookies over the cookies file
+        if 'cookies_from_browser' in self.authentication:
+            authdict['cookies_from_browser'] = self.authentication['cookies_from_browser']
+            if extract_cookies:  # only build the jar when the caller asked for it
+                authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies-from-browser', self.authentication['cookies_from_browser']])
+        elif 'cookies_file' in self.authentication:
+            authdict['cookies_file'] = self.authentication['cookies_file']
+            if extract_cookies:  # only build the jar when the caller asked for it
+                authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies', self.authentication['cookies_file']])
+        return authdict
    
    def repr(self):
        return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -259,8 +259,7 @@ class ArchivingOrchestrator:
                if module == 'cli_feeder':
                    urls = self.config['urls']
                    if not urls:
-                        logger.error("No URLs provided. Please provide at least one URL to archive, or set up a feeder.")
-                        self.basic_parser.print_help()
+                        logger.error("No URLs provided. Please provide at least one URL to archive, or set up a feeder. Use --help for more information.")
                        exit()
                    # cli_feeder is a pseudo module, it just takes the command line args
                    def feed(self) -> Generator[Metadata]:
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -266,23 +266,30 @@ class GenericExtractor(Extractor):
    def download(self, item: Metadata) -> Metadata:
        url = item.get_url()

-        if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
-            logger.debug('Using Facebook cookie')
-            yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie

        ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'), 
                       'quiet': False, 'noplaylist': not self.allow_playlist ,
                       'writesubtitles': self.subtitles,'writeautomaticsub': self.subtitles,
                       "live_from_start": self.live_from_start, "proxy": self.proxy,
                       "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
-
-        if item.netloc in ['youtube.com', 'www.youtube.com']:
-            if self.cookies_from_browser:
-                logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
-                ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
-            elif self.cookie_file:
-                logger.debug(f'Using cookies from file {self.cookie_file}')
-                ydl_options['cookiefile'] = self.cookie_file
+        
+        # set up auth: translate the auth info for this URL into the matching yt-dlp options
+        auth = self.auth_for_site(url)
+        # order of importance: username/password -> cookie -> cookies_from_browser -> cookies_file
+        if auth:
+            if 'username' in auth and 'password' in auth:
+                logger.debug(f'Using provided auth username and password for {url}')
+                ydl_options['username'] = auth['username']
+                ydl_options['password'] = auth['password']
+            elif 'cookie' in auth:
+                logger.debug(f'Using provided auth cookie for {url}')
+                yt_dlp.utils.std_headers['cookie'] = auth['cookie']
+            elif 'cookies_from_browser' in auth:
+                logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}')
+                ydl_options['cookiesfrombrowser'] = (auth['cookies_from_browser'],)  # yt-dlp expects a tuple, browser name first
+            elif 'cookies_file' in auth:
+                logger.debug(f'Using cookies from file {auth["cookies_file"]} for {url}')
+                ydl_options['cookiefile'] = auth['cookies_file']

        ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"