Option to provide cookies for use by ytdl, fixes #150

adds proxy_server option to wacz
numpy version downgrade
2026-06-11 12:48:28 +03:00 · 2024-12-20 07:14:49 +01:00 · 2024-10-06 10:45:34 +06:00 · 2024-10-06 10:10:04 +06:00 · 2024-10-05 17:43:07 +06:00
6 changed files with 988 additions and 1008 deletions
--- a/Pipfile
+++ b/Pipfile
@@ -30,10 +30,10 @@ tqdm = "*"
 jinja2 = "*"
 cryptography = "*"
 dataclasses-json = "*"
-yt-dlp = "*"
+yt-dlp = "2024.09.27"
 vk-url-scraper = "*"
 requests = {extras = ["socks"], version = "*"}
-numpy = "*"
+numpy = "1.26.4"
 warcio = "*"
 jsonlines = "*"
 pysubs2 = "*"
--- a/Pipfile.lock
+++ b/Pipfile.lock
--- a/example.orchestration.yaml
+++ b/example.orchestration.yaml
@@ -94,6 +94,14 @@ configurations:
    password: "vk pass"
    session_file: "secrets/vk_config.v2.json"

+  youtubedl_archiver:
+    subtitles: true
+    # to authenticate with YouTube, use one of the following two methods - either provide a cookies file or extract the cookies from the given browser; if both are set, cookies_from_browser takes precedence
+    # for more information, see https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp
+    # cookie_file: "secrets/youtube_cookies.txt"
+    # cookies_from_browser: firefox
+    # proxy: socks5://proxy-user:password@proxy-ip:port
+
  screenshot_enricher:
    width: 1280
    height: 2300
--- a/src/auto_archiver/archivers/youtubedl_archiver.py
+++ b/src/auto_archiver/archivers/youtubedl_archiver.py
@@ -30,6 +30,8 @@ class YoutubeDLArchiver(Archiver):
            "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
            'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
            "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
+            "cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
+            "cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
        }

    def download(self, item: Metadata) -> Metadata:
@@ -38,8 +40,17 @@ class YoutubeDLArchiver(Archiver):
        if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
            logger.debug('Using Facebook cookie')
            yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
-
+        
        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
+
+        if item.netloc in ['youtube.com', 'www.youtube.com']:
+            if self.cookies_from_browser:
+                logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
+                ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
+            elif self.cookie_file:
+                logger.debug(f'Using cookies from file {self.cookie_file}')
+                ydl_options['cookiefile'] = self.cookie_file
+
        ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"

        try:
--- a/src/auto_archiver/enrichers/wacz_enricher.py
+++ b/src/auto_archiver/enrichers/wacz_enricher.py
@@ -34,6 +34,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
            "extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
            "socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
            "socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
+            "proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
        }
    
    def setup(self) -> None:
@@ -113,7 +114,10 @@ class WaczArchiverEnricher(Enricher, Archiver):
        try:
            logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
            my_env = os.environ.copy()
-            if self.socks_proxy_host and self.socks_proxy_port:
+            if self.proxy_server:
+                logger.debug("Using PROXY_SERVER proxy for browsertrix-crawler")
+                my_env["PROXY_SERVER"] = self.proxy_server
+            elif self.socks_proxy_host and self.socks_proxy_port:
                logger.debug("Using SOCKS proxy for browsertrix-crawler")
                my_env["SOCKS_HOST"] = self.socks_proxy_host
                my_env["SOCKS_PORT"] = str(self.socks_proxy_port)
--- a/src/auto_archiver/version.py
+++ b/src/auto_archiver/version.py
@@ -1,9 +1,9 @@

 _MAJOR = "0"
-_MINOR = "12"
+_MINOR = "13"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "0"
+_PATCH = "1"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
Author	SHA1	Message	Date
Patrick Robertson	82c00d491d	Option to provide cookies for use by ytdl, fixes #150	2024-12-20 07:14:49 +01:00
msramalho	e49550163f	adds proxy_server option to wacz	2024-10-06 10:45:34 +06:00
msramalho	e6f5981afc	numpy version downgrade	2024-10-06 10:10:04 +06:00
msramalho	c62bf1a34d	yt-dlp version bump	2024-10-05 17:43:07 +06:00