Option to provide cookies for use by ytdl, fixes #150

adds proxy_server option to wacz
numpy version downgrade
2026-06-11 12:48:28 +03:00 · 2024-12-20 07:14:49 +01:00 · 2024-10-06 10:45:34 +06:00 · 2024-10-06 10:10:04 +06:00 · 2024-10-05 17:43:07 +06:00 · 2024-08-21 13:34:34 +01:00
11 changed files with 1076 additions and 1039 deletions
--- a/4
+++ b/4
@@ -30,10 +30,10 @@ tqdm = "*"
 jinja2 = "*"
 cryptography = "*"
 dataclasses-json = "*"
-yt-dlp = "*"
+yt-dlp = "2024.09.27"
 vk-url-scraper = "*"
 requests = {extras = ["socks"], version = "*"}
-numpy = "*"
+numpy = "1.26.4"
 warcio = "*"
 jsonlines = "*"
 pysubs2 = "*"
--- a/Pipfile.lock
+++ b/Pipfile.lock
--- a/README.md
+++ b/README.md
@@ -47,7 +47,7 @@ Docker works like a virtual machine running inside your computer, it isolates ev

 <details><summary><code>Python package instructions</code></summary>

-1. make sure you have python 3.8 or higher installed
+1. make sure you have python 3.10 or higher installed
 2. install the package `pip/pipenv/conda install auto-archiver`
 3. test it's installed with `auto-archiver --help`
 4. run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml` if your orchestration file is inside a `secrets/`, which we advise
@@ -108,7 +108,7 @@ configurations:
  # ... configurations for the other steps here ...
 ```

-To see all available `steps` (which archivers, storages, databses, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
+To see all available `steps` (archivers, storages, databases, ...), check the [example.orchestration.yaml](example.orchestration.yaml).

 All the `configurations` in the `orchestration.yaml` file (you can name it differently but need to pass it in the `--config FILENAME` argument) can be seen in the console by using the `--help` flag. They can also be overwritten, for example if you are using the `cli_feeder` to archive from the command line and want to provide the URLs you should do:

--- a/example.orchestration.yaml
+++ b/example.orchestration.yaml
@@ -16,8 +16,13 @@ steps:
    # - wacz_archiver_enricher
  enrichers:
    - hash_enricher
+    # - meta_enricher
    # - metadata_enricher
    # - screenshot_enricher
+    # - pdq_hash_enricher
+    # - ssl_enricher
+    # - timestamping_enricher
+    # - whisper_enricher
    # - thumbnail_enricher
    # - wayback_archiver_enricher
    # - wacz_archiver_enricher
@@ -89,6 +94,14 @@ configurations:
    password: "vk pass"
    session_file: "secrets/vk_config.v2.json"

+  youtubedl_archiver:
+    subtitles: true
+    # to authenticate with YouTube, use one of the following two methods: either provide a cookies file or extract the cookies from the given browser (if both are set, cookies_from_browser takes precedence)
+    # for more information, see https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp
+    # cookie_file: "secrets/youtube_cookies.txt"
+    # cookies_from_browser: firefox
+    # proxy: socks5://proxy-user:password@proxy-ip:port
+
  screenshot_enricher:
    width: 1280
    height: 2300
--- a/setup.cfg
+++ b/setup.cfg
@@ -27,7 +27,7 @@ package_dir=
    =src
 packages=find:
 find_packages=true
-python_requires = >=3.8
+python_requires = >=3.10

 [options.package_data]
 * = *.html
--- a/src/auto_archiver/archivers/instagram_tbot_archiver.py
+++ b/src/auto_archiver/archivers/instagram_tbot_archiver.py
@@ -54,8 +54,9 @@ class InstagramTbotArchiver(Archiver):

    def cleanup(self) -> None:
        logger.info(f"CLEANUP {self.name}.")
-        if os.path.exists(self.session_file):
-            os.remove(self.session_file)
+        session_file_name = self.session_file + ".session"
+        if os.path.exists(session_file_name):
+            os.remove(session_file_name)
        
    def download(self, item: Metadata) -> Metadata:
        url = item.get_url()
--- a/src/auto_archiver/archivers/telethon_archiver.py
+++ b/src/auto_archiver/archivers/telethon_archiver.py
@@ -101,8 +101,9 @@ class TelethonArchiver(Archiver):

    def cleanup(self) -> None:
        logger.info(f"CLEANUP {self.name}.")
-        if os.path.exists(self.session_file):
-            os.remove(self.session_file)
+        session_file_name = self.session_file + ".session"
+        if os.path.exists(session_file_name):
+            os.remove(session_file_name)

    def download(self, item: Metadata) -> Metadata:
        """
--- a/src/auto_archiver/archivers/twitter_archiver.py
+++ b/src/auto_archiver/archivers/twitter_archiver.py
@@ -1,7 +1,10 @@
 import re, requests, mimetypes, json
+from typing import Union
 from datetime import datetime
 from loguru import logger
 from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
+from yt_dlp import YoutubeDL
+from yt_dlp.extractor.twitter import TwitterIE
 from slugify import slugify

 from . import Archiver
@@ -29,7 +32,7 @@ class TwitterArchiver(Archiver):
        # expand URL if t.co and clean tracker GET params
        if 'https://t.co/' in url:
            try:
-                r = requests.get(url)
+                r = requests.get(url, timeout=30)
                logger.debug(f'Expanded url {url} to {r.url}')
                url = r.url
            except:
@@ -43,19 +46,31 @@ class TwitterArchiver(Archiver):
        can handle private/public channels
        """
        url = item.get_url()
-        # detect URLs that we definitely cannot handle
        username, tweet_id = self.get_username_tweet_id(url)
        if not username: return False

-        result = Metadata()
+        strategies = [self.download_yt_dlp, self.download_snscrape, self.download_syndication]
+        for strategy in strategies:
+            logger.debug(f"Trying {strategy.__name__} for {url=}")
+            try:
+                result = strategy(item, url, tweet_id)
+                if result: return result
+            except Exception as ex:
+                logger.error(f"Failed to download {url} with {strategy.__name__}: {type(ex).__name__} occurred. args: {ex.args}")
+        
+        logger.warning(f"No free strategy worked for {url}")
+        return False

+        
+    def download_snscrape(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
        scr = TwitterTweetScraper(tweet_id)
        try:
            tweet = next(scr.get_items())
        except Exception as ex:
-            logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
-            return self.download_alternative(item, url, tweet_id)
-
+            logger.warning(f"SNSCRAPE FAILED, can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
+            return False
+        
+        result = Metadata()
        result.set_title(tweet.content).set_content(tweet.json()).set_timestamp(tweet.date)
        if tweet.media is None:
            logger.debug(f'No media found, archiving tweet text only')
@@ -85,7 +100,7 @@ class TwitterArchiver(Archiver):

        return result.success("twitter-snscrape")

-    def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
+    def download_syndication(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
        """
        Hack alternative working again.
        https://stackoverflow.com/a/71867055/6196010 (OUTDATED URL)
@@ -93,12 +108,13 @@ class TwitterArchiver(Archiver):
        next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
        """

-        logger.debug(f"Trying twitter hack for {url=}")
-        result = Metadata()
-
        hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}"
        r = requests.get(hack_url)
-        if r.status_code != 200: return False
+        if r.status_code != 200 or r.json()=={}: 
+            logger.warning(f"SyndicationHack: Failed to get tweet information from {hack_url}.")
+            return False
+        
+        result = Metadata()
        tweet = r.json()

        urls = []
@@ -108,7 +124,7 @@ class TwitterArchiver(Archiver):
        # 1 tweet has 1 video max
        if "video" in tweet:
            v = tweet["video"]
-            urls.append(self.choose_variant(v.get("variants", [])))
+            urls.append(self.choose_variant(v.get("variants", []))['url'])

        logger.debug(f"Twitter hack got {urls=}")

@@ -124,7 +140,39 @@ class TwitterArchiver(Archiver):
            result.add_media(media)

        result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
-        return result.success("twitter-hack")
+        return result.success("twitter-syndication")
+
+    def download_yt_dlp(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
+        downloader = YoutubeDL()
+        tie = TwitterIE(downloader)
+        tweet = tie._extract_status(tweet_id)
+        result = Metadata()
+        result\
+            .set_title(tweet.get('full_text', ''))\
+            .set_content(json.dumps(tweet, ensure_ascii=False))\
+            .set_timestamp(datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"))
+        if not tweet.get("entities", {}).get("media"):
+            logger.debug('No media found, archiving tweet text only')
+            result.status = "twitter-ytdl"
+            return result
+        for i, tw_media in enumerate(tweet["entities"]["media"]):
+            media = Media(filename="")
+            mimetype = ""
+            if tw_media["type"] == "photo":
+                media.set("src", UrlUtil.twitter_best_quality_url(tw_media['media_url_https']))
+                mimetype = "image/jpeg"
+            elif tw_media["type"] == "video":
+                variant = self.choose_variant(tw_media['video_info']['variants'])
+                media.set("src", variant['url'])
+                mimetype = variant['content_type']
+            elif tw_media["type"] == "animated_gif":
+                variant = tw_media['video_info']['variants'][0]
+                media.set("src", variant['url'])
+                mimetype = variant['content_type']
+            ext = mimetypes.guess_extension(mimetype)
+            media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
+            result.add_media(media)
+        return result.success("twitter-ytdl")

    def get_username_tweet_id(self, url):
        # detect URLs that we definitely cannot handle
@@ -140,13 +188,13 @@ class TwitterArchiver(Archiver):
        # choosing the highest quality possible
        variant, width, height = None, 0, 0
        for var in variants:
-            if var.get("type", "") == "video/mp4":
-                width_height = re.search(r"\/(\d+)x(\d+)\/", var["src"])
+            if var.get("content_type", "") == "video/mp4":
+                width_height = re.search(r"\/(\d+)x(\d+)\/", var["url"])
                if width_height:
                    w, h = int(width_height[1]), int(width_height[2])
                    if w > width or h > height:
                        width, height = w, h
-                        variant = var.get("src", variant)
+                        variant = var
            else:
-                variant = var.get("src") if not variant else variant
+                variant = var if not variant else variant
        return variant
--- a/src/auto_archiver/archivers/youtubedl_archiver.py
+++ b/src/auto_archiver/archivers/youtubedl_archiver.py
@@ -30,6 +30,8 @@ class YoutubeDLArchiver(Archiver):
            "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
            'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
            "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
+            "cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
+            "cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
        }

    def download(self, item: Metadata) -> Metadata:
@@ -38,8 +40,17 @@ class YoutubeDLArchiver(Archiver):
        if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
            logger.debug('Using Facebook cookie')
            yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
-
+        
        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
+
+        if item.netloc in ['youtube.com', 'www.youtube.com']:
+            if self.cookies_from_browser:
+                logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
+                ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
+            elif self.cookie_file:
+                logger.debug(f'Using cookies from file {self.cookie_file}')
+                ydl_options['cookiefile'] = self.cookie_file
+
        ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"

        try:
@@ -98,11 +109,12 @@ class YoutubeDLArchiver(Archiver):
            result.set("comments", [{
                "text": c["text"],
                "author": c["author"], 
-                "timestamp": datetime.datetime.utcfromtimestamp(c.get("timestamp")).replace(tzinfo=datetime.timezone.utc)
+                "timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz = datetime.timezone.utc)
            } for c in info.get("comments", [])])

        if (timestamp := info.get("timestamp")):
-            timestamp = datetime.datetime.utcfromtimestamp(timestamp).replace(tzinfo=datetime.timezone.utc).isoformat()
+            # TODO: confirm no deprecated timestamp handling remains (utcfromtimestamp was replaced by the timezone-aware fromtimestamp below)
+            timestamp = datetime.datetime.fromtimestamp(timestamp, tz = datetime.timezone.utc).isoformat()
            result.set_timestamp(timestamp)
        if (upload_date := info.get("upload_date")):
            upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
--- a/src/auto_archiver/enrichers/wacz_enricher.py
+++ b/src/auto_archiver/enrichers/wacz_enricher.py
@@ -34,6 +34,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
            "extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
            "socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
            "socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
+            "proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
        }
    
    def setup(self) -> None:
@@ -113,7 +114,10 @@ class WaczArchiverEnricher(Enricher, Archiver):
        try:
            logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
            my_env = os.environ.copy()
-            if self.socks_proxy_host and self.socks_proxy_port:
+            if self.proxy_server:
+                logger.debug("Using PROXY_SERVER proxy for browsertrix-crawler")
+                my_env["PROXY_SERVER"] = self.proxy_server
+            elif self.socks_proxy_host and self.socks_proxy_port:
                logger.debug("Using SOCKS proxy for browsertrix-crawler")
                my_env["SOCKS_HOST"] = self.socks_proxy_host
                my_env["SOCKS_PORT"] = str(self.socks_proxy_port)
--- a/src/auto_archiver/version.py
+++ b/src/auto_archiver/version.py
@@ -1,6 +1,6 @@

 _MAJOR = "0"
-_MINOR = "11"
+_MINOR = "13"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
 _PATCH = "1"
Author	SHA1	Message	Date
Patrick Robertson	82c00d491d	Option to provide cookies for use by ytdl, fixes #150	2024-12-20 07:14:49 +01:00
msramalho	e49550163f	adds proxy_server option to wacz	2024-10-06 10:45:34 +06:00
msramalho	e6f5981afc	numpy version downgrade	2024-10-06 10:10:04 +06:00
msramalho	c62bf1a34d	yt-dlp version bump	2024-10-05 17:43:07 +06:00
msramalho	b166d57e61	v0.12.0 bump	2024-08-21 13:34:34 +01:00
msramalho	11c3288267	closes #146	2024-08-21 13:33:58 +01:00
msramalho	004143a58a	version bump v0.11.6	2024-07-18 11:27:39 +01:00
msramalho	686f0027c4	adds new entries to example orchestration file	2024-07-18 11:27:15 +01:00
dependabot[bot]	b03cf32c73	Bump authlib from 1.3.0 to 1.3.1 (#144 ) Bumps [authlib](https://github.com/lepture/authlib) from 1.3.0 to 1.3.1. - [Release notes](https://github.com/lepture/authlib/releases) - [Changelog](https://github.com/lepture/authlib/blob/master/docs/changelog.rst) - [Commits](https://github.com/lepture/authlib/compare/v1.3.0...v1.3.1) --- updated-dependencies: - dependency-name: authlib dependency-type: indirect ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-07-18 11:26:22 +01:00
msramalho	dc9e64397e	bumping yt-dlp	2024-07-18 11:23:09 +01:00
msramalho	c7bc5e2988	cleanup	2024-05-15 11:04:29 +01:00
msramalho	1e375bd740	version bump	2024-05-14 16:42:15 +01:00
Miguel Sozinho Ramalho	f8824691dd	refactors free twitter archiver strategies (#142 )	2024-05-14 16:23:33 +01:00
msramalho	012cc36609	removes deprecated datetime method	2024-05-14 15:54:50 +01:00
Miguel Sozinho Ramalho	7cfe1e39cc	#135 fix cleanup of telethon session files (#139 ) * closes #135 * version bump	2024-04-16 12:45:45 +01:00
Jett Chen	cf8691bad7	Add yt-dlp based archiving for TwitterArchiver (#138 ) * Add ytdlp archiving capability * Add type annotation * version bump --------- Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>	2024-04-15 19:54:55 +01:00