version bump

closes #135
2026-06-11 04:38:29 +03:00 · 2024-04-16 12:44:42 +01:00 · 2024-04-16 12:44:32 +01:00
9 changed files with 1034 additions and 1033 deletions
--- a/4
+++ b/4
@@ -30,10 +30,10 @@ tqdm = "*"
 jinja2 = "*"
 cryptography = "*"
 dataclasses-json = "*"
-yt-dlp = "2024.09.27"
+yt-dlp = "*"
 vk-url-scraper = "*"
 requests = {extras = ["socks"], version = "*"}
-numpy = "1.26.4"
+numpy = "*"
 warcio = "*"
 jsonlines = "*"
 pysubs2 = "*"
--- a/Pipfile.lock
+++ b/Pipfile.lock
--- a/README.md
+++ b/README.md
@@ -47,7 +47,7 @@ Docker works like a virtual machine running inside your computer, it isolates ev

 <details><summary><code>Python package instructions</code></summary>

-1. make sure you have python 3.10 or higher installed
+1. make sure you have python 3.8 or higher installed
 2. install the package `pip/pipenv/conda install auto-archiver`
 3. test it's installed with `auto-archiver --help`
 4. run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml` if your orchestration file is inside a `secrets/`, which we advise
@@ -108,7 +108,7 @@ configurations:
  # ... configurations for the other steps here ...
 ```

-To see all available `steps` (which archivers, storages, databases, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
+To see all available `steps` (which archivers, storages, databases, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).

 All the `configurations` in the `orchestration.yaml` file (you can name it differently but need to pass it in the `--config FILENAME` argument) can be seen in the console by using the `--help` flag. They can also be overwritten, for example if you are using the `cli_feeder` to archive from the command line and want to provide the URLs you should do:

--- a/example.orchestration.yaml
+++ b/example.orchestration.yaml
@@ -16,13 +16,8 @@ steps:
    # - wacz_archiver_enricher
  enrichers:
    - hash_enricher
-    # - meta_enricher
    # - metadata_enricher
    # - screenshot_enricher
-    # - pdq_hash_enricher
-    # - ssl_enricher
-    # - timestamping_enricher
-    # - whisper_enricher
    # - thumbnail_enricher
    # - wayback_archiver_enricher
    # - wacz_archiver_enricher
@@ -94,14 +89,6 @@ configurations:
    password: "vk pass"
    session_file: "secrets/vk_config.v2.json"

-  youtubedl_archiver:
-    subtitles: true
-    # use one of the following two methods to authenticate in youtube - either provide a cookies file or use the cookies of the given browser
-    # for more information, see https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp
-    # cookie_file: "secrets/youtube_cookies.txt"
-    # cookies_from_browser: firefox
-    # proxy: socks5://proxy-user:password@proxy-ip:port
-
  screenshot_enricher:
    width: 1280
    height: 2300
--- a/setup.cfg
+++ b/setup.cfg
@@ -27,7 +27,7 @@ package_dir=
    =src
 packages=find:
 find_packages=true
-python_requires = >=3.10
+python_requires = >=3.8

 [options.package_data]
 * = *.html
--- a/src/auto_archiver/archivers/twitter_archiver.py
+++ b/src/auto_archiver/archivers/twitter_archiver.py
@@ -1,5 +1,4 @@
 import re, requests, mimetypes, json
-from typing import Union
 from datetime import datetime
 from loguru import logger
 from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
@@ -32,7 +31,7 @@ class TwitterArchiver(Archiver):
        # expand URL if t.co and clean tracker GET params
        if 'https://t.co/' in url:
            try:
-                r = requests.get(url, timeout=30)
+                r = requests.get(url)
                logger.debug(f'Expanded url {url} to {r.url}')
                url = r.url
            except:
@@ -46,31 +45,19 @@ class TwitterArchiver(Archiver):
        can handle private/public channels
        """
        url = item.get_url()
+        # detect URLs that we definitely cannot handle
        username, tweet_id = self.get_username_tweet_id(url)
        if not username: return False

-        strategies = [self.download_yt_dlp, self.download_snscrape, self.download_syndication]
-        for strategy in strategies:
-            logger.debug(f"Trying {strategy.__name__} for {url=}")
-            try:
-                result = strategy(item, url, tweet_id)
-                if result: return result
-            except Exception as ex:
-                logger.error(f"Failed to download {url} with {strategy.__name__}: {type(ex).__name__} occurred. args: {ex.args}")
-        
-        logger.warning(f"No free strategy worked for {url}")
-        return False
+        result = Metadata()

-        
-    def download_snscrape(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
        scr = TwitterTweetScraper(tweet_id)
        try:
            tweet = next(scr.get_items())
        except Exception as ex:
-            logger.warning(f"SNSCRAPE FAILED, can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
-            return False
-        
-        result = Metadata()
+            logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
+            return self.download_alternative(item, url, tweet_id)
+
        result.set_title(tweet.content).set_content(tweet.json()).set_timestamp(tweet.date)
        if tweet.media is None:
            logger.debug(f'No media found, archiving tweet text only')
@@ -100,7 +87,7 @@ class TwitterArchiver(Archiver):

        return result.success("twitter-snscrape")

-    def download_syndication(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
+    def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
        """
        Hack alternative working again.
        https://stackoverflow.com/a/71867055/6196010 (OUTDATED URL)
@@ -108,13 +95,14 @@ class TwitterArchiver(Archiver):
        next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
        """

+        logger.debug(f"Trying twitter hack for {url=}")
+        result = Metadata()
+
        hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}"
        r = requests.get(hack_url)
        if r.status_code != 200 or r.json()=={}: 
-            logger.warning(f"SyndicationHack: Failed to get tweet information from {hack_url}.")
-            return False
-        
-        result = Metadata()
+            logger.warning(f"Failed to get tweet information from {hack_url}, trying ytdl")
+            return self.download_ytdl(item, url, tweet_id)
        tweet = r.json()

        urls = []
@@ -140,9 +128,9 @@ class TwitterArchiver(Archiver):
            result.add_media(media)

        result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
-        return result.success("twitter-syndication")
-
-    def download_yt_dlp(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
+        return result.success("twitter-hack")
+    
+    def download_ytdl(self, item: Metadata, url:str, tweet_id:str) -> Metadata:
        downloader = YoutubeDL()
        tie = TwitterIE(downloader)
        tweet = tie._extract_status(tweet_id)
@@ -153,7 +141,6 @@ class TwitterArchiver(Archiver):
            .set_timestamp(datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"))
        if not tweet.get("entities", {}).get("media"):
            logger.debug('No media found, archiving tweet text only')
-            result.status = "twitter-ytdl"
            return result
        for i, tw_media in enumerate(tweet["entities"]["media"]):
            media = Media(filename="")
@@ -173,6 +160,7 @@ class TwitterArchiver(Archiver):
            media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
            result.add_media(media)
        return result.success("twitter-ytdl")
+        

    def get_username_tweet_id(self, url):
        # detect URLs that we definitely cannot handle
--- a/src/auto_archiver/archivers/youtubedl_archiver.py
+++ b/src/auto_archiver/archivers/youtubedl_archiver.py
@@ -30,8 +30,6 @@ class YoutubeDLArchiver(Archiver):
            "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
            'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
            "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
-            "cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
-            "cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
        }

    def download(self, item: Metadata) -> Metadata:
@@ -40,17 +38,8 @@ class YoutubeDLArchiver(Archiver):
        if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
            logger.debug('Using Facebook cookie')
            yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
-        
+
        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
-
-        if item.netloc in ['youtube.com', 'www.youtube.com']:
-            if self.cookies_from_browser:
-                logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
-                ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
-            elif self.cookie_file:
-                logger.debug(f'Using cookies from file {self.cookie_file}')
-                ydl_options['cookiefile'] = self.cookie_file
-
        ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"

        try:
@@ -109,12 +98,11 @@ class YoutubeDLArchiver(Archiver):
            result.set("comments", [{
                "text": c["text"],
                "author": c["author"], 
-                "timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz = datetime.timezone.utc)
+                "timestamp": datetime.datetime.utcfromtimestamp(c.get("timestamp")).replace(tzinfo=datetime.timezone.utc)
            } for c in info.get("comments", [])])

        if (timestamp := info.get("timestamp")):
-            #TODO: fix deprecated timestamp, 
-            timestamp = datetime.datetime.fromtimestamp(timestamp, tz = datetime.timezone.utc).isoformat()
+            timestamp = datetime.datetime.utcfromtimestamp(timestamp).replace(tzinfo=datetime.timezone.utc).isoformat()
            result.set_timestamp(timestamp)
        if (upload_date := info.get("upload_date")):
            upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
--- a/src/auto_archiver/enrichers/wacz_enricher.py
+++ b/src/auto_archiver/enrichers/wacz_enricher.py
@@ -34,7 +34,6 @@ class WaczArchiverEnricher(Enricher, Archiver):
            "extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
            "socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
            "socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
-            "proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
        }
    
    def setup(self) -> None:
@@ -114,10 +113,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
        try:
            logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
            my_env = os.environ.copy()
-            if self.proxy_server:
-                logger.debug("Using PROXY_SERVER proxy for browsertrix-crawler")
-                my_env["PROXY_SERVER"] = self.proxy_server
-            elif self.socks_proxy_host and self.socks_proxy_port:
+            if self.socks_proxy_host and self.socks_proxy_port:
                logger.debug("Using SOCKS proxy for browsertrix-crawler")
                my_env["SOCKS_HOST"] = self.socks_proxy_host
                my_env["SOCKS_PORT"] = str(self.socks_proxy_port)
--- a/src/auto_archiver/version.py
+++ b/src/auto_archiver/version.py
@@ -1,9 +1,9 @@

 _MAJOR = "0"
-_MINOR = "13"
+_MINOR = "11"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "1"
+_PATCH = "3"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
Author	SHA1	Message	Date
msramalho	a455728673	version bump	2024-04-16 12:44:42 +01:00
msramalho	8d4357a22c	closes #135	2024-04-16 12:44:32 +01:00