0.8.0 new features and dependency updates (#119)

* wacz can extract_screenshot only * new meta enricher * twitter api can use multiple authentication tokens in sequence * cleanup non-dup logic * meta info on archive duration * minor html report update * updated dependencies * new version
2026-06-12 05:08:28 +03:00 · 2023-12-20 14:13:22 +00:00
parent 499832d146
commit e6b6b83007
8 changed files with 413 additions and 332 deletions
--- a/src/auto_archiver/archivers/twitter_api_archiver.py
+++ b/src/auto_archiver/archivers/twitter_api_archiver.py
@@ -16,36 +16,55 @@ class TwitterApiArchiver(TwitterArchiver, Archiver):
    def __init__(self, config: dict) -> None:
        super().__init__(config)

+        self.api_index = 0
+        self.apis = []
+        if len(self.bearer_tokens):
+            self.apis.extend([Api(bearer_token=bearer_token) for bearer_token in self.bearer_tokens])
        if self.bearer_token:
            self.assert_valid_string("bearer_token")
-            self.api = Api(bearer_token=self.bearer_token)
-        elif self.consumer_key and self.consumer_secret and self.access_token and self.access_secret:
+            self.apis.append(Api(bearer_token=self.bearer_token))
+        if self.consumer_key and self.consumer_secret and self.access_token and self.access_secret:
            self.assert_valid_string("consumer_key")
            self.assert_valid_string("consumer_secret")
            self.assert_valid_string("access_token")
            self.assert_valid_string("access_secret")
-            self.api = Api(
-                consumer_key=self.consumer_key, consumer_secret=self.consumer_secret, access_token=self.access_token, access_secret=self.access_secret)
-        assert hasattr(self, "api") and self.api is not None, "Missing Twitter API configurations, please provide either bearer_token OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver."
+            self.apis.append(Api(consumer_key=self.consumer_key, consumer_secret=self.consumer_secret,
+                             access_token=self.access_token, access_secret=self.access_secret))
+        assert self.api_client is not None, "Missing Twitter API configurations, please provide either AND/OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver, you can provide both for better rate-limit results."

    @staticmethod
    def configs() -> dict:
        return {
-            "bearer_token": {"default": None, "help": "twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
+            "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
+            "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line", "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))},
            "consumer_key": {"default": None, "help": "twitter API consumer_key"},
            "consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
            "access_token": {"default": None, "help": "twitter API access_token"},
            "access_secret": {"default": None, "help": "twitter API access_secret"},
        }
+    
+    @property  # getter for the currently selected API client
+    def api_client(self) -> str:
+        return self.apis[self.api_index]
+    

    def download(self, item: Metadata) -> Metadata:
+        # call download retry until success or no more apis
+        while self.api_index < len(self.apis):
+            if res := self.download_retry(item): return res
+            self.api_index += 1
+        self.api_index = 0
+        return False
+
+    def download_retry(self, item: Metadata) -> Metadata:
        url = item.get_url()
        # detect URLs that we definitely cannot handle
        username, tweet_id = self.get_username_tweet_id(url)
        if not username: return False

        try:
-            tweet = self.api.get_tweet(tweet_id, expansions=["attachments.media_keys"], media_fields=["type", "duration_ms", "url", "variants"], tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"])
+            tweet = self.api_client.get_tweet(tweet_id, expansions=["attachments.media_keys"], media_fields=["type", "duration_ms", "url", "variants"], tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"])
+            logger.debug(tweet)
        except Exception as e:
            logger.error(f"Could not get tweet: {e}")
            return False
--- a/src/auto_archiver/enrichers/init.py
+++ b/src/auto_archiver/enrichers/init.py
@@ -6,4 +6,5 @@ from .thumbnail_enricher import ThumbnailEnricher
 from .wacz_enricher import WaczArchiverEnricher
 from .whisper_enricher import WhisperEnricher
 from .pdq_hash_enricher import PdqHashEnricher
-from .metadata_enricher import MetadataEnricher
+from .metadata_enricher import MetadataEnricher
+from .meta_enricher import MetaEnricher
--- a/src/auto_archiver/enrichers/meta_enricher.py
+++ b/src/auto_archiver/enrichers/meta_enricher.py
@@ -0,0 +1,55 @@
+import datetime
+import os
+from loguru import logger
+
+from . import Enricher
+from ..core import Metadata
+
+
+class MetaEnricher(Enricher):
+    """
+    Adds metadata information about the archive operations, to be included at the end of all enrichments
+    """
+    name = "meta_enricher"
+
+
+    def __init__(self, config: dict) -> None:
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+        }
+
+    def enrich(self, to_enrich: Metadata) -> None:
+        logger.debug(f"calculating archive metadata information for url={to_enrich.get_url()}")
+
+        self.enrich_file_sizes(to_enrich)
+        self.enrich_archive_duration(to_enrich)
+
+    def enrich_file_sizes(self, to_enrich):
+        logger.debug(f"calculating archive file sizes for url={to_enrich.get_url()} ({len(to_enrich.media)} media files)")
+        total_size = 0
+        for i, m in enumerate(to_enrich.media):
+            file_stats = os.stat(m.filename)
+            to_enrich.media[i].set("bytes", file_stats.st_size)
+            to_enrich.media[i].set("size", self.human_readable_bytes(file_stats.st_size))
+            total_size += file_stats.st_size
+        
+        to_enrich.set("total_bytes", total_size)
+        to_enrich.set("total_size", self.human_readable_bytes(total_size))
+        
+
+    def human_readable_bytes(self, size: int) -> str:
+        # receives number of bytes and returns human readable size
+        for unit in ["bytes", "KB", "MB", "GB", "TB"]:
+            if size < 1024:
+                return f"{size:.1f} {unit}"
+            size /= 1024
+
+    def enrich_archive_duration(self, to_enrich):
+        logger.debug(f"calculating archive duration for url={to_enrich.get_url()} ")
+
+        archive_duration = datetime.datetime.utcnow() - to_enrich.get("_processed_at")
+        to_enrich.set("archive_duration_seconds", archive_duration.seconds)
--- a/src/auto_archiver/enrichers/wacz_enricher.py
+++ b/src/auto_archiver/enrichers/wacz_enricher.py
@@ -30,7 +30,8 @@ class WaczArchiverEnricher(Enricher, Archiver):
            "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
            "docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
            "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
-            "extract_media": {"default": True, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media. The .wacz file will be kept untouched."}
+            "extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
+            "extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."}
        }

    def download(self, item: Metadata) -> Metadata:
@@ -105,7 +106,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
            return False

        to_enrich.add_media(Media(wacz_fn), "browsertrix")
-        if self.extract_media:
+        if self.extract_media or self.extract_screenshot:
            self.extract_media_from_wacz(to_enrich, wacz_fn)

        if use_docker:
@@ -155,12 +156,13 @@ class WaczArchiverEnricher(Enricher, Archiver):
        with open(warc_filename, 'rb') as warc_stream:
            for record in ArchiveIterator(warc_stream):
                # only include fetched resources
-                if record.rec_type == "resource":  # screenshots
+                if record.rec_type == "resource" and self.extract_screenshot:  # screenshots
                    fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
                    with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
                    m = Media(filename=fn)
                    to_enrich.add_media(m, "browsertrix-screenshot")
                    counter += 1
+                if not self.extract_media: continue

                if record.rec_type != 'response': continue
                record_url = record.rec_headers.get_header('WARC-Target-URI')
--- a/src/auto_archiver/formatters/templates/html_template.html
+++ b/src/auto_archiver/formatters/templates/html_template.html
@@ -115,7 +115,7 @@
    <table class="content">
        <tr>
            <th>about</th>
-            <th>preview(s)</th>
+            <th>files and preview</th>
        </tr>
        <tbody>
            {% for m in media %}
--- a/src/auto_archiver/storages/s3.py
+++ b/src/auto_archiver/storages/s3.py
@@ -52,20 +52,6 @@ class S3Storage(Storage):
    def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None:
        if not self.is_upload_needed(media): return True

-        if self.random_no_duplicate:
-            # checks if a folder with the hash already exists, if so it skips the upload
-            he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
-            hd = he.calculate_hash(media.filename)
-            path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
-
-            if existing_key:=self.file_in_folder(path):
-                media.key = existing_key
-                logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
-                return True
-            
-            _, ext = os.path.splitext(media.key)
-            media.key = os.path.join(path, f"{random_str(24)}{ext}")
-
        extra_args = kwargs.get("extra_args", {})
        if not self.private and 'ACL' not in extra_args:
            extra_args['ACL'] = 'public-read'
@@ -89,6 +75,7 @@ class S3Storage(Storage):

            if existing_key:=self.file_in_folder(path):
                media.key = existing_key
+                media.set("previously archived", True)
                logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
                return False
            
--- a/src/auto_archiver/version.py
+++ b/src/auto_archiver/version.py
@@ -1,9 +1,9 @@

 _MAJOR = "0"
-_MINOR = "7"
+_MINOR = "8"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "10"
+_PATCH = "0"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""