Bump version to v0.5.12 for release

fix: vk-url-scraper version update
fix: max chars on sheets cell
2026-06-10 12:18:30 +03:00 · 2023-05-10 18:58:34 +01:00 · 2023-05-10 18:57:45 +01:00 · 2023-05-10 18:57:33 +01:00 · 2023-05-10 18:57:17 +01:00 · 2023-05-10 16:05:06 +02:00
35 changed files with 1633 additions and 692 deletions
--- a/.github/workflows/docker-publish.yaml
+++ b/.github/workflows/docker-publish.yaml
@@ -26,6 +26,14 @@ jobs:
    steps:
      - name: Check out the repo
        uses: actions/checkout@v3
+        
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v1
+      # https://github.com/docker/setup-buildx-action
+      
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v1
      
      - name: Log in to Docker Hub
        uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
@@ -40,9 +48,10 @@ jobs:
          images: bellingcat/auto-archiver
      
      - name: Build and push Docker image
-        uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
+        uses: docker/build-push-action@v2
        with:
          context: .
-          push: true
+          platforms: linux/amd64,linux/arm64
+          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
+          labels: ${{ steps.meta.outputs.labels }}
--- a/33
+++ b/33
@@ -1,35 +1,36 @@
-# stage 1 - all dependencies
-From python:3.10
+FROM webrecorder/browsertrix-crawler:latest
+
+ENV RUNNING_IN_DOCKER=1

 WORKDIR /app

 # TODO: use custom ffmpeg builds instead of apt-get install
 RUN pip install --upgrade pip && \
 	pip install pipenv && \
+	add-apt-repository ppa:mozillateam/ppa && \
 	apt-get update && \
-	apt-get install -y gcc ffmpeg fonts-noto firefox-esr && \
-	wget https://github.com/mozilla/geckodriver/releases/download/v0.32.0/geckodriver-v0.32.0-linux64.tar.gz && \
+	apt-get install -y gcc ffmpeg fonts-noto && \
+	apt-get install -y --no-install-recommends firefox-esr && \
+	ln -s /usr/bin/firefox-esr /usr/bin/firefox && \
+	wget https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-linux64.tar.gz && \
 	tar -xvzf geckodriver* -C /usr/local/bin && \
 	chmod +x /usr/local/bin/geckodriver && \
-	rm geckodriver-v* 
+	rm geckodriver-v*


-# install docker for WACZ
-# TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66
-# RUN curl -fsSL https://get.docker.com | sh
-
 # TODO: avoid copying unnecessary files, including .git
-COPY Pipfile Pipfile.lock ./
-RUN pipenv install --python=3.10 --system --deploy
-# ENV IS_DOCKER=1
+COPY Pipfile* ./
+RUN pipenv install
+
 # doing this at the end helps during development, builds are quick
 COPY ./src/ . 

 # TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
 # RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
 # USER archiver
-ENTRYPOINT ["python"]
-# ENTRYPOINT ["docker-entrypoint.sh"]

-# should be executed with 2 volumes (3 if local_storage)
-# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets  -v $PWD/local_archive:/app/local_archive aa --help
+
+ENTRYPOINT ["pipenv", "run", "python3", "-m", "auto_archiver"]
+
+# should be executed with 2 volumes (3 if local_storage is used)
+# docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml
--- a/6
+++ b/6
@@ -30,9 +30,13 @@ cryptography = "==38.0.4"
 dataclasses-json = "*"
 yt-dlp = ">=2023.2.17"
 vk-url-scraper = "*"
+uwsgi = "*"
+requests = {extras = ["socks"], version = "*"}
+# wacz = "==0.4.8"
+pywb = ">=2.7.3"

 [requires]
-python_version = "3.9"
+python_version = "3.10"

 [dev-packages]
 autopep8 = "*"
--- a/Pipfile.lock
+++ b/Pipfile.lock
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ Docker works like a virtual machine running inside your computer, it isolates ev

 1. install [docker](https://docs.docker.com/get-docker/)
 2. pull the auto-archiver docker [image](https://hub.docker.com/r/bellingcat/auto-archiver) with `docker pull bellingcat/auto-archiver`
-3. run the docker image locally in a container: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver -m auto_archiver  --config secrets/orchestration.yaml` breaking this command down:
+3. run the docker image locally in a container: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml` breaking this command down:
   1. `docker run` tells docker to start a new container (an instance of the image)
   2. `--rm` makes sure this container is removed after execution (less garbage locally)
   3. `-v $PWD/secrets:/app/secrets` - your secrets folder
@@ -87,11 +87,9 @@ The archiver work is orchestrated by the following workflow (we call each a **st
 4. **Formatter** creates a report from all the archived content (HTML, PDF, ...)
 5. **Database** knows what's been archived and also stores the archive result (spreadsheet, CSV, or just the console)

-To check all available steps (which archivers, storages, databses, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
+To setup an auto-archiver instance, instance, create an `orchestration.yaml` which contains the workflow you would like. We advise you put this file into a `secrets/` folder and do not share it with others because it will contain passwords and other secrets. 

-The great thing is you configure all the workflow in your `orchestration.yaml` file which we advise you put into a `secrets/` folder and don't share it with others because it will contain passwords and other secrets. 
-
-The structure of orchestration file is split into 2 parts: `steps` (what **steps** to use) and `configs` (how those steps should behave), here's a simplification:
+The structure of orchestration file is split into 2 parts: `steps` (what **steps** to use) and `configurations` (how those steps should behave), here's a simplification:
 ```yaml
 # orchestration.yaml content
 steps:
@@ -113,10 +111,12 @@ configurations:
  # ... configurations for the other steps here ...
 ```

+To see all available `steps` (which archivers, storages, databses, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
+
 All the `configurations` in the `orchestration.yaml` file (you can name it differently but need to pass it in the `--config FILENAME` argument) can be seen in the console by using the `--help` flag. They can also be overwritten, for example if you are using the `cli_feeder` to archive from the command line and want to provide the URLs you should do:

 ```bash
-auto-archiver --config orchestration.yaml --cli_feeder.urls="url1,url2,url3"
+auto-archiver --config secrets/orchestration.yaml --cli_feeder.urls="url1,url2,url3"
 ```

 Here's the complete workflow that the auto-archiver goes through:
@@ -193,7 +193,7 @@ Use `python -m src.auto_archiver --config secrets/orchestration.yaml` to run fro
 #### Docker development
 working with docker locally:
  * `docker build . -t auto-archiver` to build a local image
-  * `docker run --rm -v $PWD/secrets:/app/secrets aa --config secrets/config.yaml`
+  * `docker run --rm -v $PWD/secrets:/app/secrets aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml`
    * to use local archive, also create a volume `-v` for it by adding `-v $PWD/local_archive:/app/local_archive`


--- a/example.orchestration.yaml
+++ b/example.orchestration.yaml
@@ -45,11 +45,9 @@ configurations:
      archive: archive location
      date: archive date
      thumbnail: thumbnail
-      thumbnail_index: thumbnail index
      timestamp: upload timestamp
      title: upload title
      text: textual content
-      duration: duration
      screenshot: screenshot
      hash: hash
      wacz: wacz
--- a/scripts/create_update_gdrive_oauth_token.py
+++ b/scripts/create_update_gdrive_oauth_token.py
@@ -10,6 +10,7 @@ from googleapiclient.errors import HttpError
 # You can run this code to get a new token and verify it belongs to the correct user
 # This token will be refresh automatically by the auto-archiver
 # Code below from https://developers.google.com/drive/api/quickstart/python
+# Example invocation: py scripts/create_update_gdrive_oauth_token.py -c secrets/credentials.json -t secrets/gd-token.json

 SCOPES = ['https://www.googleapis.com/auth/drive']

--- a/src/auto_archiver/archivers/archiver.py
+++ b/src/auto_archiver/archivers/archiver.py
@@ -3,8 +3,8 @@ from abc import abstractmethod
 from dataclasses import dataclass
 import os
 import mimetypes, requests
-from ..core import Metadata
-from ..core import Step
+
+from ..core import Metadata, Step, ArchivingContext


@dataclass
@@ -51,7 +51,7 @@ class Archiver(Step):
            if len(to_filename) > 64:
                to_filename = to_filename[-64:]
        if item:
-            to_filename = os.path.join(item.get_tmp_dir(), to_filename)
+            to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }
--- a/src/auto_archiver/archivers/instagram_tbot_archiver.py
+++ b/src/auto_archiver/archivers/instagram_tbot_archiver.py
@@ -4,7 +4,7 @@ from loguru import logger
 import time, os
 from sqlite3 import OperationalError
 from . import Archiver
-from ..core import Metadata, Media
+from ..core import Metadata, Media, ArchivingContext


 class InstagramTbotArchiver(Archiver):
@@ -31,7 +31,7 @@ class InstagramTbotArchiver(Archiver):
            "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
            "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
            "session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
-            "timeout": {"default": 15, "help": "timeout to fetch the instagram content in seconds."},
+            "timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
        }

    def setup(self) -> None:
@@ -44,7 +44,7 @@ class InstagramTbotArchiver(Archiver):
        if not "instagram.com" in url: return False

        result = Metadata()
-        tmp_dir = item.get_tmp_dir()
+        tmp_dir = ArchivingContext.get_tmp_dir()
        with self.client.start():
            chat = self.client.get_entity("instagram_load_bot")
            since_id = self.client.send_message(entity=chat, message=url).id
@@ -52,9 +52,9 @@ class InstagramTbotArchiver(Archiver):
            attempts = 0
            seen_media = []
            message = ""
-            time.sleep(4)
+            time.sleep(3)
            # media is added before text by the bot so it can be used as a stop-logic mechanism
-            while attempts < self.timeout and (not message or not len(seen_media)):
+            while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
                attempts += 1
                time.sleep(1)
                for post in self.client.iter_messages(chat, min_id=since_id):
--- a/src/auto_archiver/archivers/telethon_archiver.py
+++ b/src/auto_archiver/archivers/telethon_archiver.py
@@ -8,7 +8,7 @@ from tqdm import tqdm
 import re, time, json, os

 from . import Archiver
-from ..core import Metadata, Media
+from ..core import Metadata, Media, ArchivingContext


 class TelethonArchiver(Archiver):
@@ -128,7 +128,7 @@ class TelethonArchiver(Archiver):
            media_posts = self._get_media_posts_in_group(chat, post)
            logger.debug(f'got {len(media_posts)=} for {url=}')

-            tmp_dir = item.get_tmp_dir()
+            tmp_dir = ArchivingContext.get_tmp_dir()

            group_id = post.grouped_id if post.grouped_id is not None else post.id
            title = post.message
--- a/src/auto_archiver/archivers/tiktok_archiver.py
+++ b/src/auto_archiver/archivers/tiktok_archiver.py
@@ -3,7 +3,7 @@ import tiktok_downloader
 from loguru import logger

 from . import Archiver
-from ..core import Metadata, Media
+from ..core import Metadata, Media, ArchivingContext


 class TiktokArchiver(Archiver):
@@ -41,7 +41,7 @@ class TiktokArchiver(Archiver):
            logger.warning(f'Other Tiktok error {error}')

        try:
-            filename = os.path.join(item.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4')
+            filename = os.path.join(ArchivingContext.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4')
            tiktok_media = tiktok_downloader.snaptik(url).get_media()

            if len(tiktok_media) <= 0:
--- a/src/auto_archiver/archivers/vk_archiver.py
+++ b/src/auto_archiver/archivers/vk_archiver.py
@@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper

 from ..utils.misc import dump_payload
 from . import Archiver
-from ..core import Metadata, Media
+from ..core import Metadata, Media, ArchivingContext


 class VkArchiver(Archiver):
@@ -50,7 +50,7 @@ class VkArchiver(Archiver):

        result.set_content(dump_payload(vk_scrapes))

-        filenames = self.vks.download_media(vk_scrapes, item.get_tmp_dir())
+        filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
        for filename in filenames:
            result.add_media(Media(filename))

--- a/src/auto_archiver/archivers/youtubedl_archiver.py
+++ b/src/auto_archiver/archivers/youtubedl_archiver.py
@@ -2,7 +2,7 @@ import datetime, os, yt_dlp
 from loguru import logger

 from . import Archiver
-from ..core import Metadata, Media
+from ..core import Metadata, Media, ArchivingContext


 class YoutubeDLArchiver(Archiver):
@@ -25,7 +25,7 @@ class YoutubeDLArchiver(Archiver):
            logger.debug('Using Facebook cookie')
            yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie

-        ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(item.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False})
+        ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False})

        try:
            # don'd download since it can be a live stream
--- a/src/auto_archiver/auto_auto_archive.py
+++ b/src/auto_archiver/auto_auto_archive.py
@@ -1,34 +0,0 @@
-
-#TODO: refactor GDriveStorage before merging to main
-# is it possible to have something like this with the new pipeline?
-
-
-# # import tempfile
-# import auto_archive
-# from loguru import logger
-# from configs import Config
-# from storages import Storage
-
-
-# def main():
-#     c = Config()
-#     c.parse()
-#     logger.info(f'Opening document {c.sheet} to look for sheet names to archive')
-
-#     gc = c.gsheets_client
-#     sh = gc.open(c.sheet)
-
-#     wks = sh.get_worksheet(0)
-#     values = wks.get_all_values()
-
-#     with tempfile.TemporaryDirectory(dir="./") as tmpdir:
-#         Storage.TMP_FOLDER = tmpdir
-#         for i in range(11, len(values)):
-#             c.sheet = values[i][0]
-#             logger.info(f"Processing {c.sheet}")
-#             auto_archive.process_sheet(c)
-#         c.destroy_webdriver()
-
-
-# if __name__ == "__main__":
-#     main()
--- a/src/auto_archiver/core/init.py
+++ b/src/auto_archiver/core/init.py
@@ -1,6 +1,7 @@
-from .media import Media
 from .metadata import Metadata
+from .media import Media
 from .step import Step
+from .context import ArchivingContext

 # cannot import ArchivingOrchestrator/Config to avoid circular dep
 # from .orchestrator import ArchivingOrchestrator
--- a/src/auto_archiver/core/context.py
+++ b/src/auto_archiver/core/context.py
@@ -0,0 +1,52 @@
+from loguru import logger
+
+
+class ArchivingContext:
+    """
+    Singleton context class.
+    ArchivingContext._get_instance() to retrieve it if needed
+    otherwise just 
+    ArchivingContext.set(key, value)
+    and 
+    ArchivingContext.get(key, default)
+
+    When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True)
+        reset(full_reset=True) will recreate everything including the keep_on_reset status
+    """
+    _instance = None
+
+    def __init__(self):
+        self.configs = {}
+        self.keep_on_reset = set()
+
+    @staticmethod
+    def get_instance():
+        if ArchivingContext._instance is None:
+            ArchivingContext._instance = ArchivingContext()
+        return ArchivingContext._instance
+
+    @staticmethod
+    def set(key, value, keep_on_reset: bool = False):
+        ac = ArchivingContext.get_instance()
+        ac.configs[key] = value
+        if keep_on_reset: ac.keep_on_reset.add(key)
+
+    @staticmethod
+    def get(key: str, default=None):
+        return ArchivingContext.get_instance().configs.get(key, default)
+
+    @staticmethod
+    def reset(full_reset: bool = False):
+        ac = ArchivingContext.get_instance()
+        if full_reset: ac.keep_on_reset = set()
+        ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
+
+    # ---- custom getters/setters for widely used context values
+
+    @staticmethod
+    def set_tmp_dir(tmp_dir: str):
+        ArchivingContext.get_instance().configs["tmp_dir"] = tmp_dir
+
+    @staticmethod
+    def get_tmp_dir() -> str:
+        return ArchivingContext.get_instance().configs.get("tmp_dir")
--- a/src/auto_archiver/core/media.py
+++ b/src/auto_archiver/core/media.py
@@ -3,18 +3,46 @@ from __future__ import annotations
 from ast import List
 from typing import Any
 from dataclasses import dataclass, field
-from dataclasses_json import dataclass_json
+from dataclasses_json import dataclass_json, config
 import mimetypes

-# annotation order matters
-@dataclass_json
+from .context import ArchivingContext
+
+from loguru import logger
+
+
+@dataclass_json  # annotation order matters
@dataclass
 class Media:
    filename: str
    key: str = None
    urls: List[str] = field(default_factory=list)
-    _mimetype: str = None  # eg: image/jpeg
    properties: dict = field(default_factory=dict)
+    _mimetype: str = None  # eg: image/jpeg
+    _stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True))  # always exclude
+
+    def store(self: Media, override_storages: List = None, url: str = "url-not-available"):
+        # stores the media into the provided/available storages [Storage]
+        # repeats the process for its properties, in case they have inner media themselves
+        # for now it only goes down 1 level but it's easy to make it recursive if needed
+        storages = override_storages or ArchivingContext.get("storages")
+        if not len(storages):
+            logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
+            return
+
+        for s in storages:
+            s.store(self, url)
+            # Media can be inside media properties, examples include transformations on original media
+            for prop in self.properties.values():
+                if isinstance(prop, Media):
+                    s.store(prop, url)
+                if isinstance(prop, list):
+                    for prop_media in prop:
+                        if isinstance(prop_media, Media):
+                            s.store(prop_media, url)
+
+    def is_stored(self) -> bool:
+        return len(self.urls) > 0 and len(self.urls) == len(ArchivingContext.get("storages"))

    def set(self, key: str, value: Any) -> Media:
        self.properties[key] = value
@@ -40,3 +68,6 @@ class Media:

    def is_video(self) -> bool:
        return self.mimetype.startswith("video")
+
+    def is_audio(self) -> bool:
+        return self.mimetype.startswith("audio")
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@@ -3,24 +3,25 @@ from __future__ import annotations
 from ast import List, Set
 from typing import Any, Union, Dict
 from dataclasses import dataclass, field
-from dataclasses_json import dataclass_json
+from dataclasses_json import dataclass_json, config
 import datetime
 from urllib.parse import urlparse
 from dateutil.parser import parse as parse_dt
 from .media import Media
+from .context import ArchivingContext


-# annotation order matters
-@dataclass_json
+@dataclass_json  # annotation order matters
@dataclass
 class Metadata:
    status: str = "no archiver"
-    _processed_at: datetime = field(default_factory=datetime.datetime.utcnow)
    metadata: Dict[str, Any] = field(default_factory=dict)
-    tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True})  # keys that are not to be saved in DBs
    media: List[Media] = field(default_factory=list)
    rearchivable: bool = True  # defaults to true, archivers can overwrite

+    def __post_init__(self):
+        self.set("_processed_at", datetime.datetime.utcnow())
+
    def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
        """
        merges two Metadata instances, will overwrite according to overwrite_left flag
@@ -30,7 +31,6 @@ class Metadata:
            if right.status and len(right.status):
                self.status = right.status
            self.rearchivable |= right.rearchivable
-            self.tmp_keys |= right.tmp_keys
            for k, v in right.metadata.items():
                assert k not in self.metadata or type(v) == type(self.get(k))
                if type(v) not in [dict, list, set] or k not in self.metadata:
@@ -43,10 +43,14 @@ class Metadata:
            return right.merge(self)
        return self

-    def set(self, key: str, val: Any, is_tmp=False) -> Metadata:
-        # if not self.metadata: self.metadata = {}
+    def store(self: Metadata, override_storages: List = None):
+        # calls .store for all contained media. storages [Storage]
+        storages = override_storages or ArchivingContext.get("storages")
+        for media in self.media:
+            media.store(override_storages=storages, url=self.get_url())
+
+    def set(self, key: str, val: Any) -> Metadata:
        self.metadata[key] = val
-        if is_tmp: self.tmp_keys.add(key)
        return self

    def get(self, key: str, default: Any = None, create_if_missing=False) -> Union[Metadata, str]:
@@ -64,7 +68,7 @@ class Metadata:
        return "success" in self.status

    def is_empty(self) -> bool:
-        return not self.is_success() and len(self.media) == 0 and len(self.get_clean_metadata()) <= 2  # url, processed_at
+        return not self.is_success() and len(self.media) == 0 and len(self.metadata) <= 2  # url, processed_at

    @property  # getter .netloc
    def netloc(self) -> str:
@@ -85,7 +89,8 @@ class Metadata:

    def set_content(self, content: str) -> Metadata:
        # a dump with all the relevant content
-        return self.set("content", content)
+        append_content = (self.get("content", "") + content + "\n").strip()
+        return self.set("content", append_content)

    def set_title(self, title: str) -> Metadata:
        return self.set("title", title)
@@ -93,12 +98,6 @@ class Metadata:
    def get_title(self) -> str:
        return self.get("title")

-    def set_tmp_dir(self, tmp_dir: str) -> Metadata:
-        return self.set("tmp_dir", tmp_dir, True)
-
-    def get_tmp_dir(self) -> str:
-        return self.get("tmp_dir")
-
    def set_timestamp(self, timestamp: datetime.datetime) -> Metadata:
        if type(timestamp) == str:
            timestamp = parse_dt(timestamp)
@@ -139,8 +138,5 @@ class Metadata:
        _default = self.media[0] if len(self.media) else None
        return self.get_media_by_id("_final_media", _default)

-    def get_clean_metadata(self) -> Metadata:
-        return dict(
-            {k: v for k, v in self.metadata.items() if k not in self.tmp_keys},
-            **{"processed_at": self._processed_at}
-        )
+    def __str__(self) -> str:
+        return self.__repr__()
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -2,6 +2,8 @@ from __future__ import annotations
 from ast import List
 from typing import Union

+from .context import ArchivingContext
+
 from ..archivers import Archiver
 from ..feeders import Feeder
 from ..formatters import Formatter
@@ -23,6 +25,7 @@ class ArchivingOrchestrator:
        self.archivers: List[Archiver] = config.archivers
        self.databases: List[Database] = config.databases
        self.storages: List[Storage] = config.storages
+        ArchivingContext.set("storages", self.storages, keep_on_reset=True)

        for a in self.archivers: a.setup()

@@ -32,8 +35,9 @@ class ArchivingOrchestrator:

    def feed_item(self, item: Metadata) -> Metadata:
        try:
+            ArchivingContext.reset()
            with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
-                item.set_tmp_dir(tmp_dir)
+                ArchivingContext.set_tmp_dir(tmp_dir)
                return self.archive(item)
        except KeyboardInterrupt:
            # catches keyboard interruptions to do a clean exit
@@ -89,7 +93,7 @@ class ArchivingOrchestrator:
                # Q: should this be refactored so it's just a.download(result)?
                result.merge(a.download(result))
                if result.is_success(): break
-            except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}")
+            except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}: {traceback.format_exc()}")

        # what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
        # should it call the HTMLgenerator as if it's not an enrichment?
@@ -101,26 +105,16 @@ class ArchivingOrchestrator:
        # eg: screenshot, wacz, webarchive, thumbnails
        for e in self.enrichers:
            try: e.enrich(result)
-            except Exception as exc: logger.error(f"Unexpected error with enricher {e.name}: {exc}")
+            except Exception as exc: logger.error(f"Unexpected error with enricher {e.name}: {exc}: {traceback.format_exc()}")

        # 5 - store media
        # looks for Media in result.media and also result.media[x].properties (as list or dict values)
-        for s in self.storages:
-            for m in result.media:
-                s.store(m, result)  # modifies media
-                # Media can be inside media properties, examples include transformations on original media
-                for prop in m.properties.values():
-                    if isinstance(prop, Media):
-                        s.store(prop, result)
-                    if isinstance(prop, list) and len(prop) > 0 and isinstance(prop[0], Media):
-                        for prop_media in prop:
-                            s.store(prop_media, result)
+        result.store()

        # 6 - format and store formatted if needed
        # enrichers typically need access to already stored URLs etc
        if (final_media := self.formatter.format(result)):
-            for s in self.storages:
-                s.store(final_media, result)
+            final_media.store(url=url)
            result.set_final_media(final_media)

        if result.is_empty():
--- a/src/auto_archiver/databases/gsheet_db.py
+++ b/src/auto_archiver/databases/gsheet_db.py
@@ -5,8 +5,7 @@ from urllib.parse import quote
 from loguru import logger

 from . import Database
-from ..core import Metadata
-from ..core import Media
+from ..core import Metadata, Media, ArchivingContext
 from ..utils import GWorksheet


@@ -63,8 +62,9 @@ class GsheetsDb(Database):
            batch_if_valid('archive', "\n".join(media.urls))
        batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
        batch_if_valid('title', item.get_title())
-        batch_if_valid('text', item.get("content", "")[:500])
+        batch_if_valid('text', item.get("content", ""))
        batch_if_valid('timestamp', item.get_timestamp())
+        batch_if_valid('hash', media.get("hash", "not-calculated"))
        if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
            batch_if_valid('screenshot', "\n".join(screenshot.urls))

@@ -86,7 +86,7 @@ class GsheetsDb(Database):
            logger.debug(f"Unable to update sheet: {e}")

    def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
-        # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now
-        gw: GWorksheet = item.get("gsheet").get("worksheet")
-        row: int = item.get("gsheet").get("row")
+        # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
+        gw: GWorksheet = ArchivingContext.get("gsheet").get("worksheet")
+        row: int = ArchivingContext.get("gsheet").get("row")
        return gw, row
--- a/src/auto_archiver/enrichers/init.py
+++ b/src/auto_archiver/enrichers/init.py
@@ -3,4 +3,5 @@ from .screenshot_enricher import ScreenshotEnricher
 from .wayback_enricher import WaybackArchiverEnricher
 from .hash_enricher import HashEnricher
 from .thumbnail_enricher import ThumbnailEnricher
-from .wacz_enricher import WaczEnricher
+from .wacz_enricher import WaczEnricher
+from .whisper_enricher import WhisperEnricher
--- a/src/auto_archiver/enrichers/hash_enricher.py
+++ b/src/auto_archiver/enrichers/hash_enricher.py
@@ -2,7 +2,7 @@ import hashlib
 from loguru import logger

 from . import Enricher
-from ..core import Metadata
+from ..core import Metadata, ArchivingContext


 class HashEnricher(Enricher):
@@ -16,11 +16,14 @@ class HashEnricher(Enricher):
        super().__init__(config)
        algo_choices = self.configs()["algorithm"]["choices"]
        assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
+        self.chunksize = int(self.chunksize)
+        ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)

    @staticmethod
    def configs() -> dict:
        return {
-            "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}
+            "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
+            "chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
        }

    def enrich(self, to_enrich: Metadata) -> None:
@@ -28,12 +31,19 @@ class HashEnricher(Enricher):
        logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")

        for i, m in enumerate(to_enrich.media):
-            with open(m.filename, "rb") as f:
-                bytes = f.read()  # read entire file as bytes
-                hash = None
-                if self.algorithm == "SHA-256":
-                    hash = hashlib.sha256(bytes)
-                elif self.algorithm == "SHA3-512":
-                    hash = hashlib.sha3_512(bytes)
-                else: continue
-                to_enrich.media[i].set("hash", f"{self.algorithm}:{hash.hexdigest()}")
+            if len(hd := self.calculate_hash(m.filename)):
+                to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
+
+    def calculate_hash(self, filename):
+        hash = None
+        if self.algorithm == "SHA-256":
+            hash = hashlib.sha256()
+        elif self.algorithm == "SHA3-512":
+            hash = hashlib.sha3_512()
+        else: return ""
+        with open(filename, "rb") as f:
+            while True:
+                buf = f.read(self.chunksize)
+                if not buf: break
+                hash.update(buf)
+        return hash.hexdigest()
--- a/src/auto_archiver/enrichers/screenshot_enricher.py
+++ b/src/auto_archiver/enrichers/screenshot_enricher.py
@@ -4,7 +4,7 @@ from selenium.common.exceptions import TimeoutException

 from . import Enricher
 from ..utils import Webdriver, UrlUtil
-from ..core import Media, Metadata
+from ..core import Media, Metadata, ArchivingContext

 class ScreenshotEnricher(Enricher):
    name = "screenshot_enricher"
@@ -14,7 +14,8 @@ class ScreenshotEnricher(Enricher):
        return {
            "width": {"default": 1280, "help": "width of the screenshots"},
            "height": {"default": 720, "help": "height of the screenshots"},
-            "timeout": {"default": 60, "help": "timeout for taking the screenshot"}
+            "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
+            "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}
        }

    def enrich(self, to_enrich: Metadata) -> None:
@@ -27,12 +28,11 @@ class ScreenshotEnricher(Enricher):
        with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:
            try:
                driver.get(url)
-                time.sleep(2)
-                screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
+                time.sleep(int(self.sleep_before_screenshot))
+                screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
                driver.save_screenshot(screenshot_file)
                to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
            except TimeoutException:
                logger.info("TimeoutException loading page for screenshot")
            except Exception as e:
                logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
-        # return None
--- a/src/auto_archiver/enrichers/thumbnail_enricher.py
+++ b/src/auto_archiver/enrichers/thumbnail_enricher.py
@@ -2,7 +2,7 @@ import ffmpeg, os, uuid
 from loguru import logger

 from . import Enricher
-from ..core import Media, Metadata
+from ..core import Media, Metadata, ArchivingContext


 class ThumbnailEnricher(Enricher):
@@ -23,7 +23,7 @@ class ThumbnailEnricher(Enricher):
        logger.debug(f"generating thumbnails")
        for i, m in enumerate(to_enrich.media[::]):
            if m.is_video():
-                folder = os.path.join(to_enrich.get_tmp_dir(), str(uuid.uuid4()))
+                folder = os.path.join(ArchivingContext.get_tmp_dir(), str(uuid.uuid4()))
                os.makedirs(folder, exist_ok=True)
                logger.debug(f"generating thumbnails for {m.filename}")
                fps, duration = 0.5, m.get("duration")
--- a/src/auto_archiver/enrichers/wacz_enricher.py
+++ b/src/auto_archiver/enrichers/wacz_enricher.py
@@ -1,7 +1,7 @@
 import os, shutil, subprocess, uuid
 from loguru import logger

-from ..core import Media, Metadata
+from ..core import Media, Metadata, ArchivingContext
 from . import Enricher
 from ..utils import UrlUtil

@@ -26,36 +26,58 @@ class WaczEnricher(Enricher):

    def enrich(self, to_enrich: Metadata) -> bool:
        # TODO: figure out support for browsertrix in docker
+
        url = to_enrich.get_url()

        if UrlUtil.is_auth_wall(url):
            logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
            return
-
-        logger.debug(f"generating WACZ for {url=}")
+        
        collection = str(uuid.uuid4())[0:8]
-        browsertrix_home = os.path.abspath(to_enrich.get_tmp_dir())
-        cmd = [
-            "docker", "run",
-            "--rm",  # delete container once it has completed running
-            "-v", f"{browsertrix_home}:/crawls/",
-            # "-it", # this leads to "the input device is not a TTY"
-            "webrecorder/browsertrix-crawler", "crawl",
-            "--url", url,
-            "--scopeType", "page",
-            "--generateWACZ",
-            "--text",
-            "--collection", collection,
-            "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
-            "--behaviorTimeout", str(self.timeout),
-            "--timeout", str(self.timeout)
-        ]
-        if self.profile:
-            profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
-            shutil.copyfile(self.profile, profile_fn)
-            # TODO: test which is right
-            cmd.extend(["--profile", profile_fn])
-            # cmd.extend(["--profile", "/crawls/profile.tar.gz"])
+        browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
+        
+        if os.getenv('RUNNING_IN_DOCKER'):
+            logger.debug(f"generating WACZ without Docker for {url=}")
+
+            cmd = [
+                "crawl",
+                "--url", url,
+                "--scopeType", "page",
+                "--generateWACZ",
+                "--text",
+                "--collection", collection,
+                "--id", collection,
+                "--saveState", "never",
+                "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
+                "--behaviorTimeout", str(self.timeout),
+                "--timeout", str(self.timeout),
+                "--profile", str(self.profile)
+            ]
+        else:
+            logger.debug(f"generating WACZ in Docker for {url=}")
+            
+            cmd = [
+                "docker", "run",
+                "--rm",  # delete container once it has completed running
+                "-v", f"{browsertrix_home}:/crawls/",
+                # "-it", # this leads to "the input device is not a TTY"
+                "webrecorder/browsertrix-crawler", "crawl",
+                "--url", url,
+                "--scopeType", "page",
+                "--generateWACZ",
+                "--text",
+                "--collection", collection,
+                "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
+                "--behaviorTimeout", str(self.timeout),
+                "--timeout", str(self.timeout)
+            ]
+
+            if self.profile:
+                profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
+                shutil.copyfile(self.profile, profile_fn)
+                # TODO: test which is right
+                cmd.extend(["--profile", profile_fn])
+                # cmd.extend(["--profile", "/crawls/profile.tar.gz"])

        try:
            logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
@@ -64,7 +86,13 @@ class WaczEnricher(Enricher):
            logger.error(f"WACZ generation failed: {e}")
            return False

-        filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
+        
+
+        if os.getenv('RUNNING_IN_DOCKER'):
+            filename = os.path.join("collections", collection, f"{collection}.wacz")
+        else:
+            filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
+        
        if not os.path.exists(filename):
            logger.warning(f"Unable to locate and upload WACZ  {filename=}")
            return False
--- a/src/auto_archiver/enrichers/whisper_enricher.py
+++ b/src/auto_archiver/enrichers/whisper_enricher.py
@@ -0,0 +1,130 @@
+import traceback
+import requests, time
+from loguru import logger
+
+from . import Enricher
+from ..core import Metadata, Media, ArchivingContext
+from ..storages import S3Storage
+
+
+class WhisperEnricher(Enricher):
+    """
+    Connects with a Whisper API service to get texts out of audio
+    whisper API repository: TODO
+    Only works if an S3 compatible storage is used
+    """
+    name = "whisper_enricher"
+
+    def __init__(self, config: dict) -> None:
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+        assert type(self.api_key) == str and len(self.api_key) > 0, "please provide a value for the whisper_enricher api_key"
+        self.timeout = int(self.timeout)
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "api_endpoint": {"default": "https://whisper.spoettel.dev/api/v1", "help": "WhisperApi api endpoint"},
+            "api_key": {"default": None, "help": "WhisperApi api key for authentication"},
+            "include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
+            "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
+            "action": {"default": "translation", "help": "which Whisper operation to execute", "choices": ["transcript", "translation", "language_detection"]},
+
+        }
+
+    def enrich(self, to_enrich: Metadata) -> None:
+        if not self._get_s3_storage():
+            logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
+            return
+
+        url = to_enrich.get_url()
+        logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")
+
+        job_results = {}
+        for i, m in enumerate(to_enrich.media):
+            if m.is_video() or m.is_audio():
+                m.store(url=url)
+                try:
+                    job_id = self.submit_job(m)
+                    job_results[job_id] = False
+                    logger.debug(f"JOB SUBMITTED: {job_id=} for {m.key=}")
+                    to_enrich.media[i].set("whisper_model", {"job_id": job_id})
+                except Exception as e:
+                    logger.error(f"Failed to submit whisper job for {m.filename=} with error {e}\n{traceback.format_exc()}")
+
+        job_results = self.check_jobs(job_results)
+
+        for i, m in enumerate(to_enrich.media):
+            if m.is_video() or m.is_audio():
+                job_id = to_enrich.media[i].get("whisper_model")["job_id"]
+                to_enrich.media[i].set("whisper_model", {
+                    "job_id": job_id,
+                    **(job_results[job_id] if job_results[job_id] else {"result": "incomplete or failed job"})
+                })
+                # append the extracted text to the content of the post so it gets written to the DBs like gsheets text column
+                if job_results[job_id]:
+                    for k,v in job_results[job_id].items():
+                        if "_text" in k and len(v):
+                            to_enrich.set_content(f"\n[automatic video transcript]: {v}")
+
+    def submit_job(self, media: Media):
+        s3 = self._get_s3_storage()
+        s3_url = s3.get_cdn_url(media)
+        assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls "
+        payload = {
+            "url": s3_url,
+            "type": self.action,
+            # "language": "string" # may be a config
+        }
+        response = requests.post(f'{self.api_endpoint}/jobs', json=payload, headers={'Authorization': f'Bearer {self.api_key}'})
+        assert response.status_code == 201, f"calling the whisper api {self.api_endpoint} returned a non-success code: {response.status_code}"
+        logger.debug(response.json())
+        return response.json()['id']
+
+    def check_jobs(self, job_results: dict):
+        start_time = time.time()
+        all_completed = False
+        while not all_completed and (time.time() - start_time) <= self.timeout:
+            all_completed = True
+            for job_id in job_results:
+                if job_results[job_id] != False: continue
+                all_completed = False  # at least one not ready
+                try: job_results[job_id] = self.check_job(job_id)
+                except Exception as e:
+                    logger.error(f"Failed to check {job_id=} with error {e}\n{traceback.format_exc()}")
+            if not all_completed: time.sleep(3)
+        return job_results
+
+    def check_job(self, job_id):
+        r = requests.get(f'{self.api_endpoint}/jobs/{job_id}', headers={'Authorization': f'Bearer {self.api_key}'})
+        assert r.status_code == 200, f"Job status did not respond with 200, instead with: {r.status_code}"
+        j = r.json()
+        logger.debug(f"Checked job {job_id=} with status='{j['status']}'")
+        if j['status'] == "processing": return False
+        elif j['status'] == "error": return f"Error: {j['meta']['error']}"
+        elif j['status'] == "success":
+            r_res = requests.get(f'{self.api_endpoint}/jobs/{job_id}/artifacts', headers={'Authorization': f'Bearer {self.api_key}'})
+            assert r_res.status_code == 200, f"Job artifacts did not respond with 200, instead with: {r_res.status_code}"
+            logger.success(r_res.json())
+            result = {}
+            for art_id, artifact in enumerate(r_res.json()):
+                subtitle = []
+                full_text = []
+                for i, d in enumerate(artifact.get("data")):
+                    subtitle.append(f"{i+1}\n{d.get('start')} --> {d.get('end')}\n{d.get('text').strip()}")
+                    full_text.append(d.get('text').strip())
+                if not len(subtitle): continue
+                if self.include_srt: result[f"artifact_{art_id}_subtitle"] = "\n".join(subtitle)
+                result[f"artifact_{art_id}_text"] = "\n".join(full_text)
+            # call /delete endpoint on timely success
+            r_del = requests.delete(f'{self.api_endpoint}/jobs/{job_id}', headers={'Authorization': f'Bearer {self.api_key}'})
+            logger.debug(f"DELETE whisper {job_id=} result: {r_del.status_code}")
+            return result
+        return False
+
+    def _get_s3_storage(self) -> S3Storage:
+        try:
+            return next(s for s in ArchivingContext.get("storages") if s.__class__ == S3Storage)
+        except:
+            logger.warning("No S3Storage instance found in storages")
+            return
--- a/src/auto_archiver/feeders/cli_feeder.py
+++ b/src/auto_archiver/feeders/cli_feeder.py
@@ -1,7 +1,7 @@
 from loguru import logger

 from . import Feeder
-from ..core import Metadata
+from ..core import Metadata, ArchivingContext


 class CLIFeeder(Feeder):
@@ -26,5 +26,7 @@ class CLIFeeder(Feeder):
    def __iter__(self) -> Metadata:
        for url in self.urls:
            logger.debug(f"Processing {url}")
-            yield Metadata().set_url(url).set("folder", "cli", True)
+            yield Metadata().set_url(url)
+            ArchivingContext.set("folder", "cli")
+
        logger.success(f"Processed {len(self.urls)} URL(s)")
--- a/src/auto_archiver/feeders/gsheet_feeder.py
+++ b/src/auto_archiver/feeders/gsheet_feeder.py
@@ -5,9 +5,10 @@ from slugify import slugify

 # from . import Enricher
 from . import Feeder
-from ..core import Metadata
+from ..core import Metadata, ArchivingContext
 from ..utils import Gsheets, GWorksheet

+
 class GsheetsFeeder(Gsheets, Feeder):
    name = "gsheet_feeder"

@@ -31,7 +32,7 @@ class GsheetsFeeder(Gsheets, Feeder):
                    "help": "(CSV) explicitly block some worksheets from being processed",
                    "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
                },
-                "use_sheet_names_in_stored_paths":{
+                "use_sheet_names_in_stored_paths": {
                    "default": True,
                    "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
                }
@@ -61,11 +62,17 @@ class GsheetsFeeder(Gsheets, Feeder):
                if status not in ['', None]: continue

                # All checks done - archival process starts here
-                m = Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)
-                if self.use_sheet_names_in_stored_paths:
-                    m.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
+                m = Metadata().set_url(url)
+                ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
+                folder = slugify(gw.get_cell(row, 'folder').strip())
+                if len(folder):
+                    if self.use_sheet_names_in_stored_paths:
+                        ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True)
+                    else:
+                        ArchivingContext.set("folder", folder, True)
+
                yield m
-                
+
            logger.success(f'Finished worksheet {wks.title}')

    def should_process_sheet(self, sheet_name: str) -> bool:
--- a/src/auto_archiver/formatters/html_formatter.py
+++ b/src/auto_archiver/formatters/html_formatter.py
@@ -6,8 +6,9 @@ from urllib.parse import quote
 from loguru import logger

 from ..version import __version__
-from ..core import Metadata, Media
+from ..core import Metadata, Media, ArchivingContext
 from . import Formatter
+from ..enrichers import HashEnricher


@dataclass
@@ -40,17 +41,22 @@ class HtmlFormatter(Formatter):
            url=url,
            title=item.get_title(),
            media=item.media,
-            metadata=item.get_clean_metadata(),
+            metadata=item.metadata,
            version=__version__
        )
-        html_path = os.path.join(item.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
+        html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
        with open(html_path, mode="w", encoding="utf-8") as outf:
            outf.write(content)
-        return Media(filename=html_path)
+        final_media = Media(filename=html_path)
+
+        he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
+        if len(hd := he.calculate_hash(final_media.filename)):
+            final_media.set("hash", f"{he.algorithm}:{hd}")
+
+        return final_media


 # JINJA helper filters
-
 class JinjaHelpers:
    @staticmethod
    def is_list(v) -> bool:
--- a/src/auto_archiver/formatters/templates/html_template.html
+++ b/src/auto_archiver/formatters/templates/html_template.html
@@ -29,7 +29,7 @@
            margin: auto;
            border: 1px solid;
            border-collapse: collapse;
-            vertical-align:top;
+            vertical-align: top;
        }

        table.metadata td:first-child {
@@ -42,7 +42,7 @@
        }

        .copy:hover {
-            font-weight: 600;
+            background: aliceblue;
            cursor: copy;
        }

@@ -185,7 +185,11 @@
        el.addEventListener("copy", (e) => {
            e.preventDefault();
            if (e.clipboardData) {
-                e.clipboardData.setData("text/plain", el.textContent);
+                if (el.hasAttribute("copy-value")) {
+                    e.clipboardData.setData("text/plain", el.getAttribute("copy-value"));
+                } else {
+                    e.clipboardData.setData("text/plain", el.textContent);
+                }
                console.log(e.clipboardData.getData("text"))
                showNotification("copied!")
            }
--- a/src/auto_archiver/formatters/templates/macros.html
+++ b/src/auto_archiver/formatters/templates/macros.html
@@ -46,14 +46,16 @@ No preview available for {{ m.key }}.
 {% endif %}
 {% if links %}
 <a href="{{ url }}">open</a> or
-<a href="{{ url }}" download="">download</a>
+<a href="{{ url }}" download="">download</a> or
+{{ copy_urlize(url, "copy") }}
+
 <br>
 {% endif %}
 {% endfor %}

 {%- endmacro -%}

-{% macro copy_urlize(val) -%}
+{% macro copy_urlize(val, href_text) -%}

 {% if val is mapping %}
 <ul>
@@ -65,7 +67,11 @@ No preview available for {{ m.key }}.
 </ul>

 {% else %}
+{% if href_text  | length == 0 %}
 <span class="copy">{{ val | string | urlize }}</span>
+{% else %}
+<span class="copy" copy-value="{{val}}">{{ href_text | string | urlize }}</span>
+{% endif %}
 {% endif %}

 {%- endmacro -%}
--- a/src/auto_archiver/storages/storage.py
+++ b/src/auto_archiver/storages/storage.py
@@ -1,10 +1,10 @@
 from __future__ import annotations
 from abc import abstractmethod
 from dataclasses import dataclass
-import hashlib
-from typing import IO, Any
+from typing import IO

-from ..core import Media, Metadata, Step
+from ..core import Media, Step, ArchivingContext
+from ..enrichers import HashEnricher
 from loguru import logger
 import os, uuid
 from slugify import slugify
@@ -41,8 +41,11 @@ class Storage(Step):
        # only for typing...
        return Step.init(name, config, Storage)

-    def store(self, media: Media, item: Metadata) -> None:
-        self.set_key(media, item)
+    def store(self, media: Media, url: str) -> None:
+        if media.is_stored(): 
+            logger.debug(f"{media.key} already stored, skipping")
+            return
+        self.set_key(media, url)
        self.upload(media)
        media.add_url(self.get_cdn_url(media))

@@ -57,25 +60,25 @@ class Storage(Step):
        with open(media.filename, 'rb') as f:
            return self.uploadf(f, media, **kwargs)

-    def set_key(self, media: Media, item: Metadata) -> None:
+    def set_key(self, media: Media, url) -> None:
        """takes the media and optionally item info and generates a key"""
        if media.key is not None and len(media.key) > 0: return
-        folder = item.get("folder", "")
+        folder = ArchivingContext.get("folder", "")
        filename, ext = os.path.splitext(media.filename)

        # path_generator logic
-        if self.path_generator == "flat": 
+        if self.path_generator == "flat":
            path = ""
-            filename = slugify(filename) # in case it comes with os.sep
-        elif self.path_generator == "url": path = slugify(item.get_url())
+            filename = slugify(filename)  # in case it comes with os.sep
+        elif self.path_generator == "url": path = slugify(url)
        elif self.path_generator == "random":
-            path = item.get("random_path", str(uuid.uuid4())[:16], True)
+            path = ArchivingContext.get("random_path", str(uuid.uuid4())[:16], True)

        # filename_generator logic
        if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
-        elif self.filename_generator == "static": 
-            with open(media.filename, "rb") as f:
-                bytes = f.read()  # read entire file as bytes
-            filename = hashlib.sha256(bytes).hexdigest()[:24]
+        elif self.filename_generator == "static":
+            he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
+            hd = he.calculate_hash(media.filename)
+            filename = hd[:24]

-        media.key = os.path.join(folder, path, f"{filename}{ext}")
+        media.key = os.path.join(folder, path, f"{filename}{ext}")
--- a/src/auto_archiver/utils/gsheet.py
+++ b/src/auto_archiver/utils/gsheet.py
@@ -30,11 +30,9 @@ class Gsheets(Step):
                    'archive': 'archive location',
                    'date': 'archive date',
                    'thumbnail': 'thumbnail',
-                    'thumbnail_index': 'thumbnail index',
                    'timestamp': 'upload timestamp',
                    'title': 'upload title',
                    'text': 'text content',
-                    'duration': 'duration',
                    'screenshot': 'screenshot',
                    'hash': 'hash',
                    'wacz': 'wacz',
--- a/src/auto_archiver/utils/gworksheet.py
+++ b/src/auto_archiver/utils/gworksheet.py
@@ -15,10 +15,8 @@ class GWorksheet:
        'archive': 'archive location',
        'date': 'archive date',
        'thumbnail': 'thumbnail',
-        'thumbnail_index': 'thumbnail index',
        'timestamp': 'upload timestamp',
        'title': 'upload title',
-        'duration': 'duration',
        'screenshot': 'screenshot',
        'hash': 'hash',
        'wacz': 'wacz',
@@ -98,7 +96,7 @@ class GWorksheet:
        cell_updates = [
            {
                'range': self.to_a1(row, col),
-                'values': [[val]]
+                'values': [[str(val)[0:49999]]]
            }
            for row, col, val in cell_updates
        ]
--- a/src/auto_archiver/version.py
+++ b/src/auto_archiver/version.py
@@ -1,9 +1,9 @@

 _MAJOR = "0"
-_MINOR = "4"
+_MINOR = "5"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "2"
+_PATCH = "12"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
Author	SHA1	Message	Date
msramalho	b7c69c0f0d	Bump version to v0.5.12 for release	2023-05-10 18:58:34 +01:00
msramalho	c98991cdfb	fix: vk-url-scraper version update	2023-05-10 18:57:45 +01:00
msramalho	45b982ec38	fix: max chars on sheets cell	2023-05-10 18:57:33 +01:00
msramalho	e11be449e8	fix: delete completed whisper tasks	2023-05-10 18:57:17 +01:00
Logan Williams	134bf09257	Fix typo	2023-05-10 16:05:06 +02:00
Logan Williams	417ca9ef51	Limit build platforms to those supported by webrecorder	2023-05-10 16:03:51 +02:00
Logan Williams	5b79dcb80c	Configure multi-platform docker builds	2023-05-10 16:01:23 +02:00
msramalho	52d7b4a016	Merge branch 'dockerize' of https://github.com/bellingcat/auto-archiver into dockerize	2023-05-10 13:29:45 +01:00
msramalho	31f6aae7b9	fix: screenshots in docker	2023-05-10 13:29:42 +01:00
Logan Williams	26373d4545	Re-order README slightly	2023-05-10 11:48:34 +02:00
Logan Williams	7a34915f8e	Remove old auto auto archiver file	2023-05-10 11:16:54 +02:00
Miguel Sozinho Ramalho	b67a7b818a	Merge pull request #75 from bellingcat/feature/browsertrix	2023-05-10 10:14:40 +01:00
Logan Williams	2e63cb8411	Update README with new entrypoint	2023-05-10 11:13:47 +02:00
Logan Williams	9cb73c073f	Simplify entrypoint	2023-05-10 11:08:49 +02:00
msramalho	9d078a648f	version bump	2023-05-10 09:57:47 +01:00
msramalho	e150370657	updates docker instructions	2023-05-10 09:51:53 +01:00
Miguel Sozinho Ramalho	4116c90168	Merge pull request #74 from bellingcat/feature/browsertrix	2023-05-10 09:36:41 +01:00
Logan Williams	2c5b115fbe	Fix lock file issue	2023-05-09 19:34:16 +02:00
Logan Williams	bda812f850	Clean up comments	2023-05-09 19:34:16 +02:00
Logan Williams	ac82764ffc	Working, but some cleanup still necessary	2023-05-09 19:34:16 +02:00
Logan Williams	0fae7d96fb	Detect running in docker container in WACZ enricher	2023-05-09 19:34:16 +02:00
Logan Williams	2f7181ced6	Use browsertrix base image	2023-05-09 19:34:16 +02:00
msramalho	9c25b33f1c	fix: multiple storages with folder column	2023-05-09 12:14:07 +01:00
msramalho	ae3e607705	fix: depreacating thumbnail_index	2023-05-09 11:29:05 +01:00
msramalho	c1a60fde8a	fix: deprecates duration column	2023-05-09 11:26:19 +01:00
msramalho	875e1de589	feat: re-enable HASH on gsheet	2023-05-09 11:17:44 +01:00
msramalho	8f3d4e05c3	fixing bug in whisper wnericher	2023-05-04 09:36:10 +01:00
msramalho	3bd6bed825	Bump version to v0.5.10 for release	2023-05-02 19:44:00 +01:00
msramalho	2659675f06	skip trim	2023-05-02 19:06:10 +01:00
msramalho	9d44f4b207	content append instead of replace	2023-05-02 19:06:00 +01:00
msramalho	5b0bff612e	whisper transcripts to content	2023-05-02 19:05:32 +01:00
msramalho	ae7ceba0e5	better debug	2023-05-02 19:05:18 +01:00
msramalho	97821a81bc	log cleanup	2023-05-02 19:05:06 +01:00
msramalho	9191b38cf2	tbot archiver works	2023-05-02 19:04:51 +01:00
msramalho	567edfc35e	Bump version to v0.5.8 for release	2023-05-02 14:30:49 +01:00
msramalho	8c22a9df72	fixes "url-not-found"	2023-05-02 14:30:07 +01:00
msramalho	d2d6db162b	Bump version to v0.5.7 for release	2023-04-18 19:28:51 +01:00
msramalho	5cfbcc0137	html template copy ux	2023-04-18 19:28:43 +01:00
msramalho	5fdaa6c739	whisper improvements	2023-04-18 19:28:36 +01:00
msramalho	3d389ee05b	add url info	2023-04-18 19:14:47 +01:00
msramalho	0ecbed0df0	Bump version to v0.5.6 for release	2023-04-18 18:49:08 +01:00
msramalho	69bcfea2eb	to_json fix	2023-04-18 18:48:51 +01:00
msramalho	2e2e695444	whisper enricher	2023-03-23 18:50:37 +00:00
msramalho	493055a8d9	cleanup	2023-03-23 18:50:30 +00:00
msramalho	6f6eb2db7a	Archiving Context refactor complete	2023-03-23 14:28:45 +00:00
msramalho	906ed0f6e0	creating global context and refactoring tmp_dir logic	2023-03-23 11:17:38 +00:00
msramalho	39818e648a	Bump version to v0.4.5 for release	2023-03-16 15:05:42 +00:00
Miguel Sozinho Ramalho	2bbf534d67	Merge pull request #72 from milesmcc/patch-1 Fix hash enricher for flatfile output (closes #71)	2023-03-16 15:04:55 +00:00
R. Miles McCain	6be7536fad	Fix hash enricher for flatfile output (closes #71 )	2023-03-14 13:37:54 -07:00
msramalho	0654e8c5c6	hash calculation in chunks to avoid exhausting RAM	2023-03-10 11:34:29 +00:00
msramalho	0e3c427371	Bump version to v0.4.3 for release	2023-02-27 10:30:06 +01:00