Bump version to v0.5.12 for release

fix: vk-url-scraper version update
fix: max chars on sheets cell
2026-06-12 13:18:28 +03:00 · 2023-05-10 18:58:34 +01:00 · 2023-05-10 18:57:45 +01:00 · 2023-05-10 18:57:33 +01:00 · 2023-05-10 18:57:17 +01:00 · 2023-05-10 16:05:06 +02:00
35 changed files with 1604 additions and 673 deletions
--- a/.github/workflows/docker-publish.yaml
+++ b/.github/workflows/docker-publish.yaml
@@ -27,6 +27,14 @@ jobs:
      - name: Check out the repo
        uses: actions/checkout@v3
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v1
      # https://github.com/docker/setup-buildx-action
      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v1
      - name: Log in to Docker Hub
        uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
        with:
@@ -40,9 +48,10 @@ jobs:
          images: bellingcat/auto-archiver
      - name: Build and push Docker image
-        uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
+        uses: docker/build-push-action@v2
        with:
          context: .
-          push: true
+          platforms: linux/amd64,linux/arm64
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
--- a/31
+++ b/31
@@ -1,35 +1,36 @@
-# stage 1 - all dependencies
+FROM webrecorder/browsertrix-crawler:latest
-From python:3.10
+
 ENV RUNNING_IN_DOCKER=1
 WORKDIR /app
 # TODO: use custom ffmpeg builds instead of apt-get install
 RUN pip install --upgrade pip && \
 	pip install pipenv && \
 	add-apt-repository ppa:mozillateam/ppa && \
 	apt-get update && \
-	apt-get install -y gcc ffmpeg fonts-noto firefox-esr && \
+	apt-get install -y gcc ffmpeg fonts-noto && \
-	wget https://github.com/mozilla/geckodriver/releases/download/v0.32.0/geckodriver-v0.32.0-linux64.tar.gz && \
+	apt-get install -y --no-install-recommends firefox-esr && \
 	ln -s /usr/bin/firefox-esr /usr/bin/firefox && \
 	wget https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-linux64.tar.gz && \
 	tar -xvzf geckodriver* -C /usr/local/bin && \
 	chmod +x /usr/local/bin/geckodriver && \
 	rm geckodriver-v*
 # install docker for WACZ
 # TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66
 # RUN curl -fsSL https://get.docker.com | sh
 # TODO: avoid copying unnecessary files, including .git
-COPY Pipfile Pipfile.lock ./
+COPY Pipfile* ./
-RUN pipenv install --python=3.10 --system --deploy
+RUN pipenv install
-# ENV IS_DOCKER=1
+
 # doing this at the end helps during development, builds are quick
 COPY ./src/ . 
 # TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
 # RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
 # USER archiver
 ENTRYPOINT ["python"]
 # ENTRYPOINT ["docker-entrypoint.sh"]
-# should be executed with 2 volumes (3 if local_storage)
+
-# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets  -v $PWD/local_archive:/app/local_archive aa --help
+ENTRYPOINT ["pipenv", "run", "python3", "-m", "auto_archiver"]
 # should be executed with 2 volumes (3 if local_storage is used)
 # docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml
--- a/6
+++ b/6
@@ -30,9 +30,13 @@ cryptography = "==38.0.4"
 dataclasses-json = "*"
 yt-dlp = ">=2023.2.17"
 vk-url-scraper = "*"
 uwsgi = "*"
 requests = {extras = ["socks"], version = "*"}
 # wacz = "==0.4.8"
 pywb = ">=2.7.3"
 [requires]
-python_version = "3.9"
+python_version = "3.10"
 [dev-packages]
 autopep8 = "*"
--- a/Pipfile.lock
+++ b/Pipfile.lock
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ Docker works like a virtual machine running inside your computer, it isolates ev
 1. install [docker](https://docs.docker.com/get-docker/)
 2. pull the auto-archiver docker [image](https://hub.docker.com/r/bellingcat/auto-archiver) with `docker pull bellingcat/auto-archiver`
-3. run the docker image locally in a container: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver -m auto_archiver  --config secrets/orchestration.yaml` breaking this command down:
+3. run the docker image locally in a container: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml` breaking this command down:
   1. `docker run` tells docker to start a new container (an instance of the image)
   2. `--rm` makes sure this container is removed after execution (less garbage locally)
   3. `-v $PWD/secrets:/app/secrets` - your secrets folder
@@ -87,11 +87,9 @@ The archiver work is orchestrated by the following workflow (we call each a **st
 4. **Formatter** creates a report from all the archived content (HTML, PDF, ...)
 5. **Database** knows what's been archived and also stores the archive result (spreadsheet, CSV, or just the console)
-To check all available steps (which archivers, storages, databses, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
+To setup an auto-archiver instance, instance, create an `orchestration.yaml` which contains the workflow you would like. We advise you put this file into a `secrets/` folder and do not share it with others because it will contain passwords and other secrets. 
-The great thing is you configure all the workflow in your `orchestration.yaml` file which we advise you put into a `secrets/` folder and don't share it with others because it will contain passwords and other secrets. 
+The structure of orchestration file is split into 2 parts: `steps` (what **steps** to use) and `configurations` (how those steps should behave), here's a simplification:
 The structure of orchestration file is split into 2 parts: `steps` (what **steps** to use) and `configs` (how those steps should behave), here's a simplification:
 ```yaml
 # orchestration.yaml content
 steps:
@@ -113,10 +111,12 @@ configurations:
  # ... configurations for the other steps here ...
 ```
 To see all available `steps` (which archivers, storages, databses, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
 All the `configurations` in the `orchestration.yaml` file (you can name it differently but need to pass it in the `--config FILENAME` argument) can be seen in the console by using the `--help` flag. They can also be overwritten, for example if you are using the `cli_feeder` to archive from the command line and want to provide the URLs you should do:
 ```bash
-auto-archiver --config orchestration.yaml --cli_feeder.urls="url1,url2,url3"
+auto-archiver --config secrets/orchestration.yaml --cli_feeder.urls="url1,url2,url3"
 ```
 Here's the complete workflow that the auto-archiver goes through:
@@ -193,7 +193,7 @@ Use `python -m src.auto_archiver --config secrets/orchestration.yaml` to run fro
 #### Docker development
 working with docker locally:
  * `docker build . -t auto-archiver` to build a local image
-  * `docker run --rm -v $PWD/secrets:/app/secrets aa --config secrets/config.yaml`
+  * `docker run --rm -v $PWD/secrets:/app/secrets aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml`
    * to use local archive, also create a volume `-v` for it by adding `-v $PWD/local_archive:/app/local_archive`
--- a/example.orchestration.yaml
+++ b/example.orchestration.yaml
@@ -45,11 +45,9 @@ configurations:
      archive: archive location
      date: archive date
      thumbnail: thumbnail
      thumbnail_index: thumbnail index
      timestamp: upload timestamp
      title: upload title
      text: textual content
      duration: duration
      screenshot: screenshot
      hash: hash
      wacz: wacz
--- a/scripts/create_update_gdrive_oauth_token.py
+++ b/scripts/create_update_gdrive_oauth_token.py
@@ -10,6 +10,7 @@ from googleapiclient.errors import HttpError
 # You can run this code to get a new token and verify it belongs to the correct user
 # This token will be refresh automatically by the auto-archiver
 # Code below from https://developers.google.com/drive/api/quickstart/python
 # Example invocation: py scripts/create_update_gdrive_oauth_token.py -c secrets/credentials.json -t secrets/gd-token.json
 SCOPES = ['https://www.googleapis.com/auth/drive']
--- a/src/auto_archiver/archivers/archiver.py
+++ b/src/auto_archiver/archivers/archiver.py
@@ -3,8 +3,8 @@ from abc import abstractmethod
 from dataclasses import dataclass
 import os
 import mimetypes, requests
-from ..core import Metadata
+
-from ..core import Step
+from ..core import Metadata, Step, ArchivingContext
@dataclass
@@ -51,7 +51,7 @@ class Archiver(Step):
            if len(to_filename) > 64:
                to_filename = to_filename[-64:]
        if item:
-            to_filename = os.path.join(item.get_tmp_dir(), to_filename)
+            to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }
--- a/src/auto_archiver/archivers/instagram_tbot_archiver.py
+++ b/src/auto_archiver/archivers/instagram_tbot_archiver.py
@@ -4,7 +4,7 @@ from loguru import logger
 import time, os
 from sqlite3 import OperationalError
 from . import Archiver
-from ..core import Metadata, Media
+from ..core import Metadata, Media, ArchivingContext
 class InstagramTbotArchiver(Archiver):
@@ -31,7 +31,7 @@ class InstagramTbotArchiver(Archiver):
            "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
            "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
            "session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
-            "timeout": {"default": 15, "help": "timeout to fetch the instagram content in seconds."},
+            "timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
        }
    def setup(self) -> None:
@@ -44,7 +44,7 @@ class InstagramTbotArchiver(Archiver):
        if not "instagram.com" in url: return False
        result = Metadata()
-        tmp_dir = item.get_tmp_dir()
+        tmp_dir = ArchivingContext.get_tmp_dir()
        with self.client.start():
            chat = self.client.get_entity("instagram_load_bot")
            since_id = self.client.send_message(entity=chat, message=url).id
@@ -52,9 +52,9 @@ class InstagramTbotArchiver(Archiver):
            attempts = 0
            seen_media = []
            message = ""
-            time.sleep(4)
+            time.sleep(3)
            # media is added before text by the bot so it can be used as a stop-logic mechanism
-            while attempts < self.timeout and (not message or not len(seen_media)):
+            while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
                attempts += 1
                time.sleep(1)
                for post in self.client.iter_messages(chat, min_id=since_id):
--- a/src/auto_archiver/archivers/telethon_archiver.py
+++ b/src/auto_archiver/archivers/telethon_archiver.py
@@ -8,7 +8,7 @@ from tqdm import tqdm
 import re, time, json, os
 from . import Archiver
-from ..core import Metadata, Media
+from ..core import Metadata, Media, ArchivingContext
 class TelethonArchiver(Archiver):
@@ -128,7 +128,7 @@ class TelethonArchiver(Archiver):
            media_posts = self._get_media_posts_in_group(chat, post)
            logger.debug(f'got {len(media_posts)=} for {url=}')
-            tmp_dir = item.get_tmp_dir()
+            tmp_dir = ArchivingContext.get_tmp_dir()
            group_id = post.grouped_id if post.grouped_id is not None else post.id
            title = post.message
--- a/src/auto_archiver/archivers/tiktok_archiver.py
+++ b/src/auto_archiver/archivers/tiktok_archiver.py
@@ -3,7 +3,7 @@ import tiktok_downloader
 from loguru import logger
 from . import Archiver
-from ..core import Metadata, Media
+from ..core import Metadata, Media, ArchivingContext
 class TiktokArchiver(Archiver):
@@ -41,7 +41,7 @@ class TiktokArchiver(Archiver):
            logger.warning(f'Other Tiktok error {error}')
        try:
-            filename = os.path.join(item.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4')
+            filename = os.path.join(ArchivingContext.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4')
            tiktok_media = tiktok_downloader.snaptik(url).get_media()
            if len(tiktok_media) <= 0:
--- a/src/auto_archiver/archivers/vk_archiver.py
+++ b/src/auto_archiver/archivers/vk_archiver.py
@@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper
 from ..utils.misc import dump_payload
 from . import Archiver
-from ..core import Metadata, Media
+from ..core import Metadata, Media, ArchivingContext
 class VkArchiver(Archiver):
@@ -50,7 +50,7 @@ class VkArchiver(Archiver):
        result.set_content(dump_payload(vk_scrapes))
-        filenames = self.vks.download_media(vk_scrapes, item.get_tmp_dir())
+        filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
        for filename in filenames:
            result.add_media(Media(filename))
--- a/src/auto_archiver/archivers/youtubedl_archiver.py
+++ b/src/auto_archiver/archivers/youtubedl_archiver.py
@@ -2,7 +2,7 @@ import datetime, os, yt_dlp
 from loguru import logger
 from . import Archiver
-from ..core import Metadata, Media
+from ..core import Metadata, Media, ArchivingContext
 class YoutubeDLArchiver(Archiver):
@@ -25,7 +25,7 @@ class YoutubeDLArchiver(Archiver):
            logger.debug('Using Facebook cookie')
            yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
-        ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(item.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False})
+        ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False})
        try:
            # don'd download since it can be a live stream
--- a/src/auto_archiver/auto_auto_archive.py
+++ b/src/auto_archiver/auto_auto_archive.py
@@ -1,34 +0,0 @@
 #TODO: refactor GDriveStorage before merging to main
 # is it possible to have something like this with the new pipeline?
 # # import tempfile
 # import auto_archive
 # from loguru import logger
 # from configs import Config
 # from storages import Storage
 # def main():
 #     c = Config()
 #     c.parse()
 #     logger.info(f'Opening document {c.sheet} to look for sheet names to archive')
 #     gc = c.gsheets_client
 #     sh = gc.open(c.sheet)
 #     wks = sh.get_worksheet(0)
 #     values = wks.get_all_values()
 #     with tempfile.TemporaryDirectory(dir="./") as tmpdir:
 #         Storage.TMP_FOLDER = tmpdir
 #         for i in range(11, len(values)):
 #             c.sheet = values[i][0]
 #             logger.info(f"Processing {c.sheet}")
 #             auto_archive.process_sheet(c)
 #         c.destroy_webdriver()
 # if __name__ == "__main__":
 #     main()
--- a/src/auto_archiver/core/init.py
+++ b/src/auto_archiver/core/init.py
@@ -1,6 +1,7 @@
 from .media import Media
 from .metadata import Metadata
 from .media import Media
 from .step import Step
 from .context import ArchivingContext
 # cannot import ArchivingOrchestrator/Config to avoid circular dep
 # from .orchestrator import ArchivingOrchestrator
--- a/src/auto_archiver/core/context.py
+++ b/src/auto_archiver/core/context.py
@@ -0,0 +1,52 @@
 from loguru import logger
 class ArchivingContext:
    """
    Singleton context class.
    ArchivingContext._get_instance() to retrieve it if needed
    otherwise just 
    ArchivingContext.set(key, value)
    and 
    ArchivingContext.get(key, default)
    When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True)
        reset(full_reset=True) will recreate everything including the keep_on_reset status
    """
    _instance = None
    def __init__(self):
        self.configs = {}
        self.keep_on_reset = set()
    @staticmethod
    def get_instance():
        if ArchivingContext._instance is None:
            ArchivingContext._instance = ArchivingContext()
        return ArchivingContext._instance
    @staticmethod
    def set(key, value, keep_on_reset: bool = False):
        ac = ArchivingContext.get_instance()
        ac.configs[key] = value
        if keep_on_reset: ac.keep_on_reset.add(key)
    @staticmethod
    def get(key: str, default=None):
        return ArchivingContext.get_instance().configs.get(key, default)
    @staticmethod
    def reset(full_reset: bool = False):
        ac = ArchivingContext.get_instance()
        if full_reset: ac.keep_on_reset = set()
        ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
    # ---- custom getters/setters for widely used context values
    @staticmethod
    def set_tmp_dir(tmp_dir: str):
        ArchivingContext.get_instance().configs["tmp_dir"] = tmp_dir
    @staticmethod
    def get_tmp_dir() -> str:
        return ArchivingContext.get_instance().configs.get("tmp_dir")
--- a/src/auto_archiver/core/media.py
+++ b/src/auto_archiver/core/media.py
@@ -3,18 +3,46 @@ from __future__ import annotations
 from ast import List
 from typing import Any
 from dataclasses import dataclass, field
-from dataclasses_json import dataclass_json
+from dataclasses_json import dataclass_json, config
 import mimetypes
-# annotation order matters
+from .context import ArchivingContext
-@dataclass_json
+
 from loguru import logger
@dataclass_json  # annotation order matters
@dataclass
 class Media:
    filename: str
    key: str = None
    urls: List[str] = field(default_factory=list)
    _mimetype: str = None  # eg: image/jpeg
    properties: dict = field(default_factory=dict)
    _mimetype: str = None  # eg: image/jpeg
    _stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True))  # always exclude
    def store(self: Media, override_storages: List = None, url: str = "url-not-available"):
        # stores the media into the provided/available storages [Storage]
        # repeats the process for its properties, in case they have inner media themselves
        # for now it only goes down 1 level but it's easy to make it recursive if needed
        storages = override_storages or ArchivingContext.get("storages")
        if not len(storages):
            logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
            return
        for s in storages:
            s.store(self, url)
            # Media can be inside media properties, examples include transformations on original media
            for prop in self.properties.values():
                if isinstance(prop, Media):
                    s.store(prop, url)
                if isinstance(prop, list):
                    for prop_media in prop:
                        if isinstance(prop_media, Media):
                            s.store(prop_media, url)
    def is_stored(self) -> bool:
        return len(self.urls) > 0 and len(self.urls) == len(ArchivingContext.get("storages"))
    def set(self, key: str, value: Any) -> Media:
        self.properties[key] = value
@@ -40,3 +68,6 @@ class Media:
    def is_video(self) -> bool:
        return self.mimetype.startswith("video")
    def is_audio(self) -> bool:
        return self.mimetype.startswith("audio")
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@@ -3,24 +3,25 @@ from __future__ import annotations
 from ast import List, Set
 from typing import Any, Union, Dict
 from dataclasses import dataclass, field
-from dataclasses_json import dataclass_json
+from dataclasses_json import dataclass_json, config
 import datetime
 from urllib.parse import urlparse
 from dateutil.parser import parse as parse_dt
 from .media import Media
 from .context import ArchivingContext
-# annotation order matters
+@dataclass_json  # annotation order matters
@dataclass_json
@dataclass
 class Metadata:
    status: str = "no archiver"
    _processed_at: datetime = field(default_factory=datetime.datetime.utcnow)
    metadata: Dict[str, Any] = field(default_factory=dict)
    tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True})  # keys that are not to be saved in DBs
    media: List[Media] = field(default_factory=list)
    rearchivable: bool = True  # defaults to true, archivers can overwrite
    def __post_init__(self):
        self.set("_processed_at", datetime.datetime.utcnow())
    def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
        """
        merges two Metadata instances, will overwrite according to overwrite_left flag
@@ -30,7 +31,6 @@ class Metadata:
            if right.status and len(right.status):
                self.status = right.status
            self.rearchivable |= right.rearchivable
            self.tmp_keys |= right.tmp_keys
            for k, v in right.metadata.items():
                assert k not in self.metadata or type(v) == type(self.get(k))
                if type(v) not in [dict, list, set] or k not in self.metadata:
@@ -43,10 +43,14 @@ class Metadata:
            return right.merge(self)
        return self
-    def set(self, key: str, val: Any, is_tmp=False) -> Metadata:
+    def store(self: Metadata, override_storages: List = None):
-        # if not self.metadata: self.metadata = {}
+        # calls .store for all contained media. storages [Storage]
        storages = override_storages or ArchivingContext.get("storages")
        for media in self.media:
            media.store(override_storages=storages, url=self.get_url())
    def set(self, key: str, val: Any) -> Metadata:
        self.metadata[key] = val
        if is_tmp: self.tmp_keys.add(key)
        return self
    def get(self, key: str, default: Any = None, create_if_missing=False) -> Union[Metadata, str]:
@@ -64,7 +68,7 @@ class Metadata:
        return "success" in self.status
    def is_empty(self) -> bool:
-        return not self.is_success() and len(self.media) == 0 and len(self.get_clean_metadata()) <= 2  # url, processed_at
+        return not self.is_success() and len(self.media) == 0 and len(self.metadata) <= 2  # url, processed_at
    @property  # getter .netloc
    def netloc(self) -> str:
@@ -85,7 +89,8 @@ class Metadata:
    def set_content(self, content: str) -> Metadata:
        # a dump with all the relevant content
-        return self.set("content", content)
+        append_content = (self.get("content", "") + content + "\n").strip()
        return self.set("content", append_content)
    def set_title(self, title: str) -> Metadata:
        return self.set("title", title)
@@ -93,12 +98,6 @@ class Metadata:
    def get_title(self) -> str:
        return self.get("title")
    def set_tmp_dir(self, tmp_dir: str) -> Metadata:
        return self.set("tmp_dir", tmp_dir, True)
    def get_tmp_dir(self) -> str:
        return self.get("tmp_dir")
    def set_timestamp(self, timestamp: datetime.datetime) -> Metadata:
        if type(timestamp) == str:
            timestamp = parse_dt(timestamp)
@@ -139,8 +138,5 @@ class Metadata:
        _default = self.media[0] if len(self.media) else None
        return self.get_media_by_id("_final_media", _default)
-    def get_clean_metadata(self) -> Metadata:
+    def __str__(self) -> str:
-        return dict(
+        return self.__repr__()
            {k: v for k, v in self.metadata.items() if k not in self.tmp_keys},
            **{"processed_at": self._processed_at}
        )
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -2,6 +2,8 @@ from __future__ import annotations
 from ast import List
 from typing import Union
 from .context import ArchivingContext
 from ..archivers import Archiver
 from ..feeders import Feeder
 from ..formatters import Formatter
@@ -23,6 +25,7 @@ class ArchivingOrchestrator:
        self.archivers: List[Archiver] = config.archivers
        self.databases: List[Database] = config.databases
        self.storages: List[Storage] = config.storages
        ArchivingContext.set("storages", self.storages, keep_on_reset=True)
        for a in self.archivers: a.setup()
@@ -32,8 +35,9 @@ class ArchivingOrchestrator:
    def feed_item(self, item: Metadata) -> Metadata:
        try:
            ArchivingContext.reset()
            with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
-                item.set_tmp_dir(tmp_dir)
+                ArchivingContext.set_tmp_dir(tmp_dir)
                return self.archive(item)
        except KeyboardInterrupt:
            # catches keyboard interruptions to do a clean exit
@@ -89,7 +93,7 @@ class ArchivingOrchestrator:
                # Q: should this be refactored so it's just a.download(result)?
                result.merge(a.download(result))
                if result.is_success(): break
-            except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}")
+            except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}: {traceback.format_exc()}")
        # what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
        # should it call the HTMLgenerator as if it's not an enrichment?
@@ -101,26 +105,16 @@ class ArchivingOrchestrator:
        # eg: screenshot, wacz, webarchive, thumbnails
        for e in self.enrichers:
            try: e.enrich(result)
-            except Exception as exc: logger.error(f"Unexpected error with enricher {e.name}: {exc}")
+            except Exception as exc: logger.error(f"Unexpected error with enricher {e.name}: {exc}: {traceback.format_exc()}")
        # 5 - store media
        # looks for Media in result.media and also result.media[x].properties (as list or dict values)
-        for s in self.storages:
+        result.store()
            for m in result.media:
                s.store(m, result)  # modifies media
                # Media can be inside media properties, examples include transformations on original media
                for prop in m.properties.values():
                    if isinstance(prop, Media):
                        s.store(prop, result)
                    if isinstance(prop, list) and len(prop) > 0 and isinstance(prop[0], Media):
                        for prop_media in prop:
                            s.store(prop_media, result)
        # 6 - format and store formatted if needed
        # enrichers typically need access to already stored URLs etc
        if (final_media := self.formatter.format(result)):
-            for s in self.storages:
+            final_media.store(url=url)
                s.store(final_media, result)
            result.set_final_media(final_media)
        if result.is_empty():
--- a/src/auto_archiver/databases/gsheet_db.py
+++ b/src/auto_archiver/databases/gsheet_db.py
@@ -5,8 +5,7 @@ from urllib.parse import quote
 from loguru import logger
 from . import Database
-from ..core import Metadata
+from ..core import Metadata, Media, ArchivingContext
 from ..core import Media
 from ..utils import GWorksheet
@@ -63,8 +62,9 @@ class GsheetsDb(Database):
            batch_if_valid('archive', "\n".join(media.urls))
        batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
        batch_if_valid('title', item.get_title())
-        batch_if_valid('text', item.get("content", "")[:500])
+        batch_if_valid('text', item.get("content", ""))
        batch_if_valid('timestamp', item.get_timestamp())
        batch_if_valid('hash', media.get("hash", "not-calculated"))
        if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
            batch_if_valid('screenshot', "\n".join(screenshot.urls))
@@ -86,7 +86,7 @@ class GsheetsDb(Database):
            logger.debug(f"Unable to update sheet: {e}")
    def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
-        # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now
+        # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
-        gw: GWorksheet = item.get("gsheet").get("worksheet")
+        gw: GWorksheet = ArchivingContext.get("gsheet").get("worksheet")
-        row: int = item.get("gsheet").get("row")
+        row: int = ArchivingContext.get("gsheet").get("row")
        return gw, row
--- a/src/auto_archiver/enrichers/init.py
+++ b/src/auto_archiver/enrichers/init.py
@@ -4,3 +4,4 @@ from .wayback_enricher import WaybackArchiverEnricher
 from .hash_enricher import HashEnricher
 from .thumbnail_enricher import ThumbnailEnricher
 from .wacz_enricher import WaczEnricher
 from .whisper_enricher import WhisperEnricher
--- a/src/auto_archiver/enrichers/hash_enricher.py
+++ b/src/auto_archiver/enrichers/hash_enricher.py
@@ -2,7 +2,7 @@ import hashlib
 from loguru import logger
 from . import Enricher
-from ..core import Metadata
+from ..core import Metadata, ArchivingContext
 class HashEnricher(Enricher):
@@ -17,6 +17,7 @@ class HashEnricher(Enricher):
        algo_choices = self.configs()["algorithm"]["choices"]
        assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
        self.chunksize = int(self.chunksize)
        ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)
    @staticmethod
    def configs() -> dict:
--- a/src/auto_archiver/enrichers/screenshot_enricher.py
+++ b/src/auto_archiver/enrichers/screenshot_enricher.py
@@ -4,7 +4,7 @@ from selenium.common.exceptions import TimeoutException
 from . import Enricher
 from ..utils import Webdriver, UrlUtil
-from ..core import Media, Metadata
+from ..core import Media, Metadata, ArchivingContext
 class ScreenshotEnricher(Enricher):
    name = "screenshot_enricher"
@@ -29,7 +29,7 @@ class ScreenshotEnricher(Enricher):
            try:
                driver.get(url)
                time.sleep(int(self.sleep_before_screenshot))
-                screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
+                screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
                driver.save_screenshot(screenshot_file)
                to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
            except TimeoutException:
--- a/src/auto_archiver/enrichers/thumbnail_enricher.py
+++ b/src/auto_archiver/enrichers/thumbnail_enricher.py
@@ -2,7 +2,7 @@ import ffmpeg, os, uuid
 from loguru import logger
 from . import Enricher
-from ..core import Media, Metadata
+from ..core import Media, Metadata, ArchivingContext
 class ThumbnailEnricher(Enricher):
@@ -23,7 +23,7 @@ class ThumbnailEnricher(Enricher):
        logger.debug(f"generating thumbnails")
        for i, m in enumerate(to_enrich.media[::]):
            if m.is_video():
-                folder = os.path.join(to_enrich.get_tmp_dir(), str(uuid.uuid4()))
+                folder = os.path.join(ArchivingContext.get_tmp_dir(), str(uuid.uuid4()))
                os.makedirs(folder, exist_ok=True)
                logger.debug(f"generating thumbnails for {m.filename}")
                fps, duration = 0.5, m.get("duration")
--- a/src/auto_archiver/enrichers/wacz_enricher.py
+++ b/src/auto_archiver/enrichers/wacz_enricher.py
@@ -1,7 +1,7 @@
 import os, shutil, subprocess, uuid
 from loguru import logger
-from ..core import Media, Metadata
+from ..core import Media, Metadata, ArchivingContext
 from . import Enricher
 from ..utils import UrlUtil
@@ -26,36 +26,58 @@ class WaczEnricher(Enricher):
    def enrich(self, to_enrich: Metadata) -> bool:
        # TODO: figure out support for browsertrix in docker
        url = to_enrich.get_url()
        if UrlUtil.is_auth_wall(url):
            logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
            return
        logger.debug(f"generating WACZ for {url=}")
        collection = str(uuid.uuid4())[0:8]
-        browsertrix_home = os.path.abspath(to_enrich.get_tmp_dir())
+        browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
-        cmd = [
+        
-            "docker", "run",
+        if os.getenv('RUNNING_IN_DOCKER'):
-            "--rm",  # delete container once it has completed running
+            logger.debug(f"generating WACZ without Docker for {url=}")
-            "-v", f"{browsertrix_home}:/crawls/",
+
-            # "-it", # this leads to "the input device is not a TTY"
+            cmd = [
-            "webrecorder/browsertrix-crawler", "crawl",
+                "crawl",
-            "--url", url,
+                "--url", url,
-            "--scopeType", "page",
+                "--scopeType", "page",
-            "--generateWACZ",
+                "--generateWACZ",
-            "--text",
+                "--text",
-            "--collection", collection,
+                "--collection", collection,
-            "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
+                "--id", collection,
-            "--behaviorTimeout", str(self.timeout),
+                "--saveState", "never",
-            "--timeout", str(self.timeout)
+                "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
-        ]
+                "--behaviorTimeout", str(self.timeout),
-        if self.profile:
+                "--timeout", str(self.timeout),
-            profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
+                "--profile", str(self.profile)
-            shutil.copyfile(self.profile, profile_fn)
+            ]
-            # TODO: test which is right
+        else:
-            cmd.extend(["--profile", profile_fn])
+            logger.debug(f"generating WACZ in Docker for {url=}")
-            # cmd.extend(["--profile", "/crawls/profile.tar.gz"])
+            
            cmd = [
                "docker", "run",
                "--rm",  # delete container once it has completed running
                "-v", f"{browsertrix_home}:/crawls/",
                # "-it", # this leads to "the input device is not a TTY"
                "webrecorder/browsertrix-crawler", "crawl",
                "--url", url,
                "--scopeType", "page",
                "--generateWACZ",
                "--text",
                "--collection", collection,
                "--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
                "--behaviorTimeout", str(self.timeout),
                "--timeout", str(self.timeout)
            ]
            if self.profile:
                profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
                shutil.copyfile(self.profile, profile_fn)
                # TODO: test which is right
                cmd.extend(["--profile", profile_fn])
                # cmd.extend(["--profile", "/crawls/profile.tar.gz"])
        try:
            logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
@@ -64,7 +86,13 @@ class WaczEnricher(Enricher):
            logger.error(f"WACZ generation failed: {e}")
            return False
-        filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
+        
        if os.getenv('RUNNING_IN_DOCKER'):
            filename = os.path.join("collections", collection, f"{collection}.wacz")
        else:
            filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
        if not os.path.exists(filename):
            logger.warning(f"Unable to locate and upload WACZ  {filename=}")
            return False
--- a/src/auto_archiver/enrichers/whisper_enricher.py
+++ b/src/auto_archiver/enrichers/whisper_enricher.py
@@ -0,0 +1,130 @@
 import traceback
 import requests, time
 from loguru import logger
 from . import Enricher
 from ..core import Metadata, Media, ArchivingContext
 from ..storages import S3Storage
 class WhisperEnricher(Enricher):
    """
    Connects with a Whisper API service to get texts out of audio
    whisper API repository: TODO
    Only works if an S3 compatible storage is used
    """
    name = "whisper_enricher"
    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
        assert type(self.api_key) == str and len(self.api_key) > 0, "please provide a value for the whisper_enricher api_key"
        self.timeout = int(self.timeout)
    @staticmethod
    def configs() -> dict:
        return {
            "api_endpoint": {"default": "https://whisper.spoettel.dev/api/v1", "help": "WhisperApi api endpoint"},
            "api_key": {"default": None, "help": "WhisperApi api key for authentication"},
            "include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
            "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
            "action": {"default": "translation", "help": "which Whisper operation to execute", "choices": ["transcript", "translation", "language_detection"]},
        }
    def enrich(self, to_enrich: Metadata) -> None:
        if not self._get_s3_storage():
            logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
            return
        url = to_enrich.get_url()
        logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")
        job_results = {}
        for i, m in enumerate(to_enrich.media):
            if m.is_video() or m.is_audio():
                m.store(url=url)
                try:
                    job_id = self.submit_job(m)
                    job_results[job_id] = False
                    logger.debug(f"JOB SUBMITTED: {job_id=} for {m.key=}")
                    to_enrich.media[i].set("whisper_model", {"job_id": job_id})
                except Exception as e:
                    logger.error(f"Failed to submit whisper job for {m.filename=} with error {e}\n{traceback.format_exc()}")
        job_results = self.check_jobs(job_results)
        for i, m in enumerate(to_enrich.media):
            if m.is_video() or m.is_audio():
                job_id = to_enrich.media[i].get("whisper_model")["job_id"]
                to_enrich.media[i].set("whisper_model", {
                    "job_id": job_id,
                    **(job_results[job_id] if job_results[job_id] else {"result": "incomplete or failed job"})
                })
                # append the extracted text to the content of the post so it gets written to the DBs like gsheets text column
                if job_results[job_id]:
                    for k,v in job_results[job_id].items():
                        if "_text" in k and len(v):
                            to_enrich.set_content(f"\n[automatic video transcript]: {v}")
    def submit_job(self, media: Media):
        s3 = self._get_s3_storage()
        s3_url = s3.get_cdn_url(media)
        assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls "
        payload = {
            "url": s3_url,
            "type": self.action,
            # "language": "string" # may be a config
        }
        response = requests.post(f'{self.api_endpoint}/jobs', json=payload, headers={'Authorization': f'Bearer {self.api_key}'})
        assert response.status_code == 201, f"calling the whisper api {self.api_endpoint} returned a non-success code: {response.status_code}"
        logger.debug(response.json())
        return response.json()['id']
    def check_jobs(self, job_results: dict):
        start_time = time.time()
        all_completed = False
        while not all_completed and (time.time() - start_time) <= self.timeout:
            all_completed = True
            for job_id in job_results:
                if job_results[job_id] != False: continue
                all_completed = False  # at least one not ready
                try: job_results[job_id] = self.check_job(job_id)
                except Exception as e:
                    logger.error(f"Failed to check {job_id=} with error {e}\n{traceback.format_exc()}")
            if not all_completed: time.sleep(3)
        return job_results
    def check_job(self, job_id):
        r = requests.get(f'{self.api_endpoint}/jobs/{job_id}', headers={'Authorization': f'Bearer {self.api_key}'})
        assert r.status_code == 200, f"Job status did not respond with 200, instead with: {r.status_code}"
        j = r.json()
        logger.debug(f"Checked job {job_id=} with status='{j['status']}'")
        if j['status'] == "processing": return False
        elif j['status'] == "error": return f"Error: {j['meta']['error']}"
        elif j['status'] == "success":
            r_res = requests.get(f'{self.api_endpoint}/jobs/{job_id}/artifacts', headers={'Authorization': f'Bearer {self.api_key}'})
            assert r_res.status_code == 200, f"Job artifacts did not respond with 200, instead with: {r_res.status_code}"
            logger.success(r_res.json())
            result = {}
            for art_id, artifact in enumerate(r_res.json()):
                subtitle = []
                full_text = []
                for i, d in enumerate(artifact.get("data")):
                    subtitle.append(f"{i+1}\n{d.get('start')} --> {d.get('end')}\n{d.get('text').strip()}")
                    full_text.append(d.get('text').strip())
                if not len(subtitle): continue
                if self.include_srt: result[f"artifact_{art_id}_subtitle"] = "\n".join(subtitle)
                result[f"artifact_{art_id}_text"] = "\n".join(full_text)
            # call /delete endpoint on timely success
            r_del = requests.delete(f'{self.api_endpoint}/jobs/{job_id}', headers={'Authorization': f'Bearer {self.api_key}'})
            logger.debug(f"DELETE whisper {job_id=} result: {r_del.status_code}")
            return result
        return False
    def _get_s3_storage(self) -> S3Storage:
        try:
            return next(s for s in ArchivingContext.get("storages") if s.__class__ == S3Storage)
        except:
            logger.warning("No S3Storage instance found in storages")
            return
--- a/src/auto_archiver/feeders/cli_feeder.py
+++ b/src/auto_archiver/feeders/cli_feeder.py
@@ -1,7 +1,7 @@
 from loguru import logger
 from . import Feeder
-from ..core import Metadata
+from ..core import Metadata, ArchivingContext
 class CLIFeeder(Feeder):
@@ -26,5 +26,7 @@ class CLIFeeder(Feeder):
    def __iter__(self) -> Metadata:
        for url in self.urls:
            logger.debug(f"Processing {url}")
-            yield Metadata().set_url(url).set("folder", "cli", True)
+            yield Metadata().set_url(url)
            ArchivingContext.set("folder", "cli")
        logger.success(f"Processed {len(self.urls)} URL(s)")
--- a/src/auto_archiver/feeders/gsheet_feeder.py
+++ b/src/auto_archiver/feeders/gsheet_feeder.py
@@ -5,9 +5,10 @@ from slugify import slugify
 # from . import Enricher
 from . import Feeder
-from ..core import Metadata
+from ..core import Metadata, ArchivingContext
 from ..utils import Gsheets, GWorksheet
 class GsheetsFeeder(Gsheets, Feeder):
    name = "gsheet_feeder"
@@ -31,7 +32,7 @@ class GsheetsFeeder(Gsheets, Feeder):
                    "help": "(CSV) explicitly block some worksheets from being processed",
                    "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
                },
-                "use_sheet_names_in_stored_paths":{
+                "use_sheet_names_in_stored_paths": {
                    "default": True,
                    "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
                }
@@ -61,9 +62,15 @@ class GsheetsFeeder(Gsheets, Feeder):
                if status not in ['', None]: continue
                # All checks done - archival process starts here
-                m = Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)
+                m = Metadata().set_url(url)
-                if self.use_sheet_names_in_stored_paths:
+                ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
-                    m.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
+                folder = slugify(gw.get_cell(row, 'folder').strip())
                if len(folder):
                    if self.use_sheet_names_in_stored_paths:
                        ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True)
                    else:
                        ArchivingContext.set("folder", folder, True)
                yield m
            logger.success(f'Finished worksheet {wks.title}')
--- a/src/auto_archiver/formatters/html_formatter.py
+++ b/src/auto_archiver/formatters/html_formatter.py
@@ -6,8 +6,9 @@ from urllib.parse import quote
 from loguru import logger
 from ..version import __version__
-from ..core import Metadata, Media
+from ..core import Metadata, Media, ArchivingContext
 from . import Formatter
 from ..enrichers import HashEnricher
@dataclass
@@ -40,17 +41,22 @@ class HtmlFormatter(Formatter):
            url=url,
            title=item.get_title(),
            media=item.media,
-            metadata=item.get_clean_metadata(),
+            metadata=item.metadata,
            version=__version__
        )
-        html_path = os.path.join(item.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
+        html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
        with open(html_path, mode="w", encoding="utf-8") as outf:
            outf.write(content)
-        return Media(filename=html_path)
+        final_media = Media(filename=html_path)
        he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
        if len(hd := he.calculate_hash(final_media.filename)):
            final_media.set("hash", f"{he.algorithm}:{hd}")
        return final_media
 # JINJA helper filters
 class JinjaHelpers:
    @staticmethod
    def is_list(v) -> bool:
--- a/src/auto_archiver/formatters/templates/html_template.html
+++ b/src/auto_archiver/formatters/templates/html_template.html
@@ -29,7 +29,7 @@
            margin: auto;
            border: 1px solid;
            border-collapse: collapse;
-            vertical-align:top;
+            vertical-align: top;
        }
        table.metadata td:first-child {
@@ -42,7 +42,7 @@
        }
        .copy:hover {
-            font-weight: 600;
+            background: aliceblue;
            cursor: copy;
        }
@@ -185,7 +185,11 @@
        el.addEventListener("copy", (e) => {
            e.preventDefault();
            if (e.clipboardData) {
-                e.clipboardData.setData("text/plain", el.textContent);
+                if (el.hasAttribute("copy-value")) {
                    e.clipboardData.setData("text/plain", el.getAttribute("copy-value"));
                } else {
                    e.clipboardData.setData("text/plain", el.textContent);
                }
                console.log(e.clipboardData.getData("text"))
                showNotification("copied!")
            }
--- a/src/auto_archiver/formatters/templates/macros.html
+++ b/src/auto_archiver/formatters/templates/macros.html
@@ -46,14 +46,16 @@ No preview available for {{ m.key }}.
 {% endif %}
 {% if links %}
 <a href="{{ url }}">open</a> or
-<a href="{{ url }}" download="">download</a>
+<a href="{{ url }}" download="">download</a> or
 {{ copy_urlize(url, "copy") }}
 <br>
 {% endif %}
 {% endfor %}
 {%- endmacro -%}
-{% macro copy_urlize(val) -%}
+{% macro copy_urlize(val, href_text) -%}
 {% if val is mapping %}
 <ul>
@@ -65,7 +67,11 @@ No preview available for {{ m.key }}.
 </ul>
 {% else %}
 {% if href_text  | length == 0 %}
 <span class="copy">{{ val | string | urlize }}</span>
 {% else %}
 <span class="copy" copy-value="{{val}}">{{ href_text | string | urlize }}</span>
 {% endif %}
 {% endif %}
 {%- endmacro -%}
--- a/src/auto_archiver/storages/storage.py
+++ b/src/auto_archiver/storages/storage.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 from abc import abstractmethod
 from dataclasses import dataclass
-import hashlib
+from typing import IO
 from typing import IO, Any
-from ..core import Media, Metadata, Step
+from ..core import Media, Step, ArchivingContext
 from ..enrichers import HashEnricher
 from loguru import logger
 import os, uuid
@@ -42,8 +41,11 @@ class Storage(Step):
        # only for typing...
        return Step.init(name, config, Storage)
-    def store(self, media: Media, item: Metadata) -> None:
+    def store(self, media: Media, url: str) -> None:
-        self.set_key(media, item)
+        if media.is_stored(): 
            logger.debug(f"{media.key} already stored, skipping")
            return
        self.set_key(media, url)
        self.upload(media)
        media.add_url(self.get_cdn_url(media))
@@ -58,24 +60,24 @@ class Storage(Step):
        with open(media.filename, 'rb') as f:
            return self.uploadf(f, media, **kwargs)
-    def set_key(self, media: Media, item: Metadata) -> None:
+    def set_key(self, media: Media, url) -> None:
        """takes the media and optionally item info and generates a key"""
        if media.key is not None and len(media.key) > 0: return
-        folder = item.get("folder", "")
+        folder = ArchivingContext.get("folder", "")
        filename, ext = os.path.splitext(media.filename)
        # path_generator logic
        if self.path_generator == "flat":
            path = ""
            filename = slugify(filename)  # in case it comes with os.sep
-        elif self.path_generator == "url": path = slugify(item.get_url())
+        elif self.path_generator == "url": path = slugify(url)
        elif self.path_generator == "random":
-            path = item.get("random_path", str(uuid.uuid4())[:16], True)
+            path = ArchivingContext.get("random_path", str(uuid.uuid4())[:16], True)
        # filename_generator logic
        if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
        elif self.filename_generator == "static":
-            he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
+            he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
            hd = he.calculate_hash(media.filename)
            filename = hd[:24]
--- a/src/auto_archiver/utils/gsheet.py
+++ b/src/auto_archiver/utils/gsheet.py
@@ -30,11 +30,9 @@ class Gsheets(Step):
                    'archive': 'archive location',
                    'date': 'archive date',
                    'thumbnail': 'thumbnail',
                    'thumbnail_index': 'thumbnail index',
                    'timestamp': 'upload timestamp',
                    'title': 'upload title',
                    'text': 'text content',
                    'duration': 'duration',
                    'screenshot': 'screenshot',
                    'hash': 'hash',
                    'wacz': 'wacz',
--- a/src/auto_archiver/utils/gworksheet.py
+++ b/src/auto_archiver/utils/gworksheet.py
@@ -15,10 +15,8 @@ class GWorksheet:
        'archive': 'archive location',
        'date': 'archive date',
        'thumbnail': 'thumbnail',
        'thumbnail_index': 'thumbnail index',
        'timestamp': 'upload timestamp',
        'title': 'upload title',
        'duration': 'duration',
        'screenshot': 'screenshot',
        'hash': 'hash',
        'wacz': 'wacz',
@@ -98,7 +96,7 @@ class GWorksheet:
        cell_updates = [
            {
                'range': self.to_a1(row, col),
-                'values': [[val]]
+                'values': [[str(val)[0:49999]]]
            }
            for row, col, val in cell_updates
        ]
--- a/src/auto_archiver/version.py
+++ b/src/auto_archiver/version.py
@@ -1,9 +1,9 @@
 _MAJOR = "0"
-_MINOR = "4"
+_MINOR = "5"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "5"
+_PATCH = "12"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
Author	SHA1	Message	Date
msramalho	b7c69c0f0d	Bump version to v0.5.12 for release	2023-05-10 18:58:34 +01:00
msramalho	c98991cdfb	fix: vk-url-scraper version update	2023-05-10 18:57:45 +01:00
msramalho	45b982ec38	fix: max chars on sheets cell	2023-05-10 18:57:33 +01:00
msramalho	e11be449e8	fix: delete completed whisper tasks	2023-05-10 18:57:17 +01:00
Logan Williams	134bf09257	Fix typo	2023-05-10 16:05:06 +02:00
Logan Williams	417ca9ef51	Limit build platforms to those supported by webrecorder	2023-05-10 16:03:51 +02:00
Logan Williams	5b79dcb80c	Configure multi-platform docker builds	2023-05-10 16:01:23 +02:00
msramalho	52d7b4a016	Merge branch 'dockerize' of https://github.com/bellingcat/auto-archiver into dockerize	2023-05-10 13:29:45 +01:00
msramalho	31f6aae7b9	fix: screenshots in docker	2023-05-10 13:29:42 +01:00
Logan Williams	26373d4545	Re-order README slightly	2023-05-10 11:48:34 +02:00
Logan Williams	7a34915f8e	Remove old auto auto archiver file	2023-05-10 11:16:54 +02:00
Miguel Sozinho Ramalho	b67a7b818a	Merge pull request #75 from bellingcat/feature/browsertrix	2023-05-10 10:14:40 +01:00
Logan Williams	2e63cb8411	Update README with new entrypoint	2023-05-10 11:13:47 +02:00
Logan Williams	9cb73c073f	Simplify entrypoint	2023-05-10 11:08:49 +02:00
msramalho	9d078a648f	version bump	2023-05-10 09:57:47 +01:00
msramalho	e150370657	updates docker instructions	2023-05-10 09:51:53 +01:00
Miguel Sozinho Ramalho	4116c90168	Merge pull request #74 from bellingcat/feature/browsertrix	2023-05-10 09:36:41 +01:00
Logan Williams	2c5b115fbe	Fix lock file issue	2023-05-09 19:34:16 +02:00
Logan Williams	bda812f850	Clean up comments	2023-05-09 19:34:16 +02:00
Logan Williams	ac82764ffc	Working, but some cleanup still necessary	2023-05-09 19:34:16 +02:00
Logan Williams	0fae7d96fb	Detect running in docker container in WACZ enricher	2023-05-09 19:34:16 +02:00
Logan Williams	2f7181ced6	Use browsertrix base image	2023-05-09 19:34:16 +02:00
msramalho	9c25b33f1c	fix: multiple storages with folder column	2023-05-09 12:14:07 +01:00
msramalho	ae3e607705	fix: depreacating thumbnail_index	2023-05-09 11:29:05 +01:00
msramalho	c1a60fde8a	fix: deprecates duration column	2023-05-09 11:26:19 +01:00
msramalho	875e1de589	feat: re-enable HASH on gsheet	2023-05-09 11:17:44 +01:00
msramalho	8f3d4e05c3	fixing bug in whisper wnericher	2023-05-04 09:36:10 +01:00
msramalho	3bd6bed825	Bump version to v0.5.10 for release	2023-05-02 19:44:00 +01:00
msramalho	2659675f06	skip trim	2023-05-02 19:06:10 +01:00
msramalho	9d44f4b207	content append instead of replace	2023-05-02 19:06:00 +01:00
msramalho	5b0bff612e	whisper transcripts to content	2023-05-02 19:05:32 +01:00
msramalho	ae7ceba0e5	better debug	2023-05-02 19:05:18 +01:00
msramalho	97821a81bc	log cleanup	2023-05-02 19:05:06 +01:00
msramalho	9191b38cf2	tbot archiver works	2023-05-02 19:04:51 +01:00
msramalho	567edfc35e	Bump version to v0.5.8 for release	2023-05-02 14:30:49 +01:00
msramalho	8c22a9df72	fixes "url-not-found"	2023-05-02 14:30:07 +01:00
msramalho	d2d6db162b	Bump version to v0.5.7 for release	2023-04-18 19:28:51 +01:00
msramalho	5cfbcc0137	html template copy ux	2023-04-18 19:28:43 +01:00
msramalho	5fdaa6c739	whisper improvements	2023-04-18 19:28:36 +01:00
msramalho	3d389ee05b	add url info	2023-04-18 19:14:47 +01:00
msramalho	0ecbed0df0	Bump version to v0.5.6 for release	2023-04-18 18:49:08 +01:00
msramalho	69bcfea2eb	to_json fix	2023-04-18 18:48:51 +01:00
msramalho	2e2e695444	whisper enricher	2023-03-23 18:50:37 +00:00
msramalho	493055a8d9	cleanup	2023-03-23 18:50:30 +00:00
msramalho	6f6eb2db7a	Archiving Context refactor complete	2023-03-23 14:28:45 +00:00
msramalho	906ed0f6e0	creating global context and refactoring tmp_dir logic	2023-03-23 11:17:38 +00:00