v1.0.1 dependency updates, generic extractor improvements (#307 )

* wacz: allow exceptional cases where more than one resource image is available * improves generic extractor edge-cases and yt-dlp updates * REMOVES vk_extractor until further notice * bumps browsertrix in docker image * npm version bump on scripts/settings * poetry updates * Changed log level on gsheet_feeder_db started from warning to info (#301) * closes 305 and further fixes finding local downloads from uncommon ytdlp extractors * use ffmpeg -bitexact to reduce duplicate content storing * formatting * adds yt-dlp curl-cffi * version bump * linting --------- Co-authored-by: Dave Mateer <davemateer@gmail.com>
catch for if self.comments are true but no actual comments in video (#303 )
2026-06-08 19:38:29 +03:00 · 2025-06-02 20:57:12 +01:00 · 2025-06-02 13:02:19 +01:00 · 2025-04-28 11:16:01 +01:00 · 2025-04-07 21:15:18 +01:00 · 2025-03-31 16:19:29 +01:00
54 changed files with 2717 additions and 1447 deletions
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -0,0 +1,40 @@
+# To get started with Dependabot version updates, you'll need to specify which
+# package ecosystems to update and where the package manifests are located.
+# Please see the documentation for all configuration options:
+# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
+
+version: 2
+updates:
+  - package-ecosystem: "pip"
+    directory: "/"
+    groups:  # one grouped PR for every Python dependency
+      python:
+        patterns:
+          - "*"
+    schedule:
+      interval: "weekly"
+
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    groups:  # one grouped PR for every workflow action
+      actions:
+        patterns:
+          - "*"
+    schedule:
+      interval: "weekly"
+
+  - package-ecosystem: "npm"
+    directory: "/scripts/settings/"
+    groups:  # named after the ecosystem so the grouped PR is not labelled "actions"
+      npm:
+        patterns:
+          - "*"
+    schedule:
+      interval: "weekly"
+
+  - package-ecosystem: "docker"
+    # Look for a `Dockerfile` in the `root` directory
+    directory: "/"
+    # Check for updates once a week
+    schedule:
+      interval: "weekly"
--- a/.github/workflows/docker-publish.yaml
+++ b/.github/workflows/docker-publish.yaml
@@ -22,7 +22,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Check out the repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
@@ -33,14 +33,14 @@ jobs:
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
-        uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567
+        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
-        uses: docker/metadata-action@369eb591f429131d6889c46b94e711f089e6ca96
+        uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804
        with:
          images: bellingcat/auto-archiver
      
--- a/.github/workflows/ruff.yaml
+++ b/.github/workflows/ruff.yaml
@@ -3,8 +3,18 @@ name: Ruff Formatting & Linting
 on:
  push:
    branches: [ main ]
+    paths-ignore:  # directories need a trailing /** to match the files inside them
+      - "README.md"
+      - ".github/**"
+      - "poetry.lock"
+      - "scripts/settings/**"
  pull_request:
    branches: [ main ]
+    paths-ignore:  # directories need a trailing /** to match the files inside them
+      - "README.md"
+      - ".github/**"
+      - "poetry.lock"
+      - "scripts/settings/**"

 jobs:
  build:
--- a/.github/workflows/tests-core.yaml
+++ b/.github/workflows/tests-core.yaml
@@ -20,8 +20,7 @@ jobs:
      fail-fast: false
      matrix:
        python-version: ["3.10", "3.11", "3.12"]
-        os: [ubuntu-22.04]
-        #TODO: re-enable ubuntu-latest, this is disabled as oscrypto cannot be pinned to github commit and pushed to pypi
+        os: [ubuntu-22.04, ubuntu-latest]
    defaults:
      run:
        working-directory: ./
@@ -29,16 +28,23 @@ jobs:
    steps:
      - uses: actions/checkout@v4

-      - name: Install Poetry
-        run: pipx install poetry
-
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
-          cache: 'poetry'

-      - name: Install dependencies
+      - name: Install latest Poetry
+        run: pipx install poetry
+
+      - name: Cache Poetry and pip artifacts
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/pypoetry
+            ~/.cache/pip
+          key: poetry-${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }}
+
+      - name: Install dependencies from source only
        run: poetry install --no-interaction --with dev

      - name: Run Core Tests
--- a/.github/workflows/tests-download.yaml
+++ b/.github/workflows/tests-download.yaml
@@ -22,16 +22,23 @@ jobs:
    steps:
      - uses: actions/checkout@v4

-      - name: Install poetry
-        run: pipx install poetry
-
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
-          cache: 'poetry'

-      - name: Install dependencies
+      - name: Install latest Poetry
+        run: pipx install poetry
+
+      - name: Cache Poetry and pip artifacts
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/pypoetry
+            ~/.cache/pip
+          key: poetry-${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }}
+
+      - name: Install dependencies from source only
        run: poetry install --no-interaction --with dev

      - name: Run Download Tests
--- a/2
+++ b/2
@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:1.4.2 AS base
+FROM webrecorder/browsertrix-crawler:1.6.1 AS base

 ENV RUNNING_IN_DOCKER=1 \
    LANG=C.UTF-8 \
--- a/README.md
+++ b/README.md
@@ -1,12 +1,13 @@
 <h1 align="center">Auto Archiver</h1>

+[![Documentation Status](https://readthedocs.org/projects/auto-archiver/badge/?version=latest)](https://auto-archiver.readthedocs.io/en/latest/?badge=latest)
 [![PyPI version](https://badge.fury.io/py/auto-archiver.svg)](https://badge.fury.io/py/auto-archiver)
-[![Docker Image Version (latest by date)](https://img.shields.io/docker/v/bellingcat/auto-archiver?label=version&logo=docker)](https://hub.docker.com/r/bellingcat/auto-archiver)
+[![Docker Image Version (latest by date)](https://img.shields.io/docker/v/bellingcat/auto-archiver?sort=semver&logo=docker&color=69F0AE)](https://hub.docker.com/r/bellingcat/auto-archiver)
 [![Core Test Status](https://github.com/bellingcat/auto-archiver/workflows/Core%20Tests/badge.svg)](https://github.com/bellingcat/auto-archiver/actions/workflows/tests-core.yaml)
-[![Download Test Status](https://github.com/bellingcat/auto-archiver/workflows/Download%20Tests/badge.svg)](https://github.com/bellingcat/auto-archiver/actions/workflows/tests-download.yaml)
+<!-- [![Download Test Status](https://github.com/bellingcat/auto-archiver/workflows/Download%20Tests/badge.svg)](https://github.com/bellingcat/auto-archiver/actions/workflows/tests-download.yaml) -->
+
 <!-- ![Docker Pulls](https://img.shields.io/docker/pulls/bellingcat/auto-archiver) -->
 <!-- [![PyPI download month](https://img.shields.io/pypi/dm/auto-archiver.svg)](https://pypi.python.org/pypi/auto-archiver/) -->
-<!-- [![Documentation Status](https://readthedocs.org/projects/vk-url-scraper/badge/?version=latest)](https://vk-url-scraper.readthedocs.io/en/latest/?badge=latest) -->



--- a/docs/source/how_to/authentication_how_to.md
+++ b/docs/source/how_to/authentication_how_to.md
@@ -106,5 +106,117 @@ Finally,Some important things to remember:

 ## Authenticating on XXXX site with username/password

-```{note} This section is still under construction 🚧
+```{note} 
+This section is still under construction 🚧
 ```
+
+
+# Proof of Origin Tokens
+
+YouTube uses **Proof of Origin Tokens (POT)** as part of its bot detection system to verify that requests originate from valid clients. If a token is missing or invalid, some videos may return errors like "Sign in to confirm you're not a bot."
+
+yt-dlp provides [a detailed guide to POTs](https://github.com/yt-dlp/yt-dlp/wiki/PO-Token-Guide).
+
+### How Auto Archiver Uses POT
+This feature is enabled for the Generic Archiver via two yt-dlp plugins:
+
+- **Client-side plugin**: [yt-dlp-get-pot](https://github.com/coletdjnz/yt-dlp-get-pot)  
+  Detects when a token is required and requests one from a provider.
+
+- **Provider plugin**: [bgutil-ytdlp-pot-provider](https://github.com/Brainicism/bgutil-ytdlp-pot-provider)  
+  Includes both a Python plugin and a **Node.js server or script** to generate the token.
+
+These are installed in our Poetry environment.
+
+### Integration Methods
+
+**Docker (Recommended)**:
+
+When running the Auto Archiver using the Docker image, we use the [Node.js token generation script](https://github.com/Brainicism/bgutil-ytdlp-pot-provider/tree/master/server).
+This is to avoid managing a separate server process, and is handled automatically inside the Docker container when needed.
+
+This is already included in the Docker image, however if you need to disable this you can set the config option `bguils_po_token_method` under the `generic_extractor` section of your `orchestration.yaml` config file to "disabled".
+```yaml
+generic_extractor:
+  bguils_po_token_method: "disabled"
+```
+
+**PyPI / Local**:
+
+When using the Auto Archiver PyPI package, or running locally, you will need additional system requirements to run the token generation script, namely either Docker, or Node.js and Yarn.
+
+See the [bgutil-ytdlp-pot-provider](https://github.com/Brainicism/bgutil-ytdlp-pot-provider?tab=readme-ov-file#a-http-server-option) documentation for more details.
+
+⚠️WARNING⚠️: This will add the server scripts to the home directory of wherever this is running.
+
+- You can set the config option `bguils_po_token_method` under the `generic_extractor` section of your `orchestration.yaml` config file to "script" to enable the token generation script process locally.
+- Alternatively you can run the bgutil-ytdlp-pot-provider server separately using their Docker image or Node.js server.
+
+### Notes
+
+- The token generation script is only triggered when needed by yt-dlp, so it should have no effect unless YouTube requests a POT.
+- If you're running the Auto Archiver in Docker, this is set up automatically.
+- If you're running locally, you'll need to run the setup script manually or enable the feature in your config.
+- You can set up both the server and the script; if one of them is unavailable, the plugin will fall back to the other. This is recommended for robustness!
+
+### Configurations: 
+
+## Configurations Summary
+
+| Option     | Behavior                                                                                                                                   | Docker Default? |
+|------------| ------------------------------------------------------------------------------------------------------------------------------------------ | --------------- |
+| `auto`     | Docker: Automatically downloads and uses the token generation script. Local: Does nothing; assumes a separate server is running externally. | ✅ Yes           |
+| `script`   | Explicitly downloads and uses the token generation script, even locally.                                                                   | ❌ No            |
+| `disabled` | Disables token generation completely.                                                                                                      | ❌ No            |
+
+Example configuration:
+
+ 
+```yaml
+generic_extractor:
+  # ...  
+  bguils_po_token_method: "script"
+  # For debugging add the verbose flag here:
+  ytdlp_args: "--no-abort-on-error --abort-on-error --verbose"
+
+```
+
+**Advanced Configuration:**
+
+If you change the default port of the bgutil-ytdlp-pot-provider server, you can pass the updated values using our `extractor_args` option for the generic extractor.
+
+```yaml
+generic_extractor:
+  ytdlp_args: "--no-abort-on-error --abort-on-error --verbose"
+  ytdlp_update_interval: 5
+  bguils_po_token_method: "script"
+  extractor_args:
+    youtube:
+      getpot_bgutil_baseurl: "http://127.0.0.1:8080"
+      player_client: web,tv
+```
+For more details on this for bgutils see [here](https://github.com/Brainicism/bgutil-ytdlp-pot-provider?tab=readme-ov-file#usage)
+
+### Checking the logs
+
+To verify that the POT process is working, look for the following lines in your log after adding the config option:
+
+```shell
+[GetPOT] BgUtilScript: Generating POT via script: /Users/you/bgutil-ytdlp-pot-provider/server/build/generate_once.js
+[debug] [GetPOT] BgUtilScript: Executing command to get POT via script: /Users/you/.nvm/versions/node/v20.18.0/bin/node /Users/you/bgutil-ytdlp-pot-provider/server/build/generate_once.js -v ymCMy8OflKM
+[debug] [GetPOT] BgUtilScript: stdout:
+{"poToken":"MlMxojNFhEJvUzGeHEkVRSK_luXtwcDnwSNIOgaUutqB7t99nmlNvtWgYayboopG6ZopZgmQ-6PJCWEMHv89MIiFGGlJRY25Fkwzxmia_8uYgf5AWf==","generatedAt":"2025-03-26T10:45:26.156Z","visitIdentifier":"ymCMy8OflKM"}
+[debug] [GetPOT] Fetching gvs PO Token for tv client
+```
+
+If the script cannot be found, or the provider server cannot be reached, you'll see something like this:
+```shell
+[debug] [GetPOT] Fetching player PO Token for tv client
+WARNING: [GetPOT] BgUtilScript: Script path doesn't exist: /Users/you/bgutil-ytdlp-pot-provider/server/build/generate_once.js. Please make sure the script has been transpiled correctly.
+WARNING: [GetPOT] BgUtilHTTP: Error reaching GET http://127.0.0.1:4416/ping (caused by TransportError). Please make sure that the server is reachable at http://127.0.0.1:4416.
+[debug] [GetPOT] No player PO Token provider available for tv client
+```
+
+In this case check that the script has been transpiled correctly and is available at the path specified in the log, 
+or that the server is running and reachable.
+
--- a/docs/source/how_to/new_config_format.md
+++ b/docs/source/how_to/new_config_format.md
@@ -71,7 +71,6 @@ The names of the actual modules have also changed, so for any extractor modules
 - `telethon_archiver` → `telethon_extractor`
 - `wacz_archiver_enricher` → `wacz_extractor_enricher`
 - `wayback_archiver_enricher` → `wayback_extractor_enricher`
- `vk_archiver` → `vk_extractor`


 #### c) Module Renaming
--- a/docs/source/how_to/run_instagrapi_server.md
+++ b/docs/source/how_to/run_instagrapi_server.md
@@ -0,0 +1,169 @@
+# InstagrAPI Server
+
+The Instagram API Extractor requires access to a running instance of the InstagrAPI server.
+We have a lightweight script with the endpoints required for our Instagram API Extractor module which you can run locally, or via Docker.
+
+
+
+⚠️ Warning: Remember that it's best not to use your own personal account for archiving. [Here's why](../installation/authentication.md#recommendations-for-authentication).
+## Quick Start: Using Docker
+
+We've provided a convenient shell script (`run_instagrapi_server.sh`) that simplifies the process of setting up and running the Instagrapi server in Docker. This script handles building the Docker image, setting up credentials, and starting the container.
+
+### 🔧 Running the script:
+
+Run this script either from the repository root or from within the `scripts/instagrapi_server` directory:
+
+```bash
+./scripts/instagrapi_server/run_instagrapi_server.sh
+```
+
+This script will:
+- Prompt for your Instagram username and password.
+- Create the necessary `.env` file.
+- Build the Docker image.
+- Start the Docker container and authenticate with Instagram, creating a session automatically.
+
+### ⏱ To run the server again later:
+```bash
+docker start ig-instasrv
+```
+
+### 🐛 Debugging:
+View logs:
+```bash
+docker logs ig-instasrv
+```
+
+
+### Overview: How the Setup Works
+
+1. You enter your Instagram credentials in a local `.env` file
+2. You run the server **once locally** to generate a session file
+3. After that, you can choose to run the server again locally or inside Docker without needing to log in again
+
+---
+
+## Optional: Manual / Local Setup
+
+If you'd prefer to run the server manually (without Docker), you can follow these steps:
+
+
+1. **Navigate to the server folder (and stay there for the rest of this guide)**:
+   ```bash
+   cd scripts/instagrapi_server
+   ```
+
+2. **Create a `secrets/` folder** (if it doesn't already exist in `scripts/instagrapi_server`):
+   ```bash
+   mkdir -p secrets
+   ```
+
+3. **Create a `.env` file** inside `secrets/` with your Instagram credentials:
+   ```dotenv
+   INSTAGRAM_USERNAME="your_username"
+   INSTAGRAM_PASSWORD="your_password"
+   ```
+
+4. **Install dependencies** using the pyproject.toml file:
+  
+   ```bash
+   poetry install --no-root
+   ```
+
+5. **Run the server locally**:
+   ```bash
+   poetry run uvicorn src.instaserver:app --port 8000
+   ```
+
+6. **Watch for the message**:
+   ```
+   Login successful, session saved.
+   ```
+
+✅ Your session is now saved to `secrets/instagrapi_session.json`.
+
+### To run it again locally:
+```bash
+poetry run uvicorn src.instaserver:app --port 8000
+```
+
+---
+
+## Adding the API Endpoint to Auto Archiver
+
+The server should now be running within that session, and accessible at  http://127.0.0.1:8000 
+
+You can set this in the Auto Archiver orchestration.yaml file like this:
+```yaml
+instagram_api_extractor:
+  api_endpoint: http://127.0.0.1:8000
+```
+
+
+---
+
+## 2. Running the Server Again
+
+Once the session file is created, you should be able to run the server without logging in again.
+
+### To run it locally (from scripts/instagrapi_server):
+```bash
+poetry run uvicorn src.instaserver:app --port 8000
+```
+
+---
+
+## 3. Running via Docker (After Setup is Complete, either locally or via the script)
+
+Once the `instagrapi_session.json` and `.env` files are set up, you can pass them to Docker and it should authenticate successfully.
+
+### 🔨 Build the Docker image manually:
+```bash
+docker build -t instagrapi-server .
+```
+
+### ▶️ Run the container:
+```bash
+docker run -d \
+  --env-file secrets/.env \
+  -v "$(pwd)/secrets:/app/secrets" \
+  -p 8000:8000 \
+  --name ig-instasrv \
+  instagrapi-server
+```
+
+This passes the /secrets/ directory to docker as well as the environment variables from the `.env` file.
+
+
+
+---
+
+## 4. Optional Cleanup
+
+- **Stop the Docker container**:
+  ```bash
+  docker stop ig-instasrv
+  ```
+
+- **Remove the container**:
+  ```bash
+  docker rm ig-instasrv
+  ```
+
+- **Remove the Docker image**:
+  ```bash
+  docker rmi instagrapi-server
+  ```
+
+### ⏱ To run again later:
+```bash
+docker start ig-instasrv
+```
+
+---
+
+##  Notes
+
+- Never share your `.env` or `instagrapi_session.json` — these contain sensitive login data. 
+- If you want to reset your session, simply delete the `secrets/instagrapi_session.json` file and re-run the local server.
--- a/docs/source/installation/authentication.md
+++ b/docs/source/installation/authentication.md
@@ -6,6 +6,15 @@ There are two main use cases for authentication:
 * Some websites require some kind of authentication in order to view the content. Examples include Facebook, Telegram etc.
 * Some websites use anti-bot systems to block bot-like tools from accessing the website. Adding real login information to auto-archiver can sometimes bypass this.

+```{note}
+
+The Authentication framework currently only works with the following modules:
+* Generic Extractor
+* Screenshot Enricher
+
+To authenticate for WACZ archiving, see the instructions on the [](../modules/autogen/enricher/wacz_extractor_enricher.md) page.
+```
+
 ## The Authentication Config

 You can save your authentication information directly inside your orchestration config file, or as a separate file (for security/multi-deploy purposes). Whether storing your settings inside the orchestration file, or as a separate file, the configuration format is the same. Currently, auto-archiver supports the following authentication types:
@@ -27,7 +36,7 @@ You can save your authentication information directly inside your orchestration

 The Username & Password, and API settings only work with the Generic Extractor. Other modules (like the screenshot enricher) can only use the `cookies` options. Furthermore, many sites can still detect bots and block username/password logins. Twitter/X and YouTube are two prominent ones that block username/password logging.

-One of the 'Cookies' options is recommended for the most robust archiving.
+One of the 'Cookies' options is recommended for the most robust archiving, but it still isn't guaranteed to work.
 ```

 ```{code} yaml
--- a/docs/source/installation/faq.md
+++ b/docs/source/installation/faq.md
@@ -11,7 +11,6 @@ are available on the [extractors](../modules/extractor.md) page. Some sites supp
 * Twitter
 * Instagram
 * Telegram
-* VKontact
 * Tiktok
 * Bluesky

--- a/docs/source/installation/setup.md
+++ b/docs/source/installation/setup.md
@@ -27,8 +27,8 @@ The way you run the Auto Archiver depends on how you installed it (docker instal
 If you installed Auto Archiver using docker, open up your terminal, and copy-paste / type the following command:

 ```bash
-docker run -it --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver
- ```
+docker run -it --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver -- "https://example.com/1/"
+```

 breaking this command down:
   1. `docker run` tells docker to start a new container (an instance of the image)
@@ -42,6 +42,7 @@ breaking this command down:
       1.  `-v` same as above, this is a volume instruction
       2.  `$PWD/local_archive` is a folder `local_archive/` in case you want to archive locally and have the files accessible outside docker
       3.  `/app/local_archive` is a folder inside docker that you can reference in your orchestration.yml file 
+   6. ` -- "https://example.com/1/"` this will pass the URL to archive to the default [command line feeder](../modules/autogen/feeder/cli_feeder.md)

 ### Example invocations

--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

 [project]
 name = "auto-archiver"
-version = "0.13.8"
+version = "1.0.1"
 description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."

 requires-python = ">=3.10,<3.13"
@@ -41,23 +41,22 @@ dependencies = [
    "instaloader (>=0.0.0)",
    "tqdm (>=0.0.0)",
    "jinja2 (>=0.0.0)",
-    "pyOpenSSL (==24.2.1)",
-    "cryptography (>=41.0.0,<42.0.0)",
    "boto3 (>=1.28.0,<2.0.0)",
    "dataclasses-json (>=0.0.0)",
-    "yt-dlp (>=2025.1.26,<2026.0.0)",
    "numpy (==2.1.3)",
-    "vk-url-scraper (>=0.0.0)",
    "requests[socks] (>=0.0.0)",
    "warcio (>=0.0.0)",
    "jsonlines (>=0.0.0)",
    "pysubs2 (>=0.0.0)",
    "retrying (>=0.0.0)",
-    "tsp-client (>=0.0.0)",
-    "certvalidator (>=0.0.0)",
    "rich-argparse (>=1.6.0,<2.0.0)",
    "ruamel-yaml (>=0.18.10,<0.19.0)",
+    "rfc3161-client (>=1.0.1,<2.0.0)",
+    "cryptography (>44.0.1,<45.0.0)",
    "opentimestamps (>=0.4.5,<0.5.0)",
+    "bgutil-ytdlp-pot-provider (>=1.0.0)",
+    "yt-dlp[curl-cffi,default] (>=2025.5.22,<2026.0.0)",
+    "secretstorage (>=3.3.3,<4.0.0)",
 ]

 [tool.poetry.group.dev.dependencies]
--- a/scripts/instagrapi_server/.gitignore
+++ b/scripts/instagrapi_server/.gitignore
@@ -0,0 +1,2 @@
+secrets*
+*instagrapi_session.json
--- a/scripts/instagrapi_server/Dockerfile
+++ b/scripts/instagrapi_server/Dockerfile
@@ -0,0 +1,19 @@
+FROM python:3.12-slim
+WORKDIR /app
+
+# Install Poetry
+RUN pip install --upgrade pip
+RUN pip install poetry
+
+# Copy only what the server needs: `COPY . .` would also bake secrets/ (credentials, session file) into the image
+COPY pyproject.toml poetry.lock* ./
+COPY src ./src
+
+# Prevent Poetry from creating a virtual environment
+RUN poetry config virtualenvs.create false
+
+# Install dependencies
+RUN poetry install --no-root
+
+# Use uvicorn to run the FastAPI app
+CMD ["poetry", "run", "uvicorn", "src.instaserver:app", "--host", "0.0.0.0", "--port", "8000"]
--- a/scripts/instagrapi_server/pyproject.toml
+++ b/scripts/instagrapi_server/pyproject.toml
@@ -0,0 +1,18 @@
+[project]
+name = "instaserver"
+version = "0.1.0"
+description = "A FastAPI InstagrAPI server"
+package-mode = false
+requires-python = ">=3.10"
+dependencies = [
+    "fastapi (>=0.115.12,<0.116.0)",
+    "instagrapi (>=2.1.3,<3.0.0)",
+    "uvicorn (>=0.34.0,<0.35.0)",
+    "pillow (>=11.1.0,<12.0.0)",
+    "python-dotenv (>=1.1.0,<2.0.0)"
+]
+
+
+[build-system]
+requires = ["poetry-core>=2.0.0,<3.0.0"]
+build-backend = "poetry.core.masonry.api"
--- a/scripts/instagrapi_server/run_instagrapi_server.sh
+++ b/scripts/instagrapi_server/run_instagrapi_server.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+#
+# run_instagrapi_server.sh
+# Prompts for Instagram credentials, writes them to secrets/.env, builds the
+# Docker image and starts the Instagrapi server container on port 8000.
+# Usage:
+#   From repo root:   ./scripts/instagrapi_server/run_instagrapi_server.sh
+#   Or from script dir: ./run_instagrapi_server.sh
+#
+
+set -e
+
+# Step 1: cd to the script's directory (contains Dockerfile and secrets/)
+cd "$(dirname "$0")" || exit 1
+
+# Create secrets/ if it doesn't exist
+if [[ ! -d "secrets" ]]; then
+  echo "Creating secrets/ directory..."
+  mkdir secrets
+fi
+
+echo "Enter your Instagram credentials to store in secrets/.env"
+read -rp "Instagram Username: " IGUSER
+read -rsp "Instagram Password: " IGPASS
+echo ""
+
+# Restrict the file to the current user *before* the password is written to it;
+# the `>` redirection below keeps the permissions of an already existing file.
+touch secrets/.env
+chmod 600 secrets/.env
+cat <<EOF > secrets/.env
+INSTAGRAM_USERNAME=$IGUSER
+INSTAGRAM_PASSWORD=$IGPASS
+EOF
+echo "Created secrets/.env with your credentials."
+
+# Build Docker image
+IMAGE_NAME="instagrapi-server"
+echo "Building Docker image '$IMAGE_NAME'..."
+docker build -t "$IMAGE_NAME" .
+
+# Run container
+CONTAINER_NAME="ig-instasrv"
+# A container left over from an earlier run would make `docker run --name` fail
+# with a name conflict (and abort the script under `set -e`), so remove it first.
+if docker container inspect "$CONTAINER_NAME" >/dev/null 2>&1; then
+  echo "Removing existing container '$CONTAINER_NAME'..."
+  docker rm -f "$CONTAINER_NAME" >/dev/null
+fi
+echo "Running container '$CONTAINER_NAME'..."
+docker run -d \
+  --env-file secrets/.env \
+  -v "$(pwd)/secrets:/app/secrets" \
+  -p 8000:8000 \
+  --name "$CONTAINER_NAME" \
+  "$IMAGE_NAME"
+
+echo "Done! Instagrapi server is running on port 8000."
+echo "Use 'docker logs $CONTAINER_NAME' to view logs."
+echo "Use 'docker stop $CONTAINER_NAME' and 'docker rm $CONTAINER_NAME' to stop/remove the container."
--- a/scripts/instagrapi_server/src/instaserver.py
+++ b/scripts/instagrapi_server/src/instaserver.py
@@ -0,0 +1,157 @@
+"""https://subzeroid.github.io/instagrapi/
+
+Run using the following command:
+ uvicorn src.instaserver:app --host 0.0.0.0 --port 8000 --reload
+"""
+
+import logging
+import os
+import sys
+from dotenv import load_dotenv
+
+from fastapi import FastAPI, HTTPException
+from instagrapi import Client
+from instagrapi.exceptions import LoginRequired, BadCredentials
+
+load_dotenv(dotenv_path="secrets/.env")
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+
+INSTAGRAM_USERNAME = os.getenv("INSTAGRAM_USERNAME")
+INSTAGRAM_PASSWORD = os.getenv("INSTAGRAM_PASSWORD")
+SESSION_FILE = "secrets/instagrapi_session.json"
+
+app = FastAPI()
+cl = Client()
+
+
+@app.on_event("startup")
+def startup_event():
+    """Login automatically when server starts"""
+    try:
+        login_instagram()
+    except RuntimeError as e:
+        logging.error(f"API failed to start: {e}")
+        sys.exit(1)
+
+
+def login_instagram():
+    """Ensures Instagrapi is logged in and session is persistent"""
+    if not INSTAGRAM_USERNAME or not INSTAGRAM_PASSWORD:
+        raise RuntimeError("Instagram credentials are missing.")
+
+    if os.path.exists(SESSION_FILE):
+        try:
+            cl.load_settings(SESSION_FILE)
+            cl.get_timeline_feed()
+            logging.info("Using saved session.")
+            return
+        except LoginRequired:
+            logging.info("Session expired. Logging in again...")
+
+    try:
+        cl.login(INSTAGRAM_USERNAME, INSTAGRAM_PASSWORD)
+        cl.dump_settings(SESSION_FILE)
+        logging.info("Login successful, session saved.")
+    except BadCredentials as bc:
+        raise RuntimeError("Incorrect Instagram username or password.") from bc
+    except Exception as e:
+        raise RuntimeError(f"Login failed: {e}") from e
+
+
+@app.get("/v1/media/by/id")
+def get_media_by_id(id: str):
+    """Fetch post details by media ID"""
+    logging.info(f"Fetching media by ID: {id}")
+    try:
+        media = cl.media_info(id)
+        return media.model_dump()
+    except Exception as e:
+        logging.warning(f"Media not found for ID {id}: {e}")
+        raise HTTPException(status_code=404, detail="Post not found") from e
+
+
+@app.get("/v1/media/by/code")
+def get_media_by_code(code: str):
+    """Fetch post details by shortcode"""
+    logging.info(f"Fetching media by shortcode: {code}")
+    try:
+        media_id = cl.media_pk_from_code(code)
+        media = cl.media_info(media_id)
+        return media.model_dump()
+    except Exception as e:
+        logging.warning(f"Media not found for code {code}: {e}")
+        raise HTTPException(status_code=404, detail="Post not found") from e
+
+
+@app.get("/v2/user/tag/medias")
+def get_user_tagged_medias(user_id: str, page_id: str = None):
+    logging.info(f"Fetching tagged medias for user_id={user_id} page_id={page_id}")
+    try:
+        # Placeholder for now
+        items, next_page_id = [], None
+        return {"response": {"items": items}, "next_page_id": next_page_id}
+    except Exception as e:
+        logging.warning(f"Tagged media not found for {user_id}: {e}")
+        raise HTTPException(status_code=404, detail="Tagged media not found") from e
+
+
+@app.get("/v1/user/highlights")
+def get_user_highlights(user_id: str):
+    logging.info(f"Fetching highlights list for user_id={user_id}")
+    try:
+        highlights = cl.user_highlights(user_id)
+        return [h.model_dump() for h in highlights]
+    except Exception as e:
+        logging.warning(f"Highlights not found for {user_id}: {e}")
+        raise HTTPException(status_code=404, detail="No highlights found") from e
+
+
+@app.get("/v2/highlight/by/id")
+def get_highlight_by_id(id: str):
+    logging.info(f"Fetching highlight details for id={id}")
+    try:
+        highlight = cl.highlight_info(id)
+        return {"response": {"reels": {f"highlight:{id}": highlight.model_dump()}}}
+    except Exception as e:
+        logging.warning(f"Highlight not found for id {id}: {e}")
+        raise HTTPException(status_code=404, detail="Highlight not found") from e
+
+
+@app.get("/v1/user/stories/by/username")
+def get_stories(username: str):
+    logging.info(f"Fetching stories for username={username}")
+    try:
+        user_id = cl.user_id_from_username(username)
+        stories = cl.user_stories(user_id)
+        return [story.model_dump() for story in stories]
+    except Exception as e:
+        logging.warning(f"Stories not found for {username}: {e}")
+        raise HTTPException(status_code=404, detail="Stories not found") from e
+
+
+@app.get("/v2/user/by/username")
+def get_user_by_username(username: str):
+    logging.info(f"Fetching user profile for username={username}")
+    try:
+        user = cl.user_info_by_username(username)
+        return {"user": user.model_dump()}
+    except Exception as e:
+        logging.warning(f"User not found: {username}: {e}")
+        raise HTTPException(status_code=404, detail="User not found") from e
+
+
+@app.get("/v1/user/medias/chunk")
+def get_user_medias(user_id: str, end_cursor: str = None):
+    logging.info(f"Fetching paginated medias for user_id={user_id}, end_cursor={end_cursor}")
+    try:
+        posts, next_cursor = cl.user_medias_paginated(user_id, end_cursor=end_cursor)
+        return [[post.model_dump() for post in posts], next_cursor]
+    except Exception as e:
+        logging.warning(f"No posts found for user_id={user_id}: {e}")
+        raise HTTPException(status_code=404, detail="No posts found") from e
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(app, host="0.0.0.0", port=8000)
--- a/scripts/settings/package-lock.json
+++ b/scripts/settings/package-lock.json
--- a/src/auto_archiver/core/module.py
+++ b/src/auto_archiver/core/module.py
@@ -214,11 +214,8 @@ class LazyBaseModule:

        # check external dependencies are installed
        def check_deps(deps, check):
-            for dep in deps:
-                if not len(dep):
-                    # clear out any empty strings that a user may have erroneously added
-                    continue
-                if not check(dep):
+            for dep in filter(lambda d: len(d.strip()) > 0, deps):
+                if not check(dep.strip()):
                    logger.error(
                        f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
                                 Have you installed the required dependencies for the '{self.name}' module? See the documentation for more information."
@@ -277,6 +274,9 @@ class LazyBaseModule:
        # finally, get the class instance
        instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()

+        # save the instance for future easy loading
+        self._instance = instance
+
        # set the name, display name and module factory
        instance.name = self.name
        instance.display_name = self.display_name
@@ -289,8 +289,6 @@ class LazyBaseModule:
        instance.config_setup(config)
        instance.setup()

-        # save the instance for future easy loading
-        self._instance = instance
        return instance

    def __repr__(self):
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -5,6 +5,7 @@ formatting, database operations and clean up.
 """

 from __future__ import annotations
+from packaging import version
 from typing import Generator, Union, List, Type, TYPE_CHECKING
 import argparse
 import os
@@ -387,8 +388,10 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
                except (KeyboardInterrupt, Exception) as e:
                    if not isinstance(e, KeyboardInterrupt) and not isinstance(e, SetupError):
                        logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
-                    if loaded_module and module_type == "extractor":
-                        loaded_module.cleanup()
+
+                    # access the _instance here because loaded_module may not return if there's an error
+                    if lazy_module._instance and module_type == "extractor":
+                        lazy_module._instance.cleanup()
                    raise e

                if not loaded_module:
@@ -434,16 +437,19 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_

    def check_for_updates(self):
        response = requests.get("https://pypi.org/pypi/auto-archiver/json").json()
-        latest_version = response["info"]["version"]
+        latest_version = version.parse(response["info"]["version"])
+        current_version = version.parse(__version__)
        # check version compared to current version
-        if latest_version != __version__:
+        if latest_version > current_version:
            if os.environ.get("RUNNING_IN_DOCKER"):
                update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
            else:
                update_cmd = "`pip install --upgrade auto-archiver`"
            logger.warning("")
            logger.warning("********* IMPORTANT: UPDATE AVAILABLE ********")
-            logger.warning(f"A new version of auto-archiver is available (v{latest_version}, you have {__version__})")
+            logger.warning(
+                f"A new version of auto-archiver is available (v{latest_version}, you have v{current_version})"
+            )
            logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
            logger.warning("")

--- a/src/auto_archiver/core/validators.py
+++ b/src/auto_archiver/core/validators.py
@@ -4,12 +4,6 @@ import argparse
 import json


-def example_validator(value):
-    if "example" not in value:
-        raise argparse.ArgumentTypeError(f"{value} is not a valid value for this argument")
-    return value
-
-
 def positive_number(value):
    if value < 0:
        raise argparse.ArgumentTypeError(f"{value} is not a positive number")
--- a/src/auto_archiver/modules/generic_extractor/manifest.py
+++ b/src/auto_archiver/modules/generic_extractor/manifest.py
@@ -74,6 +74,11 @@ If you are having issues with the extractor, you can review the version of `yt-d
            "default": "inf",
            "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
        },
+        "bguils_po_token_method": {
+            "default": "auto",
+            "help": "Set up a Proof of origin token provider. This process has additional requirements. See [authentication](https://auto-archiver.readthedocs.io/en/latest/how_to/authentication_how_to.html) for more information.",
+            "choices": ["auto", "script", "disabled"],
+        },
        "extractor_args": {
            "default": {},
            "help": "Additional arguments to pass to the yt-dlp extractor. See https://github.com/yt-dlp/yt-dlp/blob/master/README.md#extractor-arguments.",
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -1,12 +1,18 @@
+import mimetypes
+import shutil
+import sys
 import datetime
 import os
 import importlib
 import subprocess
+import zipfile

 from typing import Generator, Type
+from urllib.request import urlretrieve

 import yt_dlp
 from yt_dlp.extractor.common import InfoExtractor
+from yt_dlp.utils import MaxDownloadsReached
 import pysubs2

 from loguru import logger
@@ -25,45 +31,138 @@ class GenericExtractor(Extractor):
    _dropins = {}

    def setup(self):
-        # check for file .ytdlp-update in the secrets folder
+        self.check_for_extractor_updates()
+        self.setup_po_tokens()
+
+    def check_for_extractor_updates(self):
+        """Checks whether yt-dlp or its plugins need updating and triggers a restart if so."""
        if self.ytdlp_update_interval < 0:
            return

-        use_secrets = os.path.exists("secrets")
-        path = os.path.join("secrets" if use_secrets else "", ".ytdlp-update")
-        next_update_check = None
-        if os.path.exists(path):
-            with open(path, "r") as f:
-                next_update_check = datetime.datetime.fromisoformat(f.read())
+        update_file = os.path.join("secrets" if os.path.exists("secrets") else "", ".ytdlp-update")
+        next_check = None
+        if os.path.exists(update_file):
+            with open(update_file, "r") as f:
+                next_check = datetime.datetime.fromisoformat(f.read())

-        if not next_update_check or next_update_check < datetime.datetime.now():
-            self.update_ytdlp()
+        if next_check and next_check > datetime.datetime.now():
+            return

-            next_update_check = datetime.datetime.now() + datetime.timedelta(days=self.ytdlp_update_interval)
-            with open(path, "w") as f:
-                f.write(next_update_check.isoformat())
+        yt_dlp_updated = self.update_package("yt-dlp")
+        bgutil_updated = self.update_package("bgutil-ytdlp-pot-provider")

-    def update_ytdlp(self):
-        logger.info("Checking and updating yt-dlp...")
-        logger.info(
-            f"Tip: change the 'ytdlp_update_interval' setting to control how often yt-dlp is updated. Set to -1 to disable or 0 to enable on every run. Current setting: {self.ytdlp_update_interval}"
-        )
+        # Write the new timestamp
+        with open(update_file, "w") as f:
+            next_check = datetime.datetime.now() + datetime.timedelta(days=self.ytdlp_update_interval)
+            f.write(next_check.isoformat())
+
+        if yt_dlp_updated or bgutil_updated:
+            if os.environ.get("AUTO_ARCHIVER_ALLOW_RESTART", "1") != "1":
+                logger.warning("yt-dlp or plugin was updated — please restart auto-archiver manually")
+            else:
+                logger.warning("yt-dlp or plugin was updated — restarting auto-archiver")
+                logger.warning(" ======= RESTARTING ======= ")
+                os.execv(sys.executable, [sys.executable] + sys.argv)
+
+    def update_package(self, package_name: str) -> bool:
+        """Upgrade `package_name` via pip. Returns True only when pip installed a new version; errors are logged."""
+        logger.info(f"Checking and updating {package_name}...")
        from importlib.metadata import version as get_version

-        old_version = get_version("yt-dlp")
        try:
-            # try and update with pip (this works inside poetry environment and in a normal virtualenv)
-            result = subprocess.run(["pip", "install", "--upgrade", "yt-dlp"], check=True, capture_output=True)
+            old_version = get_version(package_name)  # inside try: PackageNotFoundError is logged, not raised
+            result = subprocess.run(["pip", "install", "--upgrade", package_name], check=True, capture_output=True)
+            if f"Successfully installed {package_name}" in result.stdout.decode():
+                logger.info(f"{package_name} updated from {old_version} to {get_version(package_name)}")
+                return True
+            logger.info(f"{package_name} already up to date")
+        except Exception as e:
+            logger.error(f"Error updating {package_name}: {e}")
+        return False

-            if "Successfully installed yt-dlp" in result.stdout.decode():
-                new_version = importlib.metadata.version("yt-dlp")
-                logger.info(f"yt-dlp successfully (from {old_version} to {new_version})")
-                importlib.reload(yt_dlp)
+    def setup_po_tokens(self) -> None:
+        """Choose how Proof of Origin (PO) tokens are provided, based on `bguils_po_token_method`.
+        Uses provider: https://github.com/Brainicism/bgutil-ytdlp-pot-provider.
+        """
+        method = self.bguils_po_token_method
+        if method == "disabled":
+            logger.warning("Proof of Origin Token generation is disabled.")
+            return
+
+        running_in_docker = os.environ.get("RUNNING_IN_DOCKER")
+        if method == "auto" and not running_in_docker:
+            logger.info(
+                "Proof of Origin Token method not explicitly set. "
+                "If you're running an external HTTP server separately, you can safely ignore this message. "
+                "To reduce the likelihood of bot detection, enable one of the methods described in the documentation: "
+                "https://auto-archiver.readthedocs.io/en/settings_page/installation/authentication.html#proof-of-origin-tokens"
+            )
+            return
+
+        # From here on: either running in Docker ("auto") or the "script" method was chosen
+        self.setup_token_generation_script()
+
+    def setup_token_generation_script(self) -> None:
+        """Download and build the bgutil-ytdlp-pot-provider `generate_once.js` script (skipped when the cached
+        build matches the installed plugin version) and register its path in `extractor_args`; errors are logged."""
+        missing_tools = [tool for tool in ("node", "yarn", "npx") if shutil.which(tool) is None]
+        if missing_tools:
+            logger.error(
+                f"Cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. "
+                "Install these tools or run bgutils via Docker. "
+                "See: https://github.com/Brainicism/bgutil-ytdlp-pot-provider"
+            )
+            return
+        try:
+            from importlib.metadata import version as get_version
+
+            plugin_version = get_version("bgutil-ytdlp-pot-provider")
+            base_dir = os.path.expanduser("~/bgutil-ytdlp-pot-provider")
+            server_dir = os.path.join(base_dir, "server")
+            version_file = os.path.join(server_dir, ".VERSION")
+            script_path = os.path.join(server_dir, "build", "generate_once.js")
+
+            # Reuse the cached build only when the script exists AND .VERSION matches; otherwise rebuild below
+            cached_version = None
+            if os.path.isfile(script_path) and os.path.isfile(version_file):
+                with open(version_file) as vf:
+                    cached_version = vf.read().strip()
+            if cached_version == plugin_version:
+                logger.info("PO Token script already set up and up to date.")
            else:
-                logger.info("yt-dlp already up to date")
+                # Remove an outdated directory and pull a new version
+                if os.path.exists(base_dir):
+                    shutil.rmtree(base_dir)
+                os.makedirs(base_dir, exist_ok=True)
+
+                zip_url = (
+                    f"https://github.com/Brainicism/bgutil-ytdlp-pot-provider/archive/refs/tags/{plugin_version}.zip"
+                )
+                zip_path = os.path.join(base_dir, f"{plugin_version}.zip")
+                logger.info(f"Downloading bgutils release zip for version {plugin_version}...")
+                urlretrieve(zip_url, zip_path)
+                with zipfile.ZipFile(zip_path, "r") as z:
+                    z.extractall(base_dir)
+                os.remove(zip_path)
+                extracted_root = os.path.join(base_dir, f"bgutil-ytdlp-pot-provider-{plugin_version}")
+                shutil.move(os.path.join(extracted_root, "server"), server_dir)
+                shutil.rmtree(extracted_root)
+                logger.info("Installing dependencies and transpiling PoT Generator script...")
+                subprocess.run(["yarn", "install", "--frozen-lockfile"], cwd=server_dir, check=True)
+                subprocess.run(["npx", "tsc"], cwd=server_dir, check=True)
+                # Record the version only after a successful build, so a failed build is retried next time
+                with open(version_file, "w") as vf:
+                    vf.write(plugin_version)
+
+            if not os.path.exists(script_path):
+                logger.error("generate_once.js not found after transpilation.")
+                return
+
+            self.extractor_args.setdefault("youtubepot-bgutilscript", {})["script_path"] = script_path
+            logger.info(f"PO Token script configured at: {script_path}")

        except Exception as e:
-            logger.error(f"Error updating yt-dlp: {e}")
+            logger.error(f"Failed to set up PO Token script: {e}")

    def suitable_extractors(self, url: str) -> Generator[str, None, None]:
        """
@@ -204,9 +303,9 @@ class GenericExtractor(Extractor):
            result.set_url(url)

        if "description" in video_data and not result.get("content"):
-            result.set_content(video_data["description"])
+            result.set_content(video_data.pop("description"))
        # extract comments if enabled
-        if self.comments:
+        if self.comments and video_data.get("comments", []) is not None:
            result.set(
                "comments",
                [
@@ -265,7 +364,12 @@ class GenericExtractor(Extractor):
        # this time download
        ydl.params["getcomments"] = self.comments
        # TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
-        data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
+        try:
+            data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
+        except MaxDownloadsReached:  # proceed as normal once MaxDownloadsReached is raised
+            pass
+        logger.success(data)
+
        if "entries" in data:
            entries = data.get("entries", [])
            if not len(entries):
@@ -273,14 +377,33 @@ class GenericExtractor(Extractor):
                return False
        else:
            entries = [data]
-
        result = Metadata()

+        def _helper_get_filename(entry: dict) -> str:
+            """Locate the file downloaded for `entry`; returns its path, or False if none matches."""
+            source_url = entry.get("url")
+            stem_path, _ = os.path.splitext(ydl.prepare_filename(entry))  # '/get/path/to/file' without '.ext'
+            folder, stem = os.path.split(stem_path)  # '/get/path/to', 'file'
+            for candidate in os.listdir(folder):
+                if candidate.startswith(stem):
+                    return os.path.join(folder, candidate)
+                if (
+                    source_url
+                    and os.path.splitext(candidate)[0] in source_url
+                    and "video/" in (mimetypes.guess_type(candidate)[0] or "")  # only for URL matches
+                ):
+                    return os.path.join(folder, candidate)
+            return False
+
        for entry in entries:
            try:
-                filename = ydl.prepare_filename(entry)
-                if not os.path.exists(filename):
-                    filename = filename.split(".")[0] + ".mkv"
+                filename = _helper_get_filename(entry)
+
+                if not filename or not os.path.exists(filename):
+                    # file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies.
+                    continue
+
+                logger.debug(f"Using filename {filename} for entry {entry.get('id', 'unknown')}")

                new_media = Media(filename)
                for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
@@ -299,6 +422,9 @@ class GenericExtractor(Extractor):
                result.add_media(new_media)
            except Exception as e:
                logger.error(f"Error processing entry {entry}: {e}")
+        if not len(result.media):
+            logger.warning(f"No media found for entry {entry}, skipping.")
+            return False

        return self.add_metadata(data, info_extractor, url, result)

@@ -357,6 +483,13 @@ class GenericExtractor(Extractor):

        dropin_submodule = self.dropin_for_name(info_extractor.ie_key())

+        def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
+            if data.get("is_live", False) and not self.livestreams:
+                logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
+                return False
+            # not a skipped livestream: a valid video that yt-dlp can download out of the box
+            return self.get_metadata_for_video(data, info_extractor, url, ydl)
+
        try:
            if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
                logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
@@ -364,11 +497,12 @@ class GenericExtractor(Extractor):

            # don't download since it can be a live stream
            data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
-            if data.get("is_live", False) and not self.livestreams:
-                logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
-                return False
-            # it's a valid video, that the youtubdedl can download out of the box
-            result = self.get_metadata_for_video(data, info_extractor, url, ydl)
+
+            result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
+
+        except MaxDownloadsReached:
+            # yt-dlp raises an error when the max downloads limit is reached, and it shouldn't for our purposes, so we consider that a success
+            result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)

        except Exception as e:
            if info_extractor.IE_NAME == "generic":
@@ -422,6 +556,8 @@ class GenericExtractor(Extractor):
            "--write-subs" if self.subtitles else "--no-write-subs",
            "--write-auto-subs" if self.subtitles else "--no-write-auto-subs",
            "--live-from-start" if self.live_from_start else "--no-live-from-start",
+            "--postprocessor-args",
+            "ffmpeg:-bitexact",  # ensure bitexact output to avoid mismatching hashes for same video
        ]

        # proxy handling
--- a/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py
+++ b/src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py
@@ -88,10 +88,7 @@ class GsheetsFeederDB(Feeder, Database):
        if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
            # ALLOW rules exist AND sheet name not explicitly allowed
            return False
-        if len(self.block_worksheets) and sheet_name in self.block_worksheets:
-            # BLOCK rules exist AND sheet name is blocked
-            return False
-        return True
+        return not (self.block_worksheets and sheet_name in self.block_worksheets)

    def missing_required_columns(self, gw: GWorksheet) -> list:
        missing = []
@@ -101,7 +98,7 @@ class GsheetsFeederDB(Feeder, Database):
        return missing

    def started(self, item: Metadata) -> None:
-        logger.warning(f"STARTED {item}")
+        logger.info(f"STARTED {item}")
        gw, row = self._retrieve_gsheet(item)
        gw.set_cell(row, "status", "Archive in progress")

@@ -161,9 +158,8 @@ class GsheetsFeederDB(Feeder, Database):
        if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
            batch_if_valid("screenshot", "\n".join(screenshot.urls))

-        if thumbnail := item.get_first_image("thumbnail"):
-            if hasattr(thumbnail, "urls"):
-                batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")')
+        if (thumbnail := item.get_first_image("thumbnail")) and hasattr(thumbnail, "urls"):
+            batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")')

        if browsertrix := item.get_media_by_id("browsertrix"):
            batch_if_valid("wacz", "\n".join(browsertrix.urls))
--- a/src/auto_archiver/modules/instagram_api_extractor/manifest.py
+++ b/src/auto_archiver/modules/instagram_api_extractor/manifest.py
@@ -31,9 +31,11 @@
        },
    },
    "description": """
-Archives various types of Instagram content using the Instagrapi API.
+Archives Instagram content using a deployment of the [Instagrapi API](https://subzeroid.github.io/instagrapi/).

-Requires setting up an Instagrapi API deployment and providing an access token and API endpoint.
+Requires a token from a hosted [(paid) service](https://api.instagrapi.com/docs), which you set in the configuration file.
+Alternatively, you can run your own server: we provide a basic script for this, which can be run locally or using Docker.
+For more information, read the [how-to guide](https://auto-archiver.readthedocs.io/en/latest/how_to/run_instagrapi_server.html) on this.

 ### Features
 - Connects to an Instagrapi API deployment to fetch Instagram profiles, posts, stories, highlights, reels, and tagged content.
--- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
+++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
@@ -88,6 +88,9 @@ class InstagramTbotExtractor(Extractor):

            if message:
                result.set_content(message).set_title(message[:128])
+            elif result.is_empty():
+                logger.debug(f"No media found for link {url=} for {self.name}: {message}")
+                return False
            return result.success("insta-via-bot")

    def _send_url_to_bot(self, url: str):
@@ -104,13 +107,13 @@ class InstagramTbotExtractor(Extractor):
        message = ""
        time.sleep(3)
        # media is added before text by the bot so it can be used as a stop-logic mechanism
-        while attempts < max(self.timeout - 3, 3) and (not message or not len(seen_media)):
+        while attempts < max(self.timeout - 3, 15) and (not message or not len(seen_media)):
            attempts += 1
            time.sleep(1)
            for post in self.client.iter_messages(chat, min_id=since_id):
                since_id = max(since_id, post.id)
                # Skip known filler message:
-                if post.message == "The bot receives information through https://hikerapi.com/p/hJqpppqi":
+                if "The bot receives information through https://hikerapi.com/" in post.message:
                    continue
                if post.media and post.id not in seen_media:
                    filename_dest = os.path.join(tmp_dir, f"{chat.id}_{post.id}")
--- a/src/auto_archiver/modules/telethon_extractor/manifest.py
+++ b/src/auto_archiver/modules/telethon_extractor/manifest.py
@@ -19,7 +19,7 @@
        },
        "session_file": {
            "default": "secrets/anon",
-            "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value.",
+            "help": "Path of the file to save the telegram login session for future usage, '.session' will be appended to the provided path.",
        },
        "join_channels": {
            "default": True,
--- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
+++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
@@ -1,4 +1,10 @@
+import os
 import shutil
+import re
+import time
+from pathlib import Path
+from datetime import date
+
 from telethon.sync import TelegramClient
 from telethon.errors import ChannelInvalidError
 from telethon.tl.functions.messages import ImportChatInviteRequest
@@ -8,11 +14,9 @@ from telethon.errors.rpcerrorlist import (
    InviteRequestSentError,
    InviteHashExpiredError,
 )
-from loguru import logger
+
 from tqdm import tqdm
-import re
-import time
-import os
+from loguru import logger

 from auto_archiver.core import Extractor
 from auto_archiver.core import Metadata, Media
@@ -31,10 +35,22 @@ class TelethonExtractor(Extractor):
        """
        logger.info(f"SETUP {self.name} checking login...")

+        # in case the user already added '.session' to the session_file
+        base_session_name = self.session_file.removesuffix(".session")
+        base_session_filepath = f"{base_session_name}.session"
+
+        if self.session_file and not os.path.exists(base_session_filepath):
+            logger.warning(
+                f"SETUP - Session file {base_session_filepath} does not exist for {self.name}, creating an empty one."
+            )
+            Path(base_session_filepath).touch()
+
        # make a copy of the session that is used exclusively with this archiver instance
-        new_session_file = os.path.join("secrets/", f"telethon-{time.strftime('%Y-%m-%d')}{random_str(8)}.session")
-        shutil.copy(self.session_file + ".session", new_session_file)
-        self.session_file = new_session_file.replace(".session", "")
+        self.session_file = os.path.join(
+            os.path.dirname(base_session_filepath), f"telethon-{date.today().strftime('%Y-%m-%d')}{random_str(8)}"
+        )
+        logger.debug(f"Making a copy of the session file {base_session_filepath} to {self.session_file}.session")
+        shutil.copy(base_session_filepath, f"{self.session_file}.session")

        # initiate the client
        self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
@@ -87,8 +103,8 @@ class TelethonExtractor(Extractor):
                    pbar.update()

    def cleanup(self) -> None:
-        logger.info(f"CLEANUP {self.name}.")
-        session_file_name = self.session_file + ".session"
+        logger.info(f"CLEANUP {self.name} - removing session file {self.session_file}.session")
+        session_file_name = f"{self.session_file}.session"
        if os.path.exists(session_file_name):
            os.remove(session_file_name)

@@ -174,7 +190,7 @@ class TelethonExtractor(Extractor):
        if getattr(original_post, "grouped_id", None) is None:
            return [original_post] if getattr(original_post, "media", False) else []

-        search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 1)]
+        search_ids = list(range(original_post.id - max_amp, original_post.id + max_amp + 1))
        posts = self.client.get_messages(chat, ids=search_ids)
        media = []
        for post in posts:
--- a/src/auto_archiver/modules/timestamping_enricher/manifest.py
+++ b/src/auto_archiver/modules/timestamping_enricher/manifest.py
@@ -3,30 +3,38 @@
    "type": ["enricher"],
    "requires_setup": True,
    "dependencies": {
-        "python": ["loguru", "slugify", "tsp_client", "asn1crypto", "certvalidator", "certifi"],
+        "python": ["loguru", "slugify", "cryptography", "rfc3161_client", "certifi"],
    },
    "configs": {
        "tsa_urls": {
            "default": [
-                # [Adobe Approved Trust List] and [Windows Cert Store]
-                "http://timestamp.digicert.com",
-                "http://timestamp.identrust.com",
-                # "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping
-                # "https://timestamp.sectigo.com", # wait 15 seconds between each request.
-                # [Adobe: European Union Trusted Lists].
-                # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request.
-                # [Windows Cert Store]
-                "http://timestamp.globalsign.com/tsa/r6advanced1",
-                # [Adobe: European Union Trusted Lists] and [Windows Cert Store]
-                # "http://ts.quovadisglobal.com/eu", # not valid for timestamping
-                # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain
-                # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain
-                # "http://tsa.sep.bg", # self-signed certificate in certificate chain
-                # "http://tsa.izenpe.com", #unable to get local issuer certificate
-                # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate
-                "http://tss.accv.es:8318/tsa",
-            ],
+                # See https://github.com/trailofbits/rfc3161-client/issues/46 for a list of valid TSAs
+                # Full list of TSAs: https://gist.github.com/Manouchehri/fd754e402d98430243455713efada710
+                    "http://timestamp.identrust.com",
+                    "http://timestamp.ssl.trustwave.com",
+                    "http://zeitstempel.dfn.de",
+                    "http://ts.ssl.com",
+                    # "http://tsa.izenpe.com", # self-signed
+                    "http://tsa.lex-persona.com/tsa",
+                    # "http://ca.signfiles.com/TSAServer.aspx", # self-signed
+                    # "http://tsa.sinpe.fi.cr/tsaHttp/", # self-signed
+                    # "http://tsa.cra.ge/signserver/tsa?workerName=qtsa", # self-signed
+                    "http://tss.cnbs.gob.hn/TSS/HttpTspServer",
+                    "http://dss.nowina.lu/pki-factory/tsa/good-tsa",
+                    # "https://freetsa.org/tsr", # self-signed
+                ],
            "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
+        },
+        "cert_authorities": {
+            "default": None,
+            "help": "Path to a file containing trusted Certificate Authorities (CAs) in PEM format. If empty, the default system authorities are used.",
+            "type": "str",
+        },
+        "allow_selfsigned": {
+            "default": False,
+            "help": "Whether or not to allow and save self-signed Timestamping certificates. This allows for a greater range of timestamping servers to be used, \
+but they are not trusted authorities",
+            "type": "bool"
        }
    },
    "description": """
--- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py
+++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py
@@ -1,15 +1,23 @@
 import os
-from loguru import logger
-from tsp_client import TSPSigner, SigningSettings, TSPVerifier
-from tsp_client.algorithms import DigestAlgorithm
+
 from importlib.metadata import version
-from asn1crypto.cms import ContentInfo
-from certvalidator import CertificateValidator, ValidationContext
-from asn1crypto import pem
+import hashlib
+
+from slugify import slugify
+import requests
+from loguru import logger
+
+from rfc3161_client import (decode_timestamp_response,TimestampRequestBuilder,TimeStampResponse, VerifierBuilder)
+from rfc3161_client import VerificationError as Rfc3161VerificationError
+from rfc3161_client.base import HashAlgorithm
+from rfc3161_client.tsp import SignedData
+from cryptography import x509
+from cryptography.hazmat.primitives import serialization
 import certifi

 from auto_archiver.core import Enricher
 from auto_archiver.core import Metadata, Media
+from auto_archiver.version import __version__


 class TimestampingEnricher(Enricher):
@@ -21,6 +29,25 @@ class TimestampingEnricher(Enricher):
    See https://gist.github.com/Manouchehri/fd754e402d98430243455713efada710 for list of timestamp authorities.
    """

+    session = None
+
+    def setup(self):
+        """
+        Create the `requests` session stored on `self.session` and preset the headers sent
+        with it: timestamp-query content type, timestamp-reply accept type and a
+        User-Agent carrying the Auto-Archiver version.
+        """
+        default_headers = {
+            "Content-Type": "application/timestamp-query",
+            "User-Agent": f"Auto-Archiver {__version__}",
+            "Accept": "application/timestamp-reply",
+        }
+        http_session = requests.Session()
+        http_session.headers.update(default_headers)
+        self.session = http_session
+
+    def cleanup(self) -> None:
+        """
+        Terminates the underlying network session.
+
+        Does nothing when no session has been created (`self.session` is still None).
+        """
+        if self.session:
+            self.session.close()
+
+    # backwards-compatible alias for the previous, misspelled method name
+    cleaup = cleanup
+
    def enrich(self, to_enrich: Metadata) -> None:
        url = to_enrich.get_url()
        logger.debug(f"RFC3161 timestamping existing files for {url=}")
@@ -34,8 +61,8 @@ class TimestampingEnricher(Enricher):
            logger.warning(f"No hashes found in {url=}")
            return

-        tmp_dir = self.tmp_dir
-        hashes_fn = os.path.join(tmp_dir, "hashes.txt")
+        
+        hashes_fn = os.path.join(self.tmp_dir, "hashes.txt")

        data_to_sign = "\n".join(hashes)
        with open(hashes_fn, "w") as f:
@@ -43,62 +70,160 @@ class TimestampingEnricher(Enricher):
        hashes_media = Media(filename=hashes_fn)

        timestamp_tokens = []
-        from slugify import slugify
-
        for tsa_url in self.tsa_urls:
            try:
-                signing_settings = SigningSettings(tsp_server=tsa_url, digest_algorithm=DigestAlgorithm.SHA256)
-                signer = TSPSigner()
-                message = bytes(data_to_sign, encoding="utf8")
-                # send TSQ and get TSR from the TSA server
-                signed = signer.sign(message=message, signing_settings=signing_settings)
-                # fail if there's any issue with the certificates, uses certifi list of trusted CAs
-                TSPVerifier(certifi.where()).verify(signed, message=message)
-                # download and verify timestamping certificate
-                cert_chain = self.download_and_verify_certificate(signed)
-                # continue with saving the timestamp token
-                tst_fn = os.path.join(tmp_dir, f"timestamp_token_{slugify(tsa_url)}")
-                with open(tst_fn, "wb") as f:
-                    f.write(signed)
-                timestamp_tokens.append(Media(filename=tst_fn).set("tsa", tsa_url).set("cert_chain", cert_chain))
+                message = bytes(data_to_sign, encoding='utf8')
+
+                logger.debug(f"Timestamping {url=} with {tsa_url=}")
+                signed: TimeStampResponse = self.sign_data(tsa_url, message)
+                
+                # fail if there's any issue with the certificates, uses certifi list of trusted CAs or the user-defined `cert_authorities`
+                root_cert = self.verify_signed(signed, message)
+
+                if not root_cert:
+                    if self.allow_selfsigned:
+                        logger.warning(f"Allowing self-signed certificat from TSA {tsa_url=}")
+                    else:
+                        raise ValueError(f"No valid root certificate found for {tsa_url=}. Are you sure it's a trusted TSA? Or define an alternative trusted root with `cert_authorities`. (tried: {self.cert_authorities or certifi.where()})")
+
+                # save the timestamping certificate
+                cert_chain = self.save_certificate(signed, root_cert)
+
+                timestamp_token_path = self.save_timestamp_token(signed.time_stamp_token(), tsa_url)
+                timestamp_tokens.append(Media(filename=timestamp_token_path).set("tsa", tsa_url).set("cert_chain", cert_chain))
            except Exception as e:
                logger.warning(f"Error while timestamping {url=} with {tsa_url=}: {e}")

        if len(timestamp_tokens):
            hashes_media.set("timestamp_authority_files", timestamp_tokens)
            hashes_media.set("certifi v", version("certifi"))
-            hashes_media.set("tsp_client v", version("tsp_client"))
-            hashes_media.set("certvalidator v", version("certvalidator"))
+            hashes_media.set("rfc3161-client v", version("rfc3161_client"))
+            hashes_media.set("cryptography v", version("cryptography"))
            to_enrich.add_media(hashes_media, id="timestamped_hashes")
            to_enrich.set("timestamped", True)
            logger.success(f"{len(timestamp_tokens)} timestamp tokens created for {url=}")
        else:
            logger.warning(f"No successful timestamps for {url=}")

-    def download_and_verify_certificate(self, signed: bytes) -> list[Media]:
+    def save_timestamp_token(self, timestamp_token: bytes, tsa_url: str) -> str:
+        """
+        Write the timestamp token bytes to `timestamp_token_<slugified TSA URL>` inside
+        `self.tmp_dir` and return the path of the written file.
+        """
+        token_filename = f"timestamp_token_{slugify(tsa_url)}"
+        token_path = os.path.join(self.tmp_dir, token_filename)
+        with open(token_path, "wb") as token_file:
+            token_file.write(timestamp_token)
+        return token_path
+
+    def verify_signed(self, timestamp_response: TimeStampResponse, message: bytes) ->  x509.Certificate:
+        """
+        Verify a Signed Timestamp Response is trusted by a known Certificate Authority.
+
+        Every certificate of the trusted root store (the `cert_authorities` file when set,
+        otherwise the certifi bundle) is tried in turn as the single root of a verifier.
+
+        Args:
+            timestamp_response (TimeStampResponse): The signed timestamp response.
+            message (bytes): The message that was timestamped.
+
+        Returns:
+            x509.Certificate: The first trusted root certificate that verifies the timestamp
+            response, or None when no certificate of the store verifies it.
+
+        Raises:
+            ValueError: If the trusted root store contains no certificates, or if the response
+            names a hash algorithm other than the two handled below (SHA-512 and SHA-256).
+        """
+
+        trusted_root_path = self.cert_authorities or certifi.where()
+        cert_authorities = []
+
+        with open(trusted_root_path, 'rb') as f:
+            cert_authorities = x509.load_pem_x509_certificates(f.read())
+
+        if not cert_authorities:
+            raise ValueError(f"No trusted roots found in {trusted_root_path}.")
+        
+        timestamp_certs = self.tst_certs(timestamp_response)
+        # keeps everything except the first and last certificate of the ordered chain
+        # NOTE(review): presumably skips the leaf and the top-most embedded certificate - confirm
+        intermediate_certs = timestamp_certs[1:-1]
+
+        # recompute the digest of `message` with the algorithm named in the response's message imprint
+        message_hash = None
+        hash_algorithm = timestamp_response.tst_info.message_imprint.hash_algorithm
+        if hash_algorithm == x509.ObjectIdentifier(value="2.16.840.1.101.3.4.2.3"):
+            message_hash = hashlib.sha512(message).digest()
+        elif hash_algorithm == x509.ObjectIdentifier(value="2.16.840.1.101.3.4.2.1"):
+            message_hash = hashlib.sha256(message).digest()
+        else:
+            raise ValueError(f"Unsupported hash algorithm: {hash_algorithm}")
+        
+        # one verifier per candidate root, so the root that succeeds can be returned to the caller
+        for certificate in cert_authorities:
+            builder = VerifierBuilder()
+            builder.add_root_certificate(certificate)
+
+            for intermediate_cert in intermediate_certs:
+                builder.add_intermediate_certificate(intermediate_cert)
+
+            verifier = builder.build()
+
+            
+            try:
+                verifier.verify(timestamp_response, message_hash)
+                return certificate
+            except Rfc3161VerificationError:
+                # this root does not verify the response, try the next one
+                continue
+
+        # no certificate of the trusted store verified the response
+        return None
+
+    def sign_data(self, tsa_url: str, bytes_data: bytes) -> TimeStampResponse:
+        """
+        Build a timestamp request (with nonce) for `bytes_data`, POST it to `tsa_url` through
+        the shared session and return the decoded response.
+
+        The response is only parsed here, it is not verified. Request errors and
+        undecodable responses are logged and re-raised.
+        """
+        # see https://github.com/sigstore/sigstore-python/blob/99948d5b80525a5a104e904ffea58169dc6e0629/sigstore/_internal/timestamp.py#L84-L121
+        request_builder = TimestampRequestBuilder().data(bytes_data).nonce(nonce=True)
+        timestamp_request = request_builder.build()
+
+        try:
+            http_reply = self.session.post(tsa_url, data=timestamp_request.as_bytes(), timeout=10)
+            http_reply.raise_for_status()
+        except requests.RequestException as err:
+            logger.error(f"Error while sending request to {tsa_url=}: {err}")
+            raise
+
+        # Check that we can parse the response but do not *verify* it
+        try:
+            return decode_timestamp_response(http_reply.content)
+        except ValueError as err:
+            logger.error(f"Invalid timestamp response from server {tsa_url}: {err}")
+            raise
+    
+    def tst_certs(self, tsp_response: TimeStampResponse):
+        """
+        Return the certificates embedded in a timestamp response, ordered leaf first.
+
+        The chain starts at the certificate whose subject is not the issuer of any embedded
+        certificate and is then extended by repeatedly picking the certificate whose subject
+        equals the issuer of the last one placed.
+
+        Certificates that cannot be linked this way (no starting certificate, or a gap in the
+        chain) are kept in their embedded order after the ordered part, so no certificate is
+        dropped and the method always terminates.
+
+        Args:
+            tsp_response (TimeStampResponse): The decoded timestamp response.
+
+        Returns:
+            list of x509.Certificate loaded from the response's signed data.
+        """
+        signed_data: SignedData = tsp_response.signed_data
+        certs = [x509.load_der_x509_certificate(c) for c in signed_data.certificates]
+        if len(certs) <= 1:
+            return certs
+
+        issuers = [cert.issuer for cert in certs]
+        leaf = next((cert for cert in certs if cert.subject not in issuers), None)
+        if leaf is None:
+            # every subject also appears as an issuer: there is no starting point to order from
+            return certs
+
+        ordered_certs = [leaf]
+        remaining = [cert for cert in certs if cert is not leaf]
+        while remaining:
+            parent = next((cert for cert in remaining if cert.subject == ordered_certs[-1].issuer), None)
+            if parent is None:
+                # gap in the chain: stop here instead of searching again without making progress
+                break
+            ordered_certs.append(parent)
+            remaining = [cert for cert in remaining if cert is not parent]
+        return ordered_certs + remaining
+
+    def save_certificate(self, tsp_response: TimeStampResponse, verified_root_cert: x509.Certificate) -> list[Media]:
+        """
+        Save the certificates of a timestamp response as PEM `.crt` files in `self.tmp_dir`.
+
+        The certificates returned by `tst_certs` are written first, followed by
+        `verified_root_cert` when one is given. Each file is named
+        `<position> – <first 20 characters of the serial number>.crt`.
+
+        Args:
+            tsp_response (TimeStampResponse): The decoded timestamp response.
+            verified_root_cert (x509.Certificate): The trusted root that verified the response,
+                or None (self-signed case), in which case no root is appended.
+
+        Returns:
+            list[Media]: One Media per saved file, in the order written, with its `subject`
+            property set to the certificate's Common Name, or to the full RFC 4514 subject
+            string when the certificate has no Common Name.
+        """
-        # returns the leaf certificate URL, fails if not set
-        tst = ContentInfo.load(signed)

-        trust_roots = []
-        with open(certifi.where(), "rb") as f:
-            for _, _, der_bytes in pem.unarmor(f.read(), multiple=True):
-                trust_roots.append(der_bytes)
-        context = ValidationContext(trust_roots=trust_roots)
+        certificates = self.tst_certs(tsp_response)

-        certificates = tst["content"]["certificates"]
-        first_cert = certificates[0].dump()
-        intermediate_certs = []
-        for i in range(1, len(certificates)):  # cannot use list comprehension [1:]
-            intermediate_certs.append(certificates[i].dump())
-
-        validator = CertificateValidator(first_cert, intermediate_certs=intermediate_certs, validation_context=context)
-        path = validator.validate_usage({"digital_signature"}, extended_key_usage={"time_stamping"})
+        if verified_root_cert:
+            # add the verified root certificate (if there is one - self signed certs will have None here)
+            certificates += [verified_root_cert]

        cert_chain = []
-        for cert in path:
-            cert_fn = os.path.join(self.tmp_dir, f"{str(cert.serial_number)[:20]}.crt")
+        for i, cert in enumerate(certificates):
+            cert_fn = os.path.join(self.tmp_dir, f"{i+1} – {str(cert.serial_number)[:20]}.crt")
            with open(cert_fn, "wb") as f:
-                f.write(cert.dump())
-            cert_chain.append(Media(filename=cert_fn).set("subject", cert.subject.native["common_name"]))
+                f.write(cert.public_bytes(encoding=serialization.Encoding.PEM))
+            # a certificate is not guaranteed to carry a Common Name; fall back to the whole subject
+            common_names = cert.subject.get_attributes_for_oid(x509.NameOID.COMMON_NAME)
+            subject = common_names[0].value if common_names else cert.subject.rfc4514_string()
+            cert_chain.append(Media(filename=cert_fn).set("subject", subject))

        return cert_chain
--- a/src/auto_archiver/modules/vk_extractor/init.py
+++ b/src/auto_archiver/modules/vk_extractor/init.py
@@ -1 +0,0 @@
-from .vk_extractor import VkExtractor
--- a/src/auto_archiver/modules/vk_extractor/manifest.py
+++ b/src/auto_archiver/modules/vk_extractor/manifest.py
@@ -1,37 +0,0 @@
-{
-    "name": "VKontakte Extractor",
-    "type": ["extractor"],
-    "requires_setup": True,
-    "depends": ["core", "utils"],
-    "dependencies": {
-        "python": ["loguru", "vk_url_scraper"],
-    },
-    "configs": {
-        "username": {"required": True, "help": "valid VKontakte username"},
-        "password": {"required": True, "help": "valid VKontakte password"},
-        "session_file": {
-            "default": "secrets/vk_config.v2.json",
-            "help": "valid VKontakte password",
-        },
-    },
-    "description": """
-The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages. 
-This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract 
-and download content. Note that VK videos are handled separately by the `YTDownloader`.
-
-### Features
- Extracts text, timestamps, and metadata from VK `/wall` posts.
- Downloads associated images and attaches them to the resulting `Metadata` object.
- Processes multiple segments of VK URLs that contain mixed content (e.g., wall, photo).
- Outputs structured metadata and media using `Metadata` and `Media` objects.
-
-### Setup
-To use the `VkArchiver`, you must provide valid VKontakte login credentials and session information:
- **Username**: A valid VKontakte account username.
- **Password**: The corresponding password for the VKontakte account.
- **Session File**: Optional. Path to a session configuration file (`.json`) for persistent VK login.
-
-Credentials can be set in the configuration file or directly via environment variables. Ensure you 
-have access to the VKontakte API by creating an account at [VKontakte](https://vk.com/).
-""",
-}
--- a/src/auto_archiver/modules/vk_extractor/vk_extractor.py
+++ b/src/auto_archiver/modules/vk_extractor/vk_extractor.py
@@ -1,43 +0,0 @@
-from loguru import logger
-from vk_url_scraper import VkScraper
-
-from auto_archiver.utils.misc import dump_payload
-from auto_archiver.core import Extractor
-from auto_archiver.core import Metadata, Media
-
-
-class VkExtractor(Extractor):
-    """ "
-    VK videos are handled by YTDownloader, this archiver gets posts text and images.
-    Currently only works for /wall posts
-    """
-
-    def setup(self) -> None:
-        self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
-
-    def download(self, item: Metadata) -> Metadata:
-        url = item.get_url()
-
-        if "vk.com" not in item.netloc:
-            return False
-
-        # some urls can contain multiple wall/photo/... parts and all will be fetched
-        vk_scrapes = self.vks.scrape(url)
-        if not len(vk_scrapes):
-            return False
-        logger.debug(f"VK: got {len(vk_scrapes)} scraped instances")
-
-        result = Metadata()
-        for scrape in vk_scrapes:
-            if not result.get_title():
-                result.set_title(scrape["text"])
-            if not result.get_timestamp():
-                result.set_timestamp(scrape["datetime"])
-
-        result.set_content(dump_payload(vk_scrapes))
-
-        filenames = self.vks.download_media(vk_scrapes, self.tmp_dir)
-        for filename in filenames:
-            result.add_media(Media(filename))
-
-        return result.success("vk")
--- a/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
@@ -40,27 +40,31 @@
    Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving.
    [Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format.

-    ## Setup
-
-    **Docker**
-    If you are using the Docker file to run Auto Archiver (recommended), then everything is set up and you can use WACZ out of the box!
-    Otherwise, if you are using a local install of Auto Archiver (e.g. pip or dev install), then you will need to install Docker and run 
-    the docker daemon to be able to run the `browsertrix-crawler` tool.
-
-    **Browsertrix Profiles**
-    A browsertrix profile is a custom browser profile (login information, browser extensions, etc.) that can be used to archive private or dynamic content.
-    You can run the WACZ Enricher without a profile, but for more resilient archiving, it is recommended to create a profile. See the [Browsertrix documentation](https://crawler.docs.browsertrix.com/user-guide/browser-profiles/)
-    for more information.
-
-    ** Docker in Docker **
-    If you are running Auto Archiver within a Docker container, you will need to enable Docker in Docker to run the `browsertrix-crawler` tool.
-    This can be done by setting the `WACZ_ENABLE_DOCKER` environment variable to `1`.
-
    ## Features
    - Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`.
    - Supports custom profiles for archiving private or dynamic content.
    - Extracts media (images, videos, audio) and screenshots from the archive, optionally adding them to the enrichment pipeline.
    - Generates metadata from the archived page's content and structure (e.g., titles, text).

+    ## Setup
+
+    ### Using Docker
+    If you are using the Auto Archiver [Docker image](https://auto-archiver.readthedocs.io/en/latest/installation/installation.html#installing-with-docker)
+    to run Auto Archiver (recommended), then everything is set up and you can use WACZ out of the box!
+    Otherwise, if you are using a local install of Auto Archiver (e.g. pip or dev install), then you will need to install Docker and run 
+    the docker daemon to be able to run the `browsertrix-crawler` tool.
+
+    ### Browsertrix Profiles
+    A browsertrix profile is a custom browser profile (login information, browser extensions, etc.) that can be used to archive private or dynamic content.
+    You can run the WACZ Enricher without a profile, but for more resilient archiving, it is recommended to create a profile.
+    See the [Browsertrix documentation](https://crawler.docs.browsertrix.com/user-guide/browser-profiles/) for more information on how to use the `create-login-profile` tool.
+
+
+
+    ### Docker in Docker
+    If you are running Auto Archiver within a Docker container, you will need to enable Docker in Docker to run the `browsertrix-crawler` tool.
+    This can be done by setting the `WACZ_ENABLE_DOCKER` environment variable to `1`.
+
+
    """,
 }
--- a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
@@ -86,6 +86,12 @@ class WaczExtractorEnricher(Enricher, Extractor):
        if self.docker_in_docker:
            cmd.extend(["--cwd", self.cwd_dind])

+        if self.auth_for_site(url):
+            # there's an auth for this site, but browsertrix only supports username/password auth
+            logger.warning(
+                "The WACZ enricher / Browsertrix does not support using the 'authentication' information for logging in. You should consider creating a Browser Profile for WACZ archiving. More information: https://auto-archiver.readthedocs.io/en/latest/modules/autogen/extractor/wacz_extractor_enricher.html#browsertrix-profiles"
+            )
+
        # call docker if explicitly enabled or we are running on the host (not in docker)
        if self.use_docker:
            logger.debug(f"generating WACZ in Docker for {url=}")
@@ -188,7 +194,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
                        shutil.copyfileobj(infile, outfile)

        # get media out of .warc
-        counter = 0
+        counter_warc_files = 0
+        counter_screenshots = 0
        seen_urls = set()

        with open(warc_filename, "rb") as warc_stream:
@@ -197,12 +204,12 @@ class WaczExtractorEnricher(Enricher, Extractor):
                if (
                    record.rec_type == "resource" and record.content_type == "image/png" and self.extract_screenshot
                ):  # screenshots
-                    fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
+                    fn = os.path.join(tmp_dir, f"warc-file-{counter_screenshots}.png")
                    with open(fn, "wb") as outf:
                        outf.write(record.raw_stream.read())
                    m = Media(filename=fn)
-                    to_enrich.add_media(m, "browsertrix-screenshot")
-                    counter += 1
+                    to_enrich.add_media(m, f"browsertrix-screenshot-{counter_screenshots}")
+                    counter_screenshots += 1
                if not self.extract_media:
                    continue

@@ -225,7 +232,7 @@ class WaczExtractorEnricher(Enricher, Extractor):

                # create local file and add media
                ext = mimetypes.guess_extension(content_type)
-                warc_fn = f"warc-file-{counter}{ext}"
+                # number media files with their own counter: the screenshot counter is not advanced
+                # for media records, so using it here would give every file of one type the same name
+                warc_fn = f"warc-file-{counter_warc_files}{ext}"
                fn = os.path.join(tmp_dir, warc_fn)

                record_url_best_qual = UrlUtil.twitter_best_quality_url(record_url)
@@ -250,6 +257,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
                    continue

                to_enrich.add_media(m, warc_fn)
-                counter += 1
+                counter_warc_files += 1
                seen_urls.add(record_url)
-        logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")
+        logger.info(
+            f"WACZ extract_media/extract_screenshot finished, found {counter_warc_files + counter_screenshots} relevant media file(s)"
+        )
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -10,14 +10,31 @@ from typing import Dict, Tuple
 import hashlib

 import pytest
-from auto_archiver.core.metadata import Metadata
+from auto_archiver.core.metadata import Metadata, Media
 from auto_archiver.core.module import ModuleFactory

 # Test names inserted into this list will be run last. This is useful for expensive/costly tests
 # that you only want to run if everything else succeeds (e.g. API calls). The order here is important
 # what comes first will be run first (at the end of all other tests not mentioned)
 # format is the name of the module (python file) without the .py extension
-TESTS_TO_RUN_LAST = ["test_twitter_api_archiver"]
+TESTS_TO_RUN_LAST = ["test_generic_archiver", "test_twitter_api_archiver"]
+
+
+# don't check for ytdlp updates in tests
+@pytest.fixture(autouse=True)
+def skip_check_for_update(mocker):
+    """Autouse fixture that patches `GenericExtractor.update_package` to return False."""
+    mocker.patch(
+        "auto_archiver.modules.generic_extractor.generic_extractor.GenericExtractor.update_package",
+        return_value=False,
+    )
+
+
+@pytest.fixture
+def get_lazy_module():
+    """Fixture returning a helper that looks up a module lazily, by name, on a new ModuleFactory."""
+
+    def _lookup(module_name):
+        factory = ModuleFactory()
+        return factory.get_module_lazy(module_name)
+
+    return _lookup


@pytest.fixture
@@ -134,12 +151,21 @@ def unpickle():

@pytest.fixture
 def mock_binary_dependencies(mocker):
+    mocker.patch("subprocess.run").return_value = mocker.Mock(returncode=0)
    mock_shutil_which = mocker.patch("shutil.which")
    # Mock all binary dependencies as available
    mock_shutil_which.return_value = "/usr/bin/fake_binary"
    return mock_shutil_which


+@pytest.fixture
+def sample_media(tmp_path) -> Media:
+    """Media object keyed `subdir/test.txt` whose file is a small text file under `tmp_path`."""
+    source_path = tmp_path / "source.txt"
+    source_path.write_text("test content")
+    media = Media(_key="subdir/test.txt", filename=str(source_path))
+    return media
+
+
@pytest.fixture
 def sample_datetime():
    return datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)
--- a/tests/data/test_modules/example_extractor/example_extractor.py
+++ b/tests/data/test_modules/example_extractor/example_extractor.py
@@ -1,6 +1,11 @@
 from auto_archiver.core import Extractor

+from loguru import logger
+

 class ExampleExtractor(Extractor):
    def download(self, item):
-        print("download")
+        logger.info("download")
+
+    def cleanup(self):
+        logger.info("cleanup")
--- a/tests/data/test_modules/example_module/example_module.py
+++ b/tests/data/test_modules/example_module/example_module.py
@@ -1,27 +1,29 @@
 from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata

+from loguru import logger
+

 class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
    def download(self, item):
-        print("download")
+        logger.info("download")

    def __iter__(self):
        yield Metadata().set_url("https://example.com")

    def done(self, result):
-        print("done")
+        logger.info("done")

    def enrich(self, to_enrich):
-        print("enrich")
+        logger.info("enrich")

    def get_cdn_url(self, media):
        return "nice_url"

    def save(self, item):
-        print("save")
+        logger.info("save")

    def uploadf(self, file, key, **kwargs):
-        print("uploadf")
+        logger.info("uploadf")

    def format(self, item):
-        print("format")
+        logger.info("format")
--- a/tests/data/timestamping/digicert.tsr
+++ b/tests/data/timestamping/digicert.tsr
--- a/tests/data/timestamping/rfc3161-client-issue-104.tsr
+++ b/tests/data/timestamping/rfc3161-client-issue-104.tsr
--- a/tests/data/timestamping/self_signed.tsr
+++ b/tests/data/timestamping/self_signed.tsr
--- a/tests/data/timestamping/timestamp_token_http-timestamp-identrust-com
+++ b/tests/data/timestamping/timestamp_token_http-timestamp-identrust-com
--- a/tests/data/timestamping/valid_timestamp.tsr
+++ b/tests/data/timestamping/valid_timestamp.tsr
--- a/tests/enrichers/test_timestamping_enricher.py
+++ b/tests/enrichers/test_timestamping_enricher.py
@@ -0,0 +1,215 @@
+from pathlib import Path
+import pytest
+
+from rfc3161_client import (
+    TimeStampResponse,
+    decode_timestamp_response,
+)
+import requests
+
+from auto_archiver.modules.timestamping_enricher.timestamping_enricher import TimestampingEnricher
+from auto_archiver.core import Metadata
+
+
+@pytest.fixture
+def timestamp_response() -> TimeStampResponse:
+    """Decoded `valid_timestamp.tsr` sample, located relative to this file rather than the working directory."""
+    tsr_path = Path(__file__).parent.parent / "data" / "timestamping" / "valid_timestamp.tsr"
+    return decode_timestamp_response(tsr_path.read_bytes())
+
+
+@pytest.fixture
+def wrong_order_timestamp_response() -> TimeStampResponse:
+    """Decoded `rfc3161-client-issue-104.tsr` sample, located relative to this file rather than the working directory."""
+    tsr_path = Path(__file__).parent.parent / "data" / "timestamping" / "rfc3161-client-issue-104.tsr"
+    return decode_timestamp_response(tsr_path.read_bytes())
+
+
+@pytest.fixture
+def selfsigned_response() -> TimeStampResponse:
+    """Decoded `self_signed.tsr` sample, located relative to this file rather than the working directory."""
+    tsr_path = Path(__file__).parent.parent / "data" / "timestamping" / "self_signed.tsr"
+    return decode_timestamp_response(tsr_path.read_bytes())
+
+
+@pytest.fixture
+def digicert_response() -> bytes:
+    """
+    Raw (undecoded) bytes of the `digicert.tsr` sample, located relative to this file
+    rather than the working directory.
+    """
+    tsr_path = Path(__file__).parent.parent / "data" / "timestamping" / "digicert.tsr"
+    return tsr_path.read_bytes()
+
+
+@pytest.fixture
+def filehash():
+    """Fixed hex digest string used as the `hash` value of the sample media in these tests."""
+    return "4b7b4e39f12b8c725e6e603e6d4422500316df94211070682ef10260ff5759ef"
+
+
+@pytest.mark.download
+def test_enriching(setup_module, sample_media):
+    """
+    Runs `enrich` with the module's default configuration (marked `download`) and checks
+    that the item ends up timestamped.
+    """
+    tsp: TimestampingEnricher = setup_module("timestamping_enricher")
+
+    # uses the TSAs set as default in the __manifest__ to make sure timestamping still works with them
+
+    # test the enrich method
+    metadata = Metadata().set_url("https://example.com")
+    sample_media.set("hash", "4b7b4e39f12b8c725e6e603e6d4422500316df94211070682ef10260ff5759ef")
+    metadata.add_media(sample_media)
+    tsp.enrich(metadata)
+
+    # `enrich` only logs TSA failures, so without assertions this test could never fail
+    assert metadata.get("timestamped") is True
+    assert len(metadata.media) == 2  # the original 'sample_media' and the timestamped hashes file
+
+
+def test_full_enriching_selfsigned(setup_module, sample_media, mocker, selfsigned_response, filehash):
+    """A self-signed response adds no media by default and adds one once `allow_selfsigned` is set."""
+    mocker.patch("requests.sessions.Session.post").return_value.status_code = 200
+    mocker.patch(
+        "auto_archiver.modules.timestamping_enricher.timestamping_enricher.decode_timestamp_response",
+        return_value=selfsigned_response,
+    )
+
+    enricher: TimestampingEnricher = setup_module(
+        "timestamping_enricher", {"tsa_urls": ["http://timestamp.identrust.com"]}
+    )
+    item = Metadata().set_url("https://example.com")
+    sample_media.set("hash", filehash)
+    item.add_media(sample_media)
+
+    # first pass: self-signed certificates are not allowed
+    enricher.enrich(item)
+    assert len(item.media) == 1
+
+    # second pass: same enricher, self-signed certificates allowed
+    enricher.allow_selfsigned = True
+    enricher.enrich(item)
+    assert len(item.media) == 2
+
+
+def test_full_enriching(setup_module, sample_media, mocker, timestamp_response, filehash):
+    """Full `enrich` run against one mocked TSA: checks the hashes file, the token and the saved chain."""
+    mocker.patch("requests.sessions.Session.post").return_value.status_code = 200
+    mocker.patch(
+        "auto_archiver.modules.timestamping_enricher.timestamping_enricher.decode_timestamp_response",
+        return_value=timestamp_response,
+    )
+
+    enricher: TimestampingEnricher = setup_module(
+        "timestamping_enricher", {"tsa_urls": ["http://timestamp.identrust.com"]}
+    )
+    item = Metadata().set_url("https://example.com")
+    sample_media.set("hash", filehash)
+    item.add_media(sample_media)
+    enricher.enrich(item)
+
+    assert item.get("timestamped") is True
+    assert len(item.media) == 2  # the original 'sample_media' and the new 'timestamp_media'
+
+    hashes_media = item.media[1]
+    assert hashes_media.filename == f"{enricher.tmp_dir}/hashes.txt"
+    assert Path(hashes_media.filename).read_text() == filehash
+
+    # we only have one authority file because we only used one TSA
+    authority_files = hashes_media.get("timestamp_authority_files")
+    assert len(authority_files) == 1
+    token_media = hashes_media.get("timestamp_authority_files")[0]
+    assert Path(token_media.filename).read_bytes() == timestamp_response.time_stamp_token()
+
+    expected_names = [
+        "1 – 85078758028491331763.crt",
+        "2 – 85078371663472981624.crt",
+        "3 – 13298821034946342390.crt",
+    ]
+    saved_chain = token_media.get("cert_chain")
+    assert len(saved_chain) == 3
+    assert [saved.filename for saved in saved_chain] == [f"{enricher.tmp_dir}/{name}" for name in expected_names]
+
+
+def test_full_enriching_multiple_tsa(setup_module, sample_media, mocker, timestamp_response, filehash):
+    """With two mocked TSA URLs, `enrich` stores one token (with a 3-certificate chain) per TSA."""
+    mocker.patch("requests.sessions.Session.post").return_value.status_code = 200
+    mocker.patch(
+        "auto_archiver.modules.timestamping_enricher.timestamping_enricher.decode_timestamp_response",
+        return_value=timestamp_response,
+    )
+
+    tsa_urls = ["http://example.com/timestamp1", "http://example.com/timestamp2"]
+    enricher: TimestampingEnricher = setup_module("timestamping_enricher", {"tsa_urls": tsa_urls})
+    item = Metadata().set_url("https://example.com")
+    sample_media.set("hash", filehash)
+    item.add_media(sample_media)
+    enricher.enrich(item)
+
+    assert item.get("timestamped") is True
+    assert len(item.media) == 2  # the original 'sample_media' and the new 'timestamp_media'
+
+    hashes_media = item.media[1]
+    assert len(hashes_media.get("timestamp_authority_files")) == 2
+    expected_token = timestamp_response.time_stamp_token
+    for token_media in hashes_media.get("timestamp_authority_files"):
+        assert Path(token_media.filename).read_bytes() == expected_token()
+        assert len(token_media.get("cert_chain")) == 3
+
+
+def test_fails_for_digicert(setup_module, mocker, digicert_response):
+    """
+    Digicert TSRs are not compliant with RFC 3161.
+    See https://github.com/trailofbits/rfc3161-client/issues/104#issuecomment-2621960840
+    """
+    mocker.patch("requests.sessions.Session.post", return_value=requests.Response())
+    mocker.patch("requests.Response.raise_for_status")
+    mocker.patch("requests.Response.content", new_callable=mocker.PropertyMock, return_value=digicert_response)
+
+    enricher: TimestampingEnricher = setup_module("timestamping_enricher")
+    payload = b"4b7b4e39f12b8c725e6e603e6d4422500316df94211070682ef10260ff5759ef"
+
+    # decoding the mocked response content is expected to fail inside `sign_data`
+    with pytest.raises(ValueError) as excinfo:
+        enricher.sign_data("http://timestamp.digicert.com", payload)
+    assert "ASN.1 parse error: ParseError" in str(excinfo.value)
+
+
+@pytest.mark.download
+def test_download_tsr(setup_module):
+    """Marked `download`: requests a timestamp from the IdenTrust TSA, verifies it and saves its chain."""
+    enricher: TimestampingEnricher = setup_module("timestamping_enricher")
+    payload = b"4b7b4e39f12b8c725e6e603e6d4422500316df94211070682ef10260ff5759ef"
+
+    response: TimeStampResponse = enricher.sign_data("http://timestamp.identrust.com", payload)
+    assert isinstance(response, TimeStampResponse)
+
+    root_cert = enricher.verify_signed(response, payload)
+    assert root_cert.subject.rfc4514_string() == "CN=IdenTrust Commercial Root CA 1,O=IdenTrust,C=US"
+
+    # test downloading the cert
+    saved_chain = enricher.save_certificate(response, root_cert)
+    assert len(saved_chain) == 3
+
+
+def test_verify_save(setup_module, timestamp_response):
+    """Verifies the stored sample response and checks the file names of the saved certificate chain."""
+    enricher: TimestampingEnricher = setup_module("timestamping_enricher")
+    payload = b"4b7b4e39f12b8c725e6e603e6d4422500316df94211070682ef10260ff5759ef"
+
+    root_cert = enricher.verify_signed(timestamp_response, payload)
+    assert root_cert.subject.rfc4514_string() == "CN=IdenTrust Commercial Root CA 1,O=IdenTrust,C=US"
+
+    saved_chain = enricher.save_certificate(timestamp_response, root_cert)
+    assert len(saved_chain) == 3
+
+    expected_names = [
+        "1 – 85078758028491331763.crt",
+        "2 – 85078371663472981624.crt",
+        "3 – 13298821034946342390.crt",
+    ]
+    assert [saved.filename for saved in saved_chain] == [f"{enricher.tmp_dir}/{name}" for name in expected_names]
+
+
+def test_order_crt_correctly(setup_module, wrong_order_timestamp_response):
+    """`tst_certs` returns the two certificates of the issue-104 sample in the expected order."""
+    # reference: https://github.com/trailofbits/rfc3161-client/issues/104#issuecomment-2711244010
+    enricher: TimestampingEnricher = setup_module("timestamping_enricher")
+
+    # get the certificates, make sure the reordering is working:
+    subjects = [cert.subject.rfc4514_string() for cert in enricher.tst_certs(wrong_order_timestamp_response)]
+    assert subjects == [
+        "CN=TrustID Timestamp Authority,O=IdenTrust,C=US",
+        "CN=TrustID Timestamping CA 3,O=IdenTrust,C=US",
+    ]
+
+
+def test_invalid_tsa_invalid_response(setup_module, mocker):
+    """An HTTPError raised by `raise_for_status` propagates out of `sign_data`."""
+    mocker.patch("requests.sessions.Session.post", return_value=requests.Response())
+    mocker.patch(
+        "requests.Response.raise_for_status",
+        side_effect=requests.exceptions.HTTPError("404 Client Error"),
+    )
+    enricher = setup_module("timestamping_enricher")
+
+    with pytest.raises(requests.exceptions.HTTPError, match="404 Client Error"):
+        enricher.sign_data("http://bellingcat.com/page-not-found/", b"my-message")
+
+
+def test_fail_on_selfsigned_cert(setup_module, selfsigned_response):
+    """`verify_signed` returns None for the self-signed sample response."""
+    enricher = setup_module("timestamping_enricher")
+    assert enricher.verify_signed(selfsigned_response, b"my-message") is None
--- a/tests/enrichers/test_wacz_enricher.py
+++ b/tests/enrichers/test_wacz_enricher.py
@@ -119,4 +119,4 @@ def test_extract_media(wacz_enricher, metadata, tmp_path, mocker) -> None:
    metadata.add_media(Media("something.wacz"), "browsertrix")
    wacz_enricher.extract_media_from_wacz(metadata, str(wacz_file))
    assert len(metadata.media) == 2
-    assert metadata.media[1].properties.get("id") == "browsertrix-screenshot"
+    assert metadata.media[1].properties.get("id") == "browsertrix-screenshot-0"
--- a/tests/extractors/test_extractor_base.py
+++ b/tests/extractors/test_extractor_base.py
@@ -25,5 +25,5 @@ class TestExtractorBase(object):
        else:
            assert status == test_response.status

-        assert title == test_response.get_title()
-        assert timestamp, test_response.get("timestamp")
+        assert title in test_response.get_title()
+        assert timestamp == test_response.get("timestamp")
--- a/tests/extractors/test_generic_extractor.py
+++ b/tests/extractors/test_generic_extractor.py
@@ -29,6 +29,7 @@ class TestGenericExtractor(TestExtractorBase):
        "proxy": None,
        "cookies_from_browser": False,
        "cookie_file": None,
+        "pot_provider": False,
    }

    def test_load_dropin(self):
@@ -36,7 +37,7 @@ class TestGenericExtractor(TestExtractorBase):
        package = "auto_archiver.modules.generic_extractor"
        assert self.extractor.dropin_for_name("bluesky", package=package)

-        # test loading dropings via filepath
+        # test loading dropins via filepath
        path = os.path.join(dirname(dirname(__file__)), "data/")
        assert self.extractor.dropin_for_name("dropin", additional_paths=[path])

@@ -121,7 +122,7 @@ class TestGenericExtractor(TestExtractorBase):
            == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
        )
        assert len(result.media) == 2
-        assert Path(result.media[0].filename).name == "J---aiyznGQ.webm"
+        assert "J---aiyznGQ" in Path(result.media[0].filename).name
        assert Path(result.media[1].filename).name == "hqdefault.jpg"

    @pytest.mark.download
@@ -218,7 +219,7 @@ class TestGenericExtractor(TestExtractorBase):
        post = self.extractor.download(make_item(url))
        self.assertValidResponseMetadata(
            post,
-            "Bellingcat - This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services",
+            "Bellingcat - This month's Bellingchat Premium is with @KolinaKoltai",
            datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc),
        )

@@ -291,3 +292,42 @@ class TestGenericExtractor(TestExtractorBase):
        post = self.extractor.download(make_item(url))
        assert "Bellingcat researcher Kolina Koltai delves deeper into Clothoff" in post.get("content")
        assert post.get_title() == "Bellingcat"
+
+
+class TestGenericExtractorPoToken:
+    @pytest.fixture
+    def extractor(self, mocker):
+        extractor = GenericExtractor()
+        extractor.extractor_args = {}
+        extractor.setup_token_generation_script = mocker.Mock()
+        return extractor
+
+    def test_po_token_disabled_does_not_call_setup(self, extractor):
+        extractor.bguils_po_token_method = "disabled"
+        extractor.in_docker = True
+        extractor.setup_po_tokens()
+        extractor.setup_token_generation_script.assert_not_called()
+
+    def test_po_token_default_in_docker_calls_setup(self, extractor, mocker):
+        extractor.bguils_po_token_method = "auto"
+        mocker.patch.dict(os.environ, {"RUNNING_IN_DOCKER": "1"})
+        extractor.setup_po_tokens()
+        extractor.setup_token_generation_script.assert_called_once()
+
+    def test_po_token_default_local_does_not_call_setup(self, extractor, caplog, mocker):
+        extractor.bguils_po_token_method = "auto"
+        # clears env vars for this test
+        mocker.patch.dict(os.environ, {}, clear=True)
+        extractor.setup_po_tokens()
+        extractor.setup_token_generation_script.assert_not_called()
+        assert "Proof of Origin Token method not explicitly set" in caplog.text
+
+    def test_po_token_script_always_calls_setup(self, extractor):
+        extractor.bguils_po_token_method = "script"
+        extractor.in_docker = False
+        extractor.setup_po_tokens()
+        extractor.setup_token_generation_script.assert_called_once()
+        extractor.setup_token_generation_script.reset_mock()
+        extractor.in_docker = True
+        extractor.setup_po_tokens()
+        extractor.setup_token_generation_script.assert_called_once()
--- a/tests/extractors/test_instagram_tbot_extractor.py
+++ b/tests/extractors/test_instagram_tbot_extractor.py
@@ -68,6 +68,12 @@ def test_download_invalid(extractor, metadata_sample, mocker):
    assert extractor.download(metadata_sample) is False


+def test_fails_with_empty_response(extractor, metadata_sample, mocker):
+    mocker.patch.object(extractor, "_send_url_to_bot", return_value=(mocker.MagicMock(), 101))
+    mocker.patch.object(extractor, "_process_messages", return_value="")
+    assert extractor.download(metadata_sample) is False
+
+
@pytest.mark.skip(reason="Requires authentication.")
 class TestInstagramTbotExtractorReal(TestExtractorBase):
    # To run these tests set the TELEGRAM_API_ID and TELEGRAM_API_HASH environment variables, and ensure the session file exists.
--- a/tests/extractors/test_telethon_extractor.py
+++ b/tests/extractors/test_telethon_extractor.py
@@ -0,0 +1,26 @@
+import os
+from datetime import date
+
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def mock_client_setup(mocker):
+    mocker.patch("telethon.client.auth.AuthMethods.start")
+
+
+def test_setup_fails_clear_session_file(get_lazy_module, tmp_path, mocker):
+    start = mocker.patch("telethon.client.auth.AuthMethods.start")
+    start.side_effect = Exception("Test exception")
+
+    # make sure the default setup file is created
+    session_file = tmp_path / "test.session"
+
+    lazy_module = get_lazy_module("telethon_extractor")
+
+    with pytest.raises(Exception):
+        lazy_module.load({"telethon_extractor": {"session_file": str(session_file), "api_id": 123, "api_hash": "ABC"}})
+
+    assert session_file.exists()
+    assert f"telethon-{date.today().strftime('%Y-%m-%d')}" in lazy_module._instance.session_file
+    assert os.path.exists(lazy_module._instance.session_file + ".session")
--- a/tests/extractors/test_vk_extractor.py
+++ b/tests/extractors/test_vk_extractor.py
@@ -1,77 +0,0 @@
-import pytest
-
-from auto_archiver.core import Metadata
-from auto_archiver.modules.vk_extractor import VkExtractor
-
-
-@pytest.fixture
-def mock_vk_scraper(mocker):
-    """Fixture to mock VkScraper."""
-    return mocker.patch("auto_archiver.modules.vk_extractor.vk_extractor.VkScraper")
-
-
-@pytest.fixture
-def vk_extractor(setup_module, mock_vk_scraper) -> VkExtractor:
-    """Fixture to initialize VkExtractor with mocked VkScraper."""
-    extractor_module = "vk_extractor"
-    configs = {
-        "username": "name",
-        "password": "password123",
-        "session_file": "secrets/vk_config.v2.json",
-    }
-    vk = setup_module(extractor_module, configs)
-    vk.vks = mock_vk_scraper.return_value
-    return vk
-
-
-def test_netloc(vk_extractor, metadata):
-    # metadata url set as: "https://example.com/"
-    assert vk_extractor.download(metadata) is False
-
-
-def test_vk_url_but_scrape_returns_empty(vk_extractor, metadata):
-    metadata.set_url("https://vk.com/valid-wall")
-    vk_extractor.vks.scrape.return_value = []
-    assert vk_extractor.download(metadata) is False
-    assert metadata.netloc == "vk.com"
-    vk_extractor.vks.scrape.assert_called_once_with(metadata.get_url())
-
-
-def test_successful_scrape_and_download(vk_extractor, metadata, mocker):
-    mock_scrapes = [
-        {"text": "Post Title", "datetime": "2023-01-01T00:00:00", "id": 1},
-        {"text": "Another Post", "datetime": "2023-01-02T00:00:00", "id": 2},
-    ]
-    mock_filenames = ["image1.jpg", "image2.png"]
-    vk_extractor.vks.scrape.return_value = mock_scrapes
-    vk_extractor.vks.download_media.return_value = mock_filenames
-    metadata.set_url("https://vk.com/valid-wall")
-    result = vk_extractor.download(metadata)
-    # Test metadata
-    assert result.is_success()
-    assert result.status == "vk: success"
-    assert result.get_title() == "Post Title"
-    assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
-    assert "Another Post" in result.metadata["content"]
-    # Test Media objects
-    assert len(result.media) == 2
-    assert result.media[0].filename == "image1.jpg"
-    assert result.media[1].filename == "image2.png"
-    vk_extractor.vks.download_media.assert_called_once_with(mock_scrapes, vk_extractor.tmp_dir)
-
-
-def test_adds_first_title_and_timestamp(vk_extractor):
-    metadata = Metadata().set_url("https://vk.com/no-metadata")
-    metadata.set_url("https://vk.com/no-metadata")
-    mock_scrapes = [
-        {"text": "value", "datetime": "2023-01-01T00:00:00"},
-        {"text": "value2", "datetime": "2023-01-02T00:00:00"},
-    ]
-    vk_extractor.vks.scrape.return_value = mock_scrapes
-    vk_extractor.vks.download_media.return_value = []
-    result = vk_extractor.download(metadata)
-
-    assert result.get_title() == "value"
-    # formatted timestamp
-    assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
-    assert result.is_success()
--- a/tests/test_orchestrator.py
+++ b/tests/test_orchestrator.py
@@ -237,3 +237,23 @@ def test_wrong_step_type(test_args, caplog):
    with pytest.raises(SetupError) as err:
        orchestrator.setup(args)
        assert "Module 'example_extractor' is not a feeder" in str(err.value)
+
+
+def test_load_failed_extractor_cleanup(test_args, mocker, caplog):
+    orchestrator = ArchivingOrchestrator()
+
+    # hack to set up the paths so we can patch properly
+    orchestrator.module_factory.setup_paths([TEST_MODULES])
+
+    # patch ExampleExtractor.setup to throw an exception
+    mocker.patch(
+        "auto_archiver.modules.example_extractor.example_extractor.ExampleExtractor.setup",
+        side_effect=Exception("Test exception"),
+    )
+
+    with pytest.raises(Exception):
+        orchestrator.setup(test_args + ["--extractors", "example_extractor"])
+
+    assert "Error during setup of modules: Test exception" in caplog.text
+    # make sure the 'cleanup' is called
+    assert "cleanup" in caplog.text
Author	SHA1	Message	Date
Miguel Sozinho Ramalho	6735fa890b	v1.0.1 dependency updates, generic extractor improvements (#307 ) * wacz: allow exceptional cases where more than one resource image is available * improves generic extractor edge-cases and yt-dlp updates * REMOVES vk_extractor until further notice * bumps browsertrix in docker image * npm version bump on scripts/settings * poetry updates * Changed log level on gsheet_feeder_db started from warning to info (#301) * closes 305 and further fixes finding local downloads from uncommon ytdlp extractors * use ffmpeg -bitexact to reduce duplicate content storing * formatting * adds yt-dlp curl-cffi * version bump * linting --------- Co-authored-by: Dave Mateer <davemateer@gmail.com>	2025-06-02 20:57:12 +01:00
Dave Mateer	48be13fb2a	catch for if self.comments are true but no actual comments in video (#303 ) * catch for if self.comments are true but no actual comments in video * simplifies check code --------- Co-authored-by: Miguel Sozinho Ramalho <19508417+msramalho@users.noreply.github.com>	2025-06-02 13:02:19 +01:00
msramalho	e6fdef66df	improves instructions on docker setup with an example URL	2025-04-28 11:16:01 +01:00
Miguel Sozinho Ramalho	33cacd145f	Update tests-download.yaml to shift ownership of notifications	2025-04-07 21:15:18 +01:00
Miguel Sozinho Ramalho	0f69b5fe0c	update repo badges	2025-03-31 16:19:29 +01:00
Erin Clark	ad2e8397b2	Merge pull request #287 from bellingcat/fix/insta_tbot_empty Only return success for instagram_tbot_extractor.py with content.	2025-03-31 14:31:46 +01:00
erinhmclark	144adaad5b	Only return success for instagram_tbot_extractor.py with content.	2025-03-31 14:14:36 +01:00
Erin Clark	c7c7eb00a1	Merge pull request #286 from bellingcat/version_comparison Small code fixes and GH Actions cache	2025-03-31 12:40:42 +01:00
erinhmclark	7e4ba62918	Small code change	2025-03-31 12:05:39 +01:00
erinhmclark	9c2b506189	update runner os to matrix os.	2025-03-31 12:00:24 +01:00
erinhmclark	8940580638	Add poetry cache clear, and small code change	2025-03-31 11:41:26 +01:00
erinhmclark	c2821d7c83	Fix poetry install deletion	2025-03-31 11:25:51 +01:00
erinhmclark	a590647279	Small code tidy to trigger tests.	2025-03-31 11:23:49 +01:00
erinhmclark	1edfdae03e	Update download tests to match cache process.	2025-03-31 11:17:40 +01:00
erinhmclark	6c7f6af4b4	Add cache action with key to OS, py version and lockfile hash, and install packages from source.	2025-03-31 11:11:56 +01:00
Erin Clark	8685b6bf13	Merge pull request #285 from bellingcat/fix-ubuntu-22 [WIP] Change order of poetry install - in case this fixes core tests	2025-03-28 15:38:03 +00:00
Patrick Robertson	0ce7f5a1b5	Disable caching	2025-03-28 18:40:02 +04:00
Patrick Robertson	85d3f2fa02	Revert changes	2025-03-28 18:36:11 +04:00
Patrick Robertson	fd540bd03a	Code change to trigger tests	2025-03-28 18:29:59 +04:00
Patrick Robertson	86f328515c	Use cache key that includes os version	2025-03-28 18:29:52 +04:00
erinhmclark	68992025b0	Update version comparison.	2025-03-28 14:29:44 +00:00
Patrick Robertson	6544934825	Merge pull request #283 from bellingcat/1.0-release v1.0.0 release 🎉	2025-03-28 18:06:59 +04:00
Patrick Robertson	197599b406	Merge pull request #284 from bellingcat/revert-downloads-test Revert downloads CI tests changes	2025-03-28 18:06:49 +04:00
Erin Clark	96efdcbba1	Merge pull request #281 from bellingcat/add_inst_api_script Add InstagrAPI server script to be used with the Instagram API Extractor.	2025-03-28 13:58:37 +00:00
Patrick Robertson	2ec494b4b9	Revert downloads CI tests changes It wasn't properly being triggered after the core tests. this reverts so that the download tests just run whatever	2025-03-28 17:55:58 +04:00
Erin Clark	1d18399d70	Merge pull request #222 from bellingcat/feat/yt-dlp-pots yt-dlp proposed extractor_args and PO Token client.	2025-03-28 13:54:27 +00:00
Patrick Robertson	3550a009e6	v1.0.0 release 🎉	2025-03-28 13:53:29 +00:00
erinhmclark	dd7d85b4b4	Lock	2025-03-28 13:47:18 +00:00
erinhmclark	c510c04643	Update config reference in test_generic_extractor.py	2025-03-28 13:43:46 +00:00
erinhmclark	a0d955fe84	lock	2025-03-28 13:39:58 +00:00
erinhmclark	5e7c57650b	Update "default" to "auto" for clarity, update docs	2025-03-28 13:16:16 +00:00
erinhmclark	1db7d6702d	Update the documentation	2025-03-28 12:27:18 +00:00
erinhmclark	b1a8792f9f	Remove duplicate line	2025-03-28 11:44:37 +00:00
erinhmclark	f715100dd5	Add run_instagrapi_server.sh and update docs	2025-03-28 11:31:23 +00:00
erinhmclark	dbcf19d1b8	Update update path reference	2025-03-28 10:55:21 +00:00
erinhmclark	0840b7283c	Format	2025-03-28 10:43:00 +00:00
erinhmclark	b5dc1854a2	Merge branch 'main' into feat/yt-dlp-pots	2025-03-28 10:42:24 +00:00
erinhmclark	efab0f9a91	Add test	2025-03-28 10:37:22 +00:00
erinhmclark	bc35116975	Update poetry.lock	2025-03-28 10:37:13 +00:00
Patrick Robertson	25f1f5dc93	Merge pull request #279 from bellingcat/telethon_tweaks Fix calling extractor.cleanup (fixes telethon issue) + tidy up telethon extractor session file naming	2025-03-28 14:13:26 +04:00
erinhmclark	f99dcc63a1	Minor updates	2025-03-28 09:46:44 +00:00
Patrick Robertson	48fbfc3b86	Merge pull request #280 from bellingcat/download-tests Download tests	2025-03-28 13:33:30 +04:00
Erin Clark	e7aae76ffe	Merge pull request #271 from bellingcat/dependabot/github_actions/actions-7fa5136294 Bump the actions group with 3 updates	2025-03-28 09:33:25 +00:00
erinhmclark	1466700b45	Small updates to docs, poetry.lock	2025-03-28 08:23:10 +00:00
erinhmclark	00b29db390	Update documentation for instagrapi api	2025-03-28 00:41:35 +00:00
erinhmclark	2a0dfaead2	Add instagrapi server scripts	2025-03-28 00:41:05 +00:00
Patrick Robertson	a448e2532c	Code tweak for clarity	2025-03-27 15:20:52 +04:00
Patrick Robertson	46a51cce11	Fix up tests-download to properly run once core tests completed	2025-03-27 15:18:58 +04:00
Patrick Robertson	b7949a489f	Simplify telethon unit tests for CI (don't use TestExtractorBase - it causes loading issues)	2025-03-26 23:51:21 +04:00
Patrick Robertson	e0e9f93065	Skip update checks for ytdlp when running tests	2025-03-26 23:41:20 +04:00
Patrick Robertson	e06b0c0585	Skip checking if docker is running for tests + more graceful test for filename	2025-03-26 23:03:48 +04:00
Patrick Robertson	95ea9fb231	Telethon unit tests + tidyup	2025-03-26 22:53:27 +04:00
Patrick Robertson	17d2d14680	Fix running 'cleanup' method on extractors that fail to start	2025-03-26 22:52:52 +04:00
erinhmclark	f54b5c5f18	Update poetry.lock	2025-03-26 18:05:04 +00:00
erinhmclark	456b2746c8	Update the docs	2025-03-26 18:01:20 +00:00
erinhmclark	2cad5edea8	Fix default config	2025-03-26 17:33:00 +00:00
Patrick Robertson	580de88366	Set the new session filename before copying Fixes a potential bug whereby if the copy fails for some reason, the 'cleanup' command would remove the original session file	2025-03-26 21:32:23 +04:00
erinhmclark	093ce34a6a	Ruff format.	2025-03-26 17:02:20 +00:00
erinhmclark	7872d9356c	Merge branch 'main' into feat/yt-dlp-pots	2025-03-26 17:00:38 +00:00
erinhmclark	23e7dd0995	Remove old implementation	2025-03-26 17:00:31 +00:00
erinhmclark	565275ac37	Basic documentation for POT process	2025-03-26 16:59:01 +00:00
erinhmclark	4a02407659	Typo fix.	2025-03-26 16:46:21 +00:00
erinhmclark	ae523eb06f	Update PO token generation script method	2025-03-26 16:45:29 +00:00
erinhmclark	d87c0dc3a9	Implement update for pot plugin.	2025-03-26 16:02:29 +00:00
dependabot[bot]	1612fef59b	Bump the actions group with 3 updates Bumps the actions group with 3 updates: [actions/checkout](https://github.com/actions/checkout), [docker/login-action](https://github.com/docker/login-action) and [docker/metadata-action](https://github.com/docker/metadata-action). Updates `actions/checkout` from 3 to 4 - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v3...v4) Updates `docker/login-action` from 3.3.0 to 3.4.0 - [Release notes](https://github.com/docker/login-action/releases) - [Commits](`9780b0c442...74a5d14239`) Updates `docker/metadata-action` from 5.6.1 to 5.7.0 - [Release notes](https://github.com/docker/metadata-action/releases) - [Commits](`369eb591f4...902fa8ec7d`) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major dependency-group: actions - dependency-name: docker/login-action dependency-type: direct:production update-type: version-update:semver-minor dependency-group: actions - dependency-name: docker/metadata-action dependency-type: direct:production update-type: version-update:semver-minor dependency-group: actions ... Signed-off-by: dependabot[bot] <support@github.com>	2025-03-26 12:09:25 +00:00
Patrick Robertson	fbf51f61b9	Merge pull request #276 from bellingcat/actions_updates Only run 'download' actions once core completes + re-add `ubuntu-latest` to matrix	2025-03-26 12:08:13 +00:00
Patrick Robertson	a9ff55a36e	Merge pull request #278 from bellingcat/dependabot_fix This force-pins cryptography to >44.0.1 to fix dependabot warning	2025-03-26 11:57:35 +00:00
Patrick Robertson	20bc80b9ef	Slightly more consistent/tidier naming for the session files Don't add/remove .session from name, keep the file name without .session at all times	2025-03-26 15:57:11 +04:00
Patrick Robertson	5bb0cbf3ff	Lock poetry file	2025-03-26 15:43:03 +04:00
Patrick Robertson	3eb9ffddfe	This force-pins cryptography to >44.0.1 to fix dependabot warning pyOpenSSL also no longer needed	2025-03-26 15:39:53 +04:00
Patrick Robertson	76e90dd23a	Small code tidy ups	2025-03-26 15:34:33 +04:00
Patrick Robertson	0450d3fcb9	Merge branch 'main' into actions_updates	2025-03-26 15:29:38 +04:00
Patrick Robertson	e9ee4d67ba	Re-add 'ubuntu-latest' - now that we're rid of tsp_client	2025-03-26 15:29:36 +04:00
Patrick Robertson	43a80dbcda	Merge pull request #224 from bellingcat/timestamping_rewrite Timestamping rewrite	2025-03-26 11:25:55 +00:00
Patrick Robertson	cb3ae055d6	Also remove certvalidator from poetry/project	2025-03-26 15:11:25 +04:00
Patrick Robertson	4cfa6455c7	Only make the downloads action run if the core action was successful	2025-03-26 15:07:57 +04:00
Patrick Robertson	0073a08525	Update manifest dependencies to remove tsp_client et al.	2025-03-26 14:57:55 +04:00
Patrick Robertson	46e31808f6	Version bump	2025-03-26 14:54:33 +04:00
Patrick Robertson	4af23e13d1	Bump rfc3161-client to 1.0.1	2025-03-26 14:50:12 +04:00
Patrick Robertson	d6be1ff84f	Merge branch 'main' into timestamping_rewrite	2025-03-26 14:37:51 +04:00
erinhmclark	633290a9cc	Update for pot providers list	2025-03-25 18:27:06 +00:00
erinhmclark	040a864d5c	Merge branch 'refs/heads/main' into feat/yt-dlp-pots # Conflicts: # poetry.lock	2025-03-25 18:26:43 +00:00
erinhmclark	b4c33318c4	Merge branch 'main' into feat/yt-dlp-pots # Conflicts: # src/auto_archiver/modules/generic_extractor/__manifest__.py # tests/test_modules.py	2025-03-25 15:16:31 +00:00
Patrick Robertson	74974ef0ed	Merge pull request #268 from bellingcat/minor_improvements Minor improvements	2025-03-25 12:52:08 +00:00
Patrick Robertson	5c6005d843	Merge pull request #269 from bellingcat/update-dependabot Add explicit dependabots for pip/poetry, GH actions and npm	2025-03-25 06:30:24 +00:00
Patrick Robertson	d6a7f31248	Add note that authentication only works for some modules	2025-03-24 18:28:35 +04:00
Patrick Robertson	8aba663534	Update node module versions	2025-03-24 18:28:30 +04:00
Patrick Robertson	ace97ac7fd	Don't run ruff on non-python file changes	2025-03-24 18:00:14 +04:00
Patrick Robertson	ad373ae733	Add explicit dependabots for pip/poetry, GH actions and npm	2025-03-24 17:57:53 +04:00
Patrick Robertson	260e76dd3d	Update dependencies	2025-03-24 17:48:25 +04:00
Patrick Robertson	a9fe959ea1	Fix unit tests for latest yt-dlp (Yt-dlp title is now truncated)	2025-03-24 17:48:15 +04:00
Patrick Robertson	beb7f3893d	Add comments/notes to WACZ enricher about browser profiles	2025-03-24 17:39:47 +04:00
Patrick Robertson	5055402c2a	Bump browsertrix version	2025-03-24 17:39:44 +04:00
Patrick Robertson	3c4625d708	Further ruff tweaks	2025-03-24 16:39:59 +04:00
Patrick Robertson	31fa7380f5	Fix up unit tests + issue when working with self-signed certs	2025-03-24 16:00:40 +04:00
Patrick Robertson	396ec03bae	Tidy up unit tests further + make more non-download	2025-03-24 15:26:22 +04:00
Patrick Robertson	e811196711	Ruff fixes	2025-03-24 15:10:46 +04:00
Patrick Robertson	dfde6f1995	Merge main into timestamping_enricher	2025-03-24 15:09:29 +04:00
Miguel Sozinho Ramalho	7b454baa02	Create dependabot.yml	2025-03-24 10:49:36 +00:00
Patrick Robertson	0f9c6a9a5c	Update yt-dlp to latest	2025-03-24 14:49:18 +04:00
Patrick Robertson	c980500978	Actually restart AA after updating yt-dlp. A simple 'importlib.reload()' doesn't take into account all imports	2025-03-24 14:33:59 +04:00
erinhmclark	93921e71d4	Clarify comments in pot scripts.	2025-03-19 11:33:35 +00:00
erinhmclark	675de50ee7	Update module test to test for default config keys within loaded	2025-03-19 10:47:28 +00:00
erinhmclark	fc6946f78a	Run format.	2025-03-18 21:43:18 +00:00
erinhmclark	2fdf6b7564	Update generic_extractor.py for general/ youtube extraction.	2025-03-18 21:33:21 +00:00
erinhmclark	ba9d67e4bb	Merge branch 'main' into feat/yt-dlp-pots	2025-03-18 20:10:38 +00:00
erinhmclark	c4e63ebd8c	Add conditional check to setup bgutils token generation script. TODO: Update tests	2025-03-18 14:54:57 +00:00
erinhmclark	b83bfda187	Update directory location, add .gitignore	2025-03-18 14:10:20 +00:00
erinhmclark	cb632723bd	Add scripts to pull only /server/ section of pots generator, and only install at runtime.	2025-03-18 13:47:01 +00:00
erinhmclark	0c892f3cf1	Temp fix for tests by setting path in manifest.	2025-03-18 11:44:08 +00:00
erinhmclark	43ef8f2aeb	Add update to POT setup script.	2025-03-17 20:59:34 +00:00
erinhmclark	e6b1a8c893	Add POT setup script.	2025-03-17 20:34:00 +00:00
erinhmclark	8548b7def7	Refactor setup method to pull and transpile the token generator.	2025-03-17 18:53:59 +00:00
erinhmclark	bbe25537c7	Merge branch 'main' into feat/yt-dlp-pots	2025-03-17 16:54:29 +00:00
erinhmclark	5daeae994a	Fix the extractor args for new list structure.	2025-03-17 14:17:31 +00:00
erinhmclark	f5bbfe5d1c	Merge branch 'main' into feat/yt-dlp-pots	2025-03-17 10:43:35 +00:00
Patrick Robertson	89ee6f19b6	List out all valid TSAs + option for users to allow self signed if they want	2025-03-11 16:12:13 +00:00
Patrick Robertson	294033f156	Fix bug ordering tsr that only have one cert + more unit tests	2025-03-11 15:44:04 +00:00
Patrick Robertson	2ffe124d95	Add unit test for invalid digicert tsrs	2025-03-11 11:13:36 +00:00
Patrick Robertson	1db8be91db	Improved unit tests for timestamping	2025-03-11 11:08:52 +00:00
Patrick Robertson	3f6acc0917	fully working timestamping enricher	2025-03-11 10:04:46 +00:00
erinhmclark	76bb1496c8	Merge branch 'main' into feat/yt-dlp-pots # Conflicts: # src/auto_archiver/modules/generic_extractor/__manifest__.py	2025-03-07 16:54:01 +00:00
erinhmclark	7e4b44883b	Add temp options for testing	2025-03-04 14:03:39 +00:00
erinhmclark	77b517cfc1	Merge remote-tracking branch 'origin/feat/yt-dlp-pots' into feat/yt-dlp-pots	2025-03-03 22:02:14 +00:00
erinhmclark	2c1753e14b	Added Bgutils PO token provider.	2025-03-03 21:11:41 +00:00
erinhmclark	dd07b0b830	Allow flexible extractor_args in generic_extractor.py.	2025-03-03 21:11:34 +00:00
erinhmclark	0eae2bee6a	Add yt-dlp-get-pot and yt-dlp-getpot-wpc requirements.	2025-03-03 21:08:00 +00:00
Patrick Robertson	a0869bb3b2	Fixed up timestamp verifying - waiting on issue with rfc-client to be fixed Ref: https://github.com/trailofbits/rfc3161-client/issues/104#issuecomment-2693890607	2025-03-03 10:28:30 +00:00
Patrick Robertson	afc117a229	Get downloading certs working	2025-02-26 09:33:56 +00:00
Patrick Robertson	4dcb77c29f	Merge branch 'main' into timestamping_rewrite	2025-02-25 17:10:55 +00:00
Patrick Robertson	898faf6fe4	Further WIP - currently working on verify_signed	2025-02-25 12:08:08 +00:00
Patrick Robertson	6987a4827e	Set poetry packages - remove tsp_client and update cryptography	2025-02-25 11:57:20 +00:00
erinhmclark	2d4f1b5b79	Added Bgutils PO token provider.	2025-02-25 10:49:57 +00:00
Patrick Robertson	01bf88a695	Merge branch 'main' into timestamping_rewrite	2025-02-24 12:03:14 +00:00
erinhmclark	c5127f5fd1	Allow flexible extractor_args in generic_extractor.py.	2025-02-24 11:40:44 +00:00
erinhmclark	158d448cbc	Add yt-dlp-get-pot and yt-dlp-getpot-wpc requirements.	2025-02-24 11:40:39 +00:00
Patrick Robertson	d0c379a3ba	WIP - timestamping enricher	2025-02-11 18:18:19 +00:00
Patrick Robertson	3163cb793a	Fix timestamping enricher for new module structure (temp paths)	2025-02-11 15:26:40 +00:00
Patrick Robertson	7bb4d68a22	Merge branch 'load_modules' into timestamping_rewrite	2025-02-11 15:21:31 +00:00
Patrick Robertson	4c1c8953ca	Add unit tests for timestamping_enricher	2025-01-29 12:20:52 +01:00