mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 19:38:29 +03:00
Compare commits
140 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6735fa890b | ||
|
|
48be13fb2a | ||
|
|
e6fdef66df | ||
|
|
33cacd145f | ||
|
|
0f69b5fe0c | ||
|
|
ad2e8397b2 | ||
|
|
144adaad5b | ||
|
|
c7c7eb00a1 | ||
|
|
7e4ba62918 | ||
|
|
9c2b506189 | ||
|
|
8940580638 | ||
|
|
c2821d7c83 | ||
|
|
a590647279 | ||
|
|
1edfdae03e | ||
|
|
6c7f6af4b4 | ||
|
|
8685b6bf13 | ||
|
|
0ce7f5a1b5 | ||
|
|
85d3f2fa02 | ||
|
|
fd540bd03a | ||
|
|
86f328515c | ||
|
|
68992025b0 | ||
|
|
6544934825 | ||
|
|
197599b406 | ||
|
|
96efdcbba1 | ||
|
|
2ec494b4b9 | ||
|
|
1d18399d70 | ||
|
|
3550a009e6 | ||
|
|
dd7d85b4b4 | ||
|
|
c510c04643 | ||
|
|
a0d955fe84 | ||
|
|
5e7c57650b | ||
|
|
1db7d6702d | ||
|
|
b1a8792f9f | ||
|
|
f715100dd5 | ||
|
|
dbcf19d1b8 | ||
|
|
0840b7283c | ||
|
|
b5dc1854a2 | ||
|
|
efab0f9a91 | ||
|
|
bc35116975 | ||
|
|
25f1f5dc93 | ||
|
|
f99dcc63a1 | ||
|
|
48fbfc3b86 | ||
|
|
e7aae76ffe | ||
|
|
1466700b45 | ||
|
|
00b29db390 | ||
|
|
2a0dfaead2 | ||
|
|
a448e2532c | ||
|
|
46a51cce11 | ||
|
|
b7949a489f | ||
|
|
e0e9f93065 | ||
|
|
e06b0c0585 | ||
|
|
95ea9fb231 | ||
|
|
17d2d14680 | ||
|
|
f54b5c5f18 | ||
|
|
456b2746c8 | ||
|
|
2cad5edea8 | ||
|
|
580de88366 | ||
|
|
093ce34a6a | ||
|
|
7872d9356c | ||
|
|
23e7dd0995 | ||
|
|
565275ac37 | ||
|
|
4a02407659 | ||
|
|
ae523eb06f | ||
|
|
d87c0dc3a9 | ||
|
|
1612fef59b | ||
|
|
fbf51f61b9 | ||
|
|
a9ff55a36e | ||
|
|
20bc80b9ef | ||
|
|
5bb0cbf3ff | ||
|
|
3eb9ffddfe | ||
|
|
76e90dd23a | ||
|
|
0450d3fcb9 | ||
|
|
e9ee4d67ba | ||
|
|
43a80dbcda | ||
|
|
cb3ae055d6 | ||
|
|
4cfa6455c7 | ||
|
|
0073a08525 | ||
|
|
46e31808f6 | ||
|
|
4af23e13d1 | ||
|
|
d6be1ff84f | ||
|
|
633290a9cc | ||
|
|
040a864d5c | ||
|
|
b4c33318c4 | ||
|
|
74974ef0ed | ||
|
|
5c6005d843 | ||
|
|
d6a7f31248 | ||
|
|
8aba663534 | ||
|
|
ace97ac7fd | ||
|
|
ad373ae733 | ||
|
|
260e76dd3d | ||
|
|
a9fe959ea1 | ||
|
|
beb7f3893d | ||
|
|
5055402c2a | ||
|
|
3c4625d708 | ||
|
|
31fa7380f5 | ||
|
|
396ec03bae | ||
|
|
e811196711 | ||
|
|
dfde6f1995 | ||
|
|
7b454baa02 | ||
|
|
0f9c6a9a5c | ||
|
|
c980500978 | ||
|
|
93921e71d4 | ||
|
|
675de50ee7 | ||
|
|
fc6946f78a | ||
|
|
2fdf6b7564 | ||
|
|
ba9d67e4bb | ||
|
|
c4e63ebd8c | ||
|
|
b83bfda187 | ||
|
|
cb632723bd | ||
|
|
0c892f3cf1 | ||
|
|
43ef8f2aeb | ||
|
|
e6b1a8c893 | ||
|
|
8548b7def7 | ||
|
|
bbe25537c7 | ||
|
|
5daeae994a | ||
|
|
f5bbfe5d1c | ||
|
|
89ee6f19b6 | ||
|
|
294033f156 | ||
|
|
2ffe124d95 | ||
|
|
1db8be91db | ||
|
|
3f6acc0917 | ||
|
|
76bb1496c8 | ||
|
|
7e4b44883b | ||
|
|
77b517cfc1 | ||
|
|
2c1753e14b | ||
|
|
dd07b0b830 | ||
|
|
0eae2bee6a | ||
|
|
a0869bb3b2 | ||
|
|
afc117a229 | ||
|
|
4dcb77c29f | ||
|
|
898faf6fe4 | ||
|
|
6987a4827e | ||
|
|
2d4f1b5b79 | ||
|
|
01bf88a695 | ||
|
|
c5127f5fd1 | ||
|
|
158d448cbc | ||
|
|
d0c379a3ba | ||
|
|
3163cb793a | ||
|
|
7bb4d68a22 | ||
|
|
4c1c8953ca |
40
.github/dependabot.yml
vendored
Normal file
40
.github/dependabot.yml
vendored
Normal file
@@ -0,0 +1,40 @@
|
||||
# To get started with Dependabot version updates, you'll need to specify which
|
||||
# package ecosystems to update and where the package manifests are located.
|
||||
# Please see the documentation for all configuration options:
|
||||
# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
|
||||
|
||||
version: 2
|
||||
updates:
|
||||
- package-ecosystem: "pip"
|
||||
directory: "/"
|
||||
groups:
|
||||
python:
|
||||
patterns:
|
||||
- "*"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
|
||||
- package-ecosystem: "github-actions"
|
||||
directory: "/"
|
||||
groups:
|
||||
actions:
|
||||
patterns:
|
||||
- "*"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
|
||||
- package-ecosystem: "npm"
|
||||
directory: "/scripts/settings/"
|
||||
groups:
|
||||
actions:
|
||||
patterns:
|
||||
- "*"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
|
||||
- package-ecosystem: "docker"
|
||||
# Look for a `Dockerfile` in the `root` directory
|
||||
directory: "/"
|
||||
# Check for updates once a week
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
6
.github/workflows/docker-publish.yaml
vendored
6
.github/workflows/docker-publish.yaml
vendored
@@ -22,7 +22,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
@@ -33,14 +33,14 @@ jobs:
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567
|
||||
uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_PASSWORD }}
|
||||
|
||||
- name: Extract metadata (tags, labels) for Docker
|
||||
id: meta
|
||||
uses: docker/metadata-action@369eb591f429131d6889c46b94e711f089e6ca96
|
||||
uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804
|
||||
with:
|
||||
images: bellingcat/auto-archiver
|
||||
|
||||
|
||||
10
.github/workflows/ruff.yaml
vendored
10
.github/workflows/ruff.yaml
vendored
@@ -3,8 +3,18 @@ name: Ruff Formatting & Linting
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
paths-ignore:
|
||||
- "README.md"
|
||||
- ".github"
|
||||
- "poetry.lock"
|
||||
- "scripts/settings"
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
paths-ignore:
|
||||
- "README.md"
|
||||
- ".github"
|
||||
- "poetry.lock"
|
||||
- "scripts/settings"
|
||||
|
||||
jobs:
|
||||
build:
|
||||
|
||||
20
.github/workflows/tests-core.yaml
vendored
20
.github/workflows/tests-core.yaml
vendored
@@ -20,8 +20,7 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python-version: ["3.10", "3.11", "3.12"]
|
||||
os: [ubuntu-22.04]
|
||||
#TODO: re-enable ubuntu-latest, this is disabled as oscrypto cannot be pinned to github commit and pushed to pypi
|
||||
os: [ubuntu-22.04, ubuntu-latest]
|
||||
defaults:
|
||||
run:
|
||||
working-directory: ./
|
||||
@@ -29,16 +28,23 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Install Poetry
|
||||
run: pipx install poetry
|
||||
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
cache: 'poetry'
|
||||
|
||||
- name: Install dependencies
|
||||
- name: Install latest Poetry
|
||||
run: pipx install poetry
|
||||
|
||||
- name: Cache Poetry and pip artifacts
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: |
|
||||
~/.cache/pypoetry
|
||||
~/.cache/pip
|
||||
key: poetry-${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }}
|
||||
|
||||
- name: Install dependencies from source only
|
||||
run: poetry install --no-interaction --with dev
|
||||
|
||||
- name: Run Core Tests
|
||||
|
||||
17
.github/workflows/tests-download.yaml
vendored
17
.github/workflows/tests-download.yaml
vendored
@@ -22,16 +22,23 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Install poetry
|
||||
run: pipx install poetry
|
||||
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
cache: 'poetry'
|
||||
|
||||
- name: Install dependencies
|
||||
- name: Install latest Poetry
|
||||
run: pipx install poetry
|
||||
|
||||
- name: Cache Poetry and pip artifacts
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: |
|
||||
~/.cache/pypoetry
|
||||
~/.cache/pip
|
||||
key: poetry-${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }}
|
||||
|
||||
- name: Install dependencies from source only
|
||||
run: poetry install --no-interaction --with dev
|
||||
|
||||
- name: Run Download Tests
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM webrecorder/browsertrix-crawler:1.4.2 AS base
|
||||
FROM webrecorder/browsertrix-crawler:1.6.1 AS base
|
||||
|
||||
ENV RUNNING_IN_DOCKER=1 \
|
||||
LANG=C.UTF-8 \
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
<h1 align="center">Auto Archiver</h1>
|
||||
|
||||
[](https://auto-archiver.readthedocs.io/en/latest/?badge=latest)
|
||||
[](https://badge.fury.io/py/auto-archiver)
|
||||
[](https://hub.docker.com/r/bellingcat/auto-archiver)
|
||||
[](https://hub.docker.com/r/bellingcat/auto-archiver)
|
||||
[](https://github.com/bellingcat/auto-archiver/actions/workflows/tests-core.yaml)
|
||||
[](https://github.com/bellingcat/auto-archiver/actions/workflows/tests-download.yaml)
|
||||
<!-- [](https://github.com/bellingcat/auto-archiver/actions/workflows/tests-download.yaml) -->
|
||||
|
||||
<!--  -->
|
||||
<!-- [](https://pypi.python.org/pypi/auto-archiver/) -->
|
||||
<!-- [](https://vk-url-scraper.readthedocs.io/en/latest/?badge=latest) -->
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -106,5 +106,117 @@ Finally,Some important things to remember:
|
||||
|
||||
## Authenticating on XXXX site with username/password
|
||||
|
||||
```{note} This section is still under construction 🚧
|
||||
```{note}
|
||||
This section is still under construction 🚧
|
||||
```
|
||||
|
||||
|
||||
# Proof of Origin Tokens
|
||||
|
||||
YouTube uses **Proof of Origin Tokens (POT)** as part of its bot detection system to verify that requests originate from valid clients. If a token is missing or invalid, some videos may return errors like "Sign in to confirm you're not a bot."
|
||||
|
||||
yt-dlp provides [a detailed guide to POTs](https://github.com/yt-dlp/yt-dlp/wiki/PO-Token-Guide).
|
||||
|
||||
### How Auto Archiver Uses POT
|
||||
This feature is enabled for the Generic Archiver via two yt-dlp plugins:
|
||||
|
||||
- **Client-side plugin**: [yt-dlp-get-pot](https://github.com/coletdjnz/yt-dlp-get-pot)
|
||||
Detects when a token is required and requests one from a provider.
|
||||
|
||||
- **Provider plugin**: [bgutil-ytdlp-pot-provider](https://github.com/Brainicism/bgutil-ytdlp-pot-provider)
|
||||
Includes both a Python plugin and a **Node.js server or script** to generate the token.
|
||||
|
||||
These are installed in our Poetry environment.
|
||||
|
||||
### Integration Methods
|
||||
|
||||
**Docker (Recommended)**:
|
||||
|
||||
When running the Auto Archiver using the Docker image, we use the [Node.js token generation script](https://github.com/Brainicism/bgutil-ytdlp-pot-provider/tree/master/server).
|
||||
This is to avoid managing a separate server process, and is handled automatically inside the Docker container when needed.
|
||||
|
||||
This is already included in the Docker image. If you need to disable it, set the config option `bguils_po_token_method` under the `generic_extractor` section of your `orchestration.yaml` config file to `"disabled"`:
|
||||
```yaml
|
||||
generic_extractor:
|
||||
bguils_po_token_method: "disabled"
|
||||
```
|
||||
|
||||
**PyPI / Local**:
|
||||
|
||||
When using the Auto Archiver PyPI package, or running locally, you will need additional system requirements to run the token generation script, namely either Docker, or Node.js and Yarn.
|
||||
|
||||
See the [bgutil-ytdlp-pot-provider](https://github.com/Brainicism/bgutil-ytdlp-pot-provider?tab=readme-ov-file#a-http-server-option) documentation for more details.
|
||||
|
||||
⚠️WARNING⚠️: This will add the server scripts to the home directory of wherever this is running.
|
||||
|
||||
- You can set the config option `bguils_po_token_method` under the `generic_extractor` section of your `orchestration.yaml` config file to "script" to enable the token generation script process locally.
|
||||
- Alternatively you can run the bgutil-ytdlp-pot-provider server separately using their Docker image or Node.js server.
|
||||
|
||||
### Notes
|
||||
|
||||
- The token generation script is only triggered when needed by yt-dlp, so it should have no effect unless YouTube requests a POT.
|
||||
- If you're running the Auto Archiver in Docker, this is set up automatically.
|
||||
- If you're running locally, you'll need to run the setup script manually or enable the feature in your config.
|
||||
- You can set up both the server and the script, and the plugin will fall back from one to the other if needed. This is recommended for robustness!
|
||||
|
||||
### Configurations:
|
||||
|
||||
## Configurations Summary
|
||||
|
||||
| Option | Behavior | Docker Default? |
|
||||
|------------| ------------------------------------------------------------------------------------------------------------------------------------------ | --------------- |
|
||||
| `auto` | Docker: Automatically downloads and uses the token generation script. Local: Does nothing; assumes a separate server is running externally. | ✅ Yes |
|
||||
| `script` | Explicitly downloads and uses the token generation script, even locally. | ❌ No |
|
||||
| `disabled` | Disables token generation completely. | ❌ No |
|
||||
|
||||
Example configuration:
|
||||
|
||||
|
||||
```yaml
|
||||
generic_extractor:
|
||||
# ...
|
||||
bguils_po_token_method: "script"
|
||||
# For debugging add the verbose flag here:
|
||||
ytdlp_args: "--no-abort-on-error --abort-on-error --verbose"
|
||||
|
||||
```
|
||||
|
||||
**Advanced Configuration:**
|
||||
|
||||
If you change the default port of the bgutil-ytdlp-pot-provider server, you can pass the updated values using our `extractor_args` option for the generic extractor.
|
||||
|
||||
```yaml
|
||||
generic_extractor:
|
||||
ytdlp_args: "--no-abort-on-error --abort-on-error --verbose"
|
||||
ytdlp_update_interval: 5
|
||||
bguils_po_token_method: "script"
|
||||
extractor_args:
|
||||
youtube:
|
||||
getpot_bgutil_baseurl: "http://127.0.0.1:8080"
|
||||
player_client: web,tv
|
||||
```
|
||||
For more details on these bgutil options, see the [bgutil-ytdlp-pot-provider usage documentation](https://github.com/Brainicism/bgutil-ytdlp-pot-provider?tab=readme-ov-file#usage).
|
||||
|
||||
### Checking the logs
|
||||
|
||||
To verify that the POT process is working, look for the following lines in your log after adding the config option:
|
||||
|
||||
```shell
|
||||
[GetPOT] BgUtilScript: Generating POT via script: /Users/you/bgutil-ytdlp-pot-provider/server/build/generate_once.js
|
||||
[debug] [GetPOT] BgUtilScript: Executing command to get POT via script: /Users/you/.nvm/versions/node/v20.18.0/bin/node /Users/you/bgutil-ytdlp-pot-provider/server/build/generate_once.js -v ymCMy8OflKM
|
||||
[debug] [GetPOT] BgUtilScript: stdout:
|
||||
{"poToken":"MlMxojNFhEJvUzGeHEkVRSK_luXtwcDnwSNIOgaUutqB7t99nmlNvtWgYayboopG6ZopZgmQ-6PJCWEMHv89MIiFGGlJRY25Fkwzxmia_8uYgf5AWf==","generatedAt":"2025-03-26T10:45:26.156Z","visitIdentifier":"ymCMy8OflKM"}
|
||||
[debug] [GetPOT] Fetching gvs PO Token for tv client
|
||||
```
|
||||
|
||||
If the script cannot be found or the server cannot be reached, you'll see something like this:
|
||||
```shell
|
||||
[debug] [GetPOT] Fetching player PO Token for tv client
|
||||
WARNING: [GetPOT] BgUtilScript: Script path doesn't exist: /Users/you/bgutil-ytdlp-pot-provider/server/build/generate_once.js. Please make sure the script has been transpiled correctly.
|
||||
WARNING: [GetPOT] BgUtilHTTP: Error reaching GET http://127.0.0.1:4416/ping (caused by TransportError). Please make sure that the server is reachable at http://127.0.0.1:4416.
|
||||
[debug] [GetPOT] No player PO Token provider available for tv client
|
||||
```
|
||||
|
||||
In this case check that the script has been transpiled correctly and is available at the path specified in the log,
|
||||
or that the server is running and reachable.
|
||||
|
||||
|
||||
@@ -71,7 +71,6 @@ The names of the actual modules have also changed, so for any extractor modules
|
||||
- `telethon_archiver` → `telethon_extractor`
|
||||
- `wacz_archiver_enricher` → `wacz_extractor_enricher`
|
||||
- `wayback_archiver_enricher` → `wayback_extractor_enricher`
|
||||
- `vk_archiver` → `vk_extractor`
|
||||
|
||||
|
||||
#### c) Module Renaming
|
||||
|
||||
169
docs/source/how_to/run_instagrapi_server.md
Normal file
169
docs/source/how_to/run_instagrapi_server.md
Normal file
@@ -0,0 +1,169 @@
|
||||
# InstagrAPI Server
|
||||
|
||||
The Instagram API Extractor requires access to a running instance of the InstagrAPI server.
|
||||
We have a lightweight script with the endpoints required for our Instagram API Extractor module which you can run locally, or via Docker.
|
||||
|
||||
|
||||
|
||||
⚠️ Warning: Remember that it's best not to use your own personal account for archiving. [Here's why](../installation/authentication.md#recommendations-for-authentication).
|
||||
## Quick Start: Using Docker
|
||||
|
||||
We've provided a convenient shell script (`run_instagrapi_server.sh`) that simplifies the process of setting up and running the Instagrapi server in Docker. This script handles building the Docker image, setting up credentials, and starting the container.
|
||||
|
||||
### 🔧 Running the script:
|
||||
|
||||
Run this script either from the repository root or from within the `scripts/instagrapi_server` directory:
|
||||
|
||||
```bash
|
||||
./scripts/instagrapi_server/run_instagrapi_server.sh
|
||||
```
|
||||
|
||||
This script will:
|
||||
- Prompt for your Instagram username and password.
|
||||
- Create the necessary `.env` file.
|
||||
- Build the Docker image.
|
||||
- Start the Docker container and authenticate with Instagram, creating a session automatically.
|
||||
|
||||
### ⏱ To run the server again later:
|
||||
```bash
|
||||
docker start ig-instasrv
|
||||
```
|
||||
|
||||
### 🐛 Debugging:
|
||||
View logs:
|
||||
```bash
|
||||
docker logs ig-instasrv
|
||||
```
|
||||
|
||||
|
||||
### Overview: How the Setup Works
|
||||
|
||||
1. You enter your Instagram credentials in a local `.env` file
|
||||
2. You run the server **once locally** to generate a session file
|
||||
3. After that, you can choose to run the server again locally or inside Docker without needing to log in again
|
||||
|
||||
---
|
||||
|
||||
## Optional: Manual / Local Setup
|
||||
|
||||
If you'd prefer to run the server manually (without Docker), you can follow these steps:
|
||||
|
||||
|
||||
1. **Navigate to the server folder (and stay there for the rest of this guide)**:
|
||||
```bash
|
||||
cd scripts/instagrapi_server
|
||||
```
|
||||
|
||||
2. **Create a `secrets/` folder** (if it doesn't already exist in `scripts/instagrapi_server`):
|
||||
```bash
|
||||
mkdir -p secrets
|
||||
```
|
||||
|
||||
3. **Create a `.env` file** inside `secrets/` with your Instagram credentials:
|
||||
```dotenv
|
||||
INSTAGRAM_USERNAME="your_username"
|
||||
INSTAGRAM_PASSWORD="your_password"
|
||||
```
|
||||
|
||||
4. **Install dependencies** using the pyproject.toml file:
|
||||
|
||||
```bash
|
||||
poetry install --no-root
|
||||
```
|
||||
|
||||
5. **Run the server locally**:
|
||||
```bash
|
||||
poetry run uvicorn src.instaserver:app --port 8000
|
||||
```
|
||||
|
||||
6. **Watch for the message**:
|
||||
```
|
||||
Login successful, session saved.
|
||||
```
|
||||
|
||||
✅ Your session is now saved to `secrets/instagrapi_session.json`.
|
||||
|
||||
### To run it again locally:
|
||||
```bash
|
||||
poetry run uvicorn src.instaserver:app --port 8000
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Adding the API Endpoint to Auto Archiver
|
||||
|
||||
The server should now be running within that session, and accessible at http://127.0.0.1:8000
|
||||
|
||||
You can set this in the Auto Archiver orchestration.yaml file like this:
|
||||
```yaml
|
||||
instagram_api_extractor:
|
||||
api_endpoint: http://127.0.0.1:8000
|
||||
```
|
||||
|
||||
|
||||
---
|
||||
|
||||
## 2. Running the Server Again
|
||||
|
||||
Once the session file is created, you should be able to run the server without logging in again.
|
||||
|
||||
### To run it locally (from scripts/instagrapi_server):
|
||||
```bash
|
||||
poetry run uvicorn src.instaserver:app --port 8000
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. Running via Docker (After Setup is Complete, either locally or via the script)
|
||||
|
||||
Once the `instagrapi_session.json` and `.env` files are set up, you can pass them to Docker and it should authenticate successfully.
|
||||
|
||||
### 🔨 Build the Docker image manually:
|
||||
```bash
|
||||
docker build -t instagrapi-server .
|
||||
```
|
||||
|
||||
### ▶️ Run the container:
|
||||
```bash
|
||||
docker run -d \
|
||||
--env-file secrets/.env \
|
||||
-v "$(pwd)/secrets:/app/secrets" \
|
||||
-p 8000:8000 \
|
||||
--name ig-instasrv \
|
||||
instagrapi-server
|
||||
```
|
||||
|
||||
This mounts the `secrets/` directory into the container and passes the environment variables from the `.env` file.
|
||||
|
||||
|
||||
|
||||
---
|
||||
|
||||
## 4. Optional Cleanup
|
||||
|
||||
- **Stop the Docker container**:
|
||||
```bash
|
||||
docker stop ig-instasrv
|
||||
```
|
||||
|
||||
- **Remove the container**:
|
||||
```bash
|
||||
docker rm ig-instasrv
|
||||
```
|
||||
|
||||
- **Remove the Docker image**:
|
||||
```bash
|
||||
docker rmi instagrapi-server
|
||||
```
|
||||
|
||||
### ⏱ To run again later:
|
||||
```bash
|
||||
docker start ig-instasrv
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Notes
|
||||
|
||||
- Never share your `.env` or `instagrapi_session.json` — these contain sensitive login data.
|
||||
- If you want to reset your session, simply delete the `secrets/instagrapi_session.json` file and re-run the local server.
|
||||
@@ -6,6 +6,15 @@ There are two main use cases for authentication:
|
||||
* Some websites require some kind of authentication in order to view the content. Examples include Facebook, Telegram etc.
|
||||
* Some websites use anti-bot systems to block bot-like tools from accessing the website. Adding real login information to auto-archiver can sometimes bypass this.
|
||||
|
||||
```{note}
|
||||
|
||||
The Authentication framework currently only works with the following modules:
|
||||
* Generic Extractor
|
||||
* Screenshot Enricher
|
||||
|
||||
To authenticate for WACZ archiving, see the instructions on the [](../modules/autogen/enricher/wacz_extractor_enricher.md) page.
|
||||
```
|
||||
|
||||
## The Authentication Config
|
||||
|
||||
You can save your authentication information directly inside your orchestration config file, or as a separate file (for security/multi-deploy purposes). Whether storing your settings inside the orchestration file, or as a separate file, the configuration format is the same. Currently, auto-archiver supports the following authentication types:
|
||||
@@ -27,7 +36,7 @@ You can save your authentication information directly inside your orchestration
|
||||
|
||||
The Username & Password, and API settings only work with the Generic Extractor. Other modules (like the screenshot enricher) can only use the `cookies` options. Furthermore, many sites can still detect bots and block username/password logins. Twitter/X and YouTube are two prominent ones that block username/password logins.
|
||||
|
||||
One of the 'Cookies' options is recommended for the most robust archiving.
|
||||
One of the 'Cookies' options is recommended for the most robust archiving, but it still isn't guaranteed to work.
|
||||
```
|
||||
|
||||
```{code} yaml
|
||||
|
||||
@@ -11,7 +11,6 @@ are available on the [extractors](../modules/extractor.md) page. Some sites supp
|
||||
* Twitter
|
||||
* Instagram
|
||||
* Telegram
|
||||
* VKontact
|
||||
* Tiktok
|
||||
* Bluesky
|
||||
|
||||
|
||||
@@ -27,8 +27,8 @@ The way you run the Auto Archiver depends on how you installed it (docker instal
|
||||
If you installed Auto Archiver using docker, open up your terminal, and copy-paste / type the following command:
|
||||
|
||||
```bash
|
||||
docker run -it --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver
|
||||
```
|
||||
docker run -it --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver -- "https://example.com/1/"
|
||||
```
|
||||
|
||||
breaking this command down:
|
||||
1. `docker run` tells docker to start a new container (an instance of the image)
|
||||
@@ -42,6 +42,7 @@ breaking this command down:
|
||||
1. `-v` same as above, this is a volume instruction
|
||||
2. `$PWD/local_archive` is a folder `local_archive/` in case you want to archive locally and have the files accessible outside docker
|
||||
3. `/app/local_archive` is a folder inside docker that you can reference in your orchestration.yml file
|
||||
6. ` -- "https://example.com/1/"` this will pass the URL to archive to the default [command line feeder](../modules/autogen/feeder/cli_feeder.md)
|
||||
|
||||
### Example invocations
|
||||
|
||||
|
||||
1441
poetry.lock
generated
1441
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[project]
|
||||
name = "auto-archiver"
|
||||
version = "0.13.8"
|
||||
version = "1.0.1"
|
||||
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
||||
|
||||
requires-python = ">=3.10,<3.13"
|
||||
@@ -41,23 +41,22 @@ dependencies = [
|
||||
"instaloader (>=0.0.0)",
|
||||
"tqdm (>=0.0.0)",
|
||||
"jinja2 (>=0.0.0)",
|
||||
"pyOpenSSL (==24.2.1)",
|
||||
"cryptography (>=41.0.0,<42.0.0)",
|
||||
"boto3 (>=1.28.0,<2.0.0)",
|
||||
"dataclasses-json (>=0.0.0)",
|
||||
"yt-dlp (>=2025.1.26,<2026.0.0)",
|
||||
"numpy (==2.1.3)",
|
||||
"vk-url-scraper (>=0.0.0)",
|
||||
"requests[socks] (>=0.0.0)",
|
||||
"warcio (>=0.0.0)",
|
||||
"jsonlines (>=0.0.0)",
|
||||
"pysubs2 (>=0.0.0)",
|
||||
"retrying (>=0.0.0)",
|
||||
"tsp-client (>=0.0.0)",
|
||||
"certvalidator (>=0.0.0)",
|
||||
"rich-argparse (>=1.6.0,<2.0.0)",
|
||||
"ruamel-yaml (>=0.18.10,<0.19.0)",
|
||||
"rfc3161-client (>=1.0.1,<2.0.0)",
|
||||
"cryptography (>44.0.1,<45.0.0)",
|
||||
"opentimestamps (>=0.4.5,<0.5.0)",
|
||||
"bgutil-ytdlp-pot-provider (>=1.0.0)",
|
||||
"yt-dlp[curl-cffi,default] (>=2025.5.22,<2026.0.0)",
|
||||
"secretstorage (>=3.3.3,<4.0.0)",
|
||||
]
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
|
||||
2
scripts/instagrapi_server/.gitignore
vendored
Normal file
2
scripts/instagrapi_server/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
secrets*
|
||||
*instagrapi_session.json
|
||||
19
scripts/instagrapi_server/Dockerfile
Normal file
19
scripts/instagrapi_server/Dockerfile
Normal file
@@ -0,0 +1,19 @@
|
||||
FROM python:3.12-slim
|
||||
WORKDIR /app
|
||||
|
||||
# Install Poetry
|
||||
RUN pip install --upgrade pip
|
||||
RUN pip install poetry
|
||||
|
||||
# Copy all source code
|
||||
COPY . .
|
||||
|
||||
# Prevent Poetry from creating a virtual environment
|
||||
RUN poetry config virtualenvs.create false
|
||||
|
||||
# Install dependencies
|
||||
RUN poetry install --no-root
|
||||
|
||||
|
||||
# Use uvicorn to run the FastAPI app
|
||||
CMD ["poetry", "run", "uvicorn", "src.instaserver:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
18
scripts/instagrapi_server/pyproject.toml
Normal file
18
scripts/instagrapi_server/pyproject.toml
Normal file
@@ -0,0 +1,18 @@
|
||||
[project]
|
||||
name = "instaserver"
|
||||
version = "0.1.0"
|
||||
description = "A FastAPI InstagrAPI server"
|
||||
package-mode = false
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"fastapi (>=0.115.12,<0.116.0)",
|
||||
"instagrapi (>=2.1.3,<3.0.0)",
|
||||
"uvicorn (>=0.34.0,<0.35.0)",
|
||||
"pillow (>=11.1.0,<12.0.0)",
|
||||
"python-dotenv (>=1.1.0,<2.0.0)"
|
||||
]
|
||||
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
48
scripts/instagrapi_server/run_instagrapi_server.sh
Executable file
48
scripts/instagrapi_server/run_instagrapi_server.sh
Executable file
@@ -0,0 +1,48 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# run_instagrapi_server.sh
|
||||
# Usage:
|
||||
# From repo root: ./scripts/instagrapi_server/run_instagrapi_server.sh
|
||||
# Or from script dir: ./run_instagrapi_server.sh
|
||||
#
|
||||
|
||||
set -e
|
||||
|
||||
# Step 1: cd to the script's directory (contains Dockerfile and secrets/)
|
||||
cd "$(dirname "$0")" || exit 1
|
||||
|
||||
# Create secrets/ if it doesn't exist
|
||||
if [[ ! -d "secrets" ]]; then
|
||||
echo "Creating secrets/ directory..."
|
||||
mkdir secrets
|
||||
fi
|
||||
|
||||
echo "Enter your Instagram credentials to store in secrets/.env"
|
||||
read -rp "Instagram Username: " IGUSER
|
||||
read -rsp "Instagram Password: " IGPASS
|
||||
echo ""
|
||||
|
||||
cat <<EOF > secrets/.env
|
||||
INSTAGRAM_USERNAME=$IGUSER
|
||||
INSTAGRAM_PASSWORD=$IGPASS
|
||||
EOF
|
||||
echo "Created secrets/.env with your credentials."
|
||||
|
||||
# Build Docker image
|
||||
IMAGE_NAME="instagrapi-server"
|
||||
echo "Building Docker image '$IMAGE_NAME'..."
|
||||
docker build -t "$IMAGE_NAME" .
|
||||
|
||||
# Run container
|
||||
CONTAINER_NAME="ig-instasrv"
|
||||
echo "Running container '$CONTAINER_NAME'..."
|
||||
docker run -d \
|
||||
--env-file secrets/.env \
|
||||
-v "$(pwd)/secrets:/app/secrets" \
|
||||
-p 8000:8000 \
|
||||
--name "$CONTAINER_NAME" \
|
||||
"$IMAGE_NAME"
|
||||
|
||||
echo "Done! Instagrapi server is running on port 8000."
|
||||
echo "Use 'docker logs $CONTAINER_NAME' to view logs."
|
||||
echo "Use 'docker stop $CONTAINER_NAME' and 'docker rm $CONTAINER_NAME' to stop/remove the container."
|
||||
157
scripts/instagrapi_server/src/instaserver.py
Normal file
157
scripts/instagrapi_server/src/instaserver.py
Normal file
@@ -0,0 +1,157 @@
|
||||
"""https://subzeroid.github.io/instagrapi/
|
||||
|
||||
Run using the following command:
|
||||
uvicorn src.instaserver:app --host 0.0.0.0 --port 8000 --reload
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from instagrapi import Client
|
||||
from instagrapi.exceptions import LoginRequired, BadCredentials
|
||||
|
||||
load_dotenv(dotenv_path="secrets/.env")
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||
|
||||
INSTAGRAM_USERNAME = os.getenv("INSTAGRAM_USERNAME")
|
||||
INSTAGRAM_PASSWORD = os.getenv("INSTAGRAM_PASSWORD")
|
||||
SESSION_FILE = "secrets/instagrapi_session.json"
|
||||
|
||||
app = FastAPI()
|
||||
cl = Client()
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
def startup_event():
|
||||
"""Login automatically when server starts"""
|
||||
try:
|
||||
login_instagram()
|
||||
except RuntimeError as e:
|
||||
logging.error(f"API failed to start: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def login_instagram():
|
||||
"""Ensures Instagrapi is logged in and session is persistent"""
|
||||
if not INSTAGRAM_USERNAME or not INSTAGRAM_PASSWORD:
|
||||
raise RuntimeError("Instagram credentials are missing.")
|
||||
|
||||
if os.path.exists(SESSION_FILE):
|
||||
try:
|
||||
cl.load_settings(SESSION_FILE)
|
||||
cl.get_timeline_feed()
|
||||
logging.info("Using saved session.")
|
||||
return
|
||||
except LoginRequired:
|
||||
logging.info("Session expired. Logging in again...")
|
||||
|
||||
try:
|
||||
cl.login(INSTAGRAM_USERNAME, INSTAGRAM_PASSWORD)
|
||||
cl.dump_settings(SESSION_FILE)
|
||||
logging.info("Login successful, session saved.")
|
||||
except BadCredentials as bc:
|
||||
raise RuntimeError("Incorrect Instagram username or password.") from bc
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Login failed: {e}") from e
|
||||
|
||||
|
||||
@app.get("/v1/media/by/id")
def get_media_by_id(id: str):
    """Return the serialized details of the post with the given media ID.

    Any failure while fetching or serializing is logged as a warning and
    reported to the client as HTTP 404.
    """
    logging.info(f"Fetching media by ID: {id}")
    try:
        return cl.media_info(id).model_dump()
    except Exception as fetch_error:
        logging.warning(f"Media not found for ID {id}: {fetch_error}")
        raise HTTPException(status_code=404, detail="Post not found") from fetch_error
|
||||
|
||||
|
||||
@app.get("/v1/media/by/code")
def get_media_by_code(code: str):
    """Return the serialized details of the post with the given shortcode.

    The shortcode is first converted to a media primary key, which is then
    looked up. Any failure is logged as a warning and reported as HTTP 404.
    """
    logging.info(f"Fetching media by shortcode: {code}")
    try:
        media_pk = cl.media_pk_from_code(code)
        return cl.media_info(media_pk).model_dump()
    except Exception as fetch_error:
        logging.warning(f"Media not found for code {code}: {fetch_error}")
        raise HTTPException(status_code=404, detail="Post not found") from fetch_error
|
||||
|
||||
|
||||
@app.get("/v2/user/tag/medias")
def get_user_tagged_medias(user_id: str, page_id: str = None):
    """Return media in which the user is tagged.

    Currently a placeholder: it always answers with an empty item list and
    no next page, regardless of ``user_id`` and ``page_id``.
    """
    logging.info(f"Fetching tagged medias for user_id={user_id} page_id={page_id}")
    try:
        # Placeholder for now
        tagged_items = []
        following_page = None
        return {"response": {"items": tagged_items}, "next_page_id": following_page}
    except Exception as fetch_error:
        logging.warning(f"Tagged media not found for {user_id}: {fetch_error}")
        raise HTTPException(status_code=404, detail="Tagged media not found") from fetch_error
|
||||
|
||||
|
||||
@app.get("/v1/user/highlights")
def get_user_highlights(user_id: str):
    """Return the user's highlights as a list of serialized records.

    Any failure is logged as a warning and reported as HTTP 404.
    """
    logging.info(f"Fetching highlights list for user_id={user_id}")
    try:
        return [item.model_dump() for item in cl.user_highlights(user_id)]
    except Exception as fetch_error:
        logging.warning(f"Highlights not found for {user_id}: {fetch_error}")
        raise HTTPException(status_code=404, detail="No highlights found") from fetch_error
|
||||
|
||||
|
||||
@app.get("/v2/highlight/by/id")
def get_highlight_by_id(id: str):
    """Return one highlight, nested under ``response.reels["highlight:<id>"]``.

    Any failure is logged as a warning and reported as HTTP 404.
    """
    logging.info(f"Fetching highlight details for id={id}")
    try:
        serialized = cl.highlight_info(id).model_dump()
        reel_key = f"highlight:{id}"
        return {"response": {"reels": {reel_key: serialized}}}
    except Exception as fetch_error:
        logging.warning(f"Highlight not found for id {id}: {fetch_error}")
        raise HTTPException(status_code=404, detail="Highlight not found") from fetch_error
|
||||
|
||||
|
||||
@app.get("/v1/user/stories/by/username")
def get_stories(username: str):
    """Return the stories of the given username as serialized records.

    The username is resolved to a user ID before the stories are fetched.
    Any failure is logged as a warning and reported as HTTP 404.
    """
    logging.info(f"Fetching stories for username={username}")
    try:
        resolved_id = cl.user_id_from_username(username)
        return [item.model_dump() for item in cl.user_stories(resolved_id)]
    except Exception as fetch_error:
        logging.warning(f"Stories not found for {username}: {fetch_error}")
        raise HTTPException(status_code=404, detail="Stories not found") from fetch_error
|
||||
|
||||
|
||||
@app.get("/v2/user/by/username")
def get_user_by_username(username: str):
    """Return the profile of the given username under the ``"user"`` key.

    Any failure is logged as a warning and reported as HTTP 404.
    """
    logging.info(f"Fetching user profile for username={username}")
    try:
        profile = cl.user_info_by_username(username).model_dump()
        return {"user": profile}
    except Exception as fetch_error:
        logging.warning(f"User not found: {username}: {fetch_error}")
        raise HTTPException(status_code=404, detail="User not found") from fetch_error
|
||||
|
||||
|
||||
@app.get("/v1/user/medias/chunk")
def get_user_medias(user_id: str, end_cursor: str = None):
    """Return one page of a user's posts.

    The response is a two-element list: the serialized posts of this page,
    followed by the cursor returned by the client for the next page.
    Any failure is logged as a warning and reported as HTTP 404.
    """
    logging.info(f"Fetching paginated medias for user_id={user_id}, end_cursor={end_cursor}")
    try:
        page, following_cursor = cl.user_medias_paginated(user_id, end_cursor=end_cursor)
        serialized_page = [item.model_dump() for item in page]
        return [serialized_page, following_cursor]
    except Exception as fetch_error:
        logging.warning(f"No posts found for user_id={user_id}: {fetch_error}")
        raise HTTPException(status_code=404, detail="No posts found") from fetch_error
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Imported here so uvicorn is only needed when the file is run directly.
    import uvicorn

    # Serve the app on all interfaces, port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||
901
scripts/settings/package-lock.json
generated
901
scripts/settings/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -214,11 +214,8 @@ class LazyBaseModule:
|
||||
|
||||
# check external dependencies are installed
|
||||
def check_deps(deps, check):
|
||||
for dep in deps:
|
||||
if not len(dep):
|
||||
# clear out any empty strings that a user may have erroneously added
|
||||
continue
|
||||
if not check(dep):
|
||||
for dep in filter(lambda d: len(d.strip()) > 0, deps):
|
||||
if not check(dep.strip()):
|
||||
logger.error(
|
||||
f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
|
||||
Have you installed the required dependencies for the '{self.name}' module? See the documentation for more information."
|
||||
@@ -277,6 +274,9 @@ class LazyBaseModule:
|
||||
# finally, get the class instance
|
||||
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
|
||||
|
||||
# save the instance for future easy loading
|
||||
self._instance = instance
|
||||
|
||||
# set the name, display name and module factory
|
||||
instance.name = self.name
|
||||
instance.display_name = self.display_name
|
||||
@@ -289,8 +289,6 @@ class LazyBaseModule:
|
||||
instance.config_setup(config)
|
||||
instance.setup()
|
||||
|
||||
# save the instance for future easy loading
|
||||
self._instance = instance
|
||||
return instance
|
||||
|
||||
def __repr__(self):
|
||||
|
||||
@@ -5,6 +5,7 @@ formatting, database operations and clean up.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from packaging import version
|
||||
from typing import Generator, Union, List, Type, TYPE_CHECKING
|
||||
import argparse
|
||||
import os
|
||||
@@ -387,8 +388,10 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
except (KeyboardInterrupt, Exception) as e:
|
||||
if not isinstance(e, KeyboardInterrupt) and not isinstance(e, SetupError):
|
||||
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
|
||||
if loaded_module and module_type == "extractor":
|
||||
loaded_module.cleanup()
|
||||
|
||||
# access the _instance here because loaded_module may not return if there's an error
|
||||
if lazy_module._instance and module_type == "extractor":
|
||||
lazy_module._instance.cleanup()
|
||||
raise e
|
||||
|
||||
if not loaded_module:
|
||||
@@ -434,16 +437,19 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
|
||||
def check_for_updates(self):
|
||||
response = requests.get("https://pypi.org/pypi/auto-archiver/json").json()
|
||||
latest_version = response["info"]["version"]
|
||||
latest_version = version.parse(response["info"]["version"])
|
||||
current_version = version.parse(__version__)
|
||||
# check version compared to current version
|
||||
if latest_version != __version__:
|
||||
if latest_version > current_version:
|
||||
if os.environ.get("RUNNING_IN_DOCKER"):
|
||||
update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
|
||||
else:
|
||||
update_cmd = "`pip install --upgrade auto-archiver`"
|
||||
logger.warning("")
|
||||
logger.warning("********* IMPORTANT: UPDATE AVAILABLE ********")
|
||||
logger.warning(f"A new version of auto-archiver is available (v{latest_version}, you have {__version__})")
|
||||
logger.warning(
|
||||
f"A new version of auto-archiver is available (v{latest_version}, you have v{current_version})"
|
||||
)
|
||||
logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
|
||||
logger.warning("")
|
||||
|
||||
|
||||
@@ -4,12 +4,6 @@ import argparse
|
||||
import json
|
||||
|
||||
|
||||
def example_validator(value):
|
||||
if "example" not in value:
|
||||
raise argparse.ArgumentTypeError(f"{value} is not a valid value for this argument")
|
||||
return value
|
||||
|
||||
|
||||
def positive_number(value):
|
||||
if value < 0:
|
||||
raise argparse.ArgumentTypeError(f"{value} is not a positive number")
|
||||
|
||||
@@ -74,6 +74,11 @@ If you are having issues with the extractor, you can review the version of `yt-d
|
||||
"default": "inf",
|
||||
"help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
|
||||
},
|
||||
"bguils_po_token_method": {
|
||||
"default": "auto",
|
||||
"help": "Set up a Proof of origin token provider. This process has additional requirements. See [authentication](https://auto-archiver.readthedocs.io/en/latest/how_to/authentication_how_to.html) for more information.",
|
||||
"choices": ["auto", "script", "disabled"],
|
||||
},
|
||||
"extractor_args": {
|
||||
"default": {},
|
||||
"help": "Additional arguments to pass to the yt-dlp extractor. See https://github.com/yt-dlp/yt-dlp/blob/master/README.md#extractor-arguments.",
|
||||
|
||||
@@ -1,12 +1,18 @@
|
||||
import mimetypes
|
||||
import shutil
|
||||
import sys
|
||||
import datetime
|
||||
import os
|
||||
import importlib
|
||||
import subprocess
|
||||
import zipfile
|
||||
|
||||
from typing import Generator, Type
|
||||
from urllib.request import urlretrieve
|
||||
|
||||
import yt_dlp
|
||||
from yt_dlp.extractor.common import InfoExtractor
|
||||
from yt_dlp.utils import MaxDownloadsReached
|
||||
import pysubs2
|
||||
|
||||
from loguru import logger
|
||||
@@ -25,45 +31,138 @@ class GenericExtractor(Extractor):
|
||||
_dropins = {}
|
||||
|
||||
def setup(self):
|
||||
# check for file .ytdlp-update in the secrets folder
|
||||
self.check_for_extractor_updates()
|
||||
self.setup_po_tokens()
|
||||
|
||||
def check_for_extractor_updates(self):
|
||||
"""Checks whether yt-dlp or its plugins need updating and triggers a restart if so."""
|
||||
if self.ytdlp_update_interval < 0:
|
||||
return
|
||||
|
||||
use_secrets = os.path.exists("secrets")
|
||||
path = os.path.join("secrets" if use_secrets else "", ".ytdlp-update")
|
||||
next_update_check = None
|
||||
if os.path.exists(path):
|
||||
with open(path, "r") as f:
|
||||
next_update_check = datetime.datetime.fromisoformat(f.read())
|
||||
update_file = os.path.join("secrets" if os.path.exists("secrets") else "", ".ytdlp-update")
|
||||
next_check = None
|
||||
if os.path.exists(update_file):
|
||||
with open(update_file, "r") as f:
|
||||
next_check = datetime.datetime.fromisoformat(f.read())
|
||||
|
||||
if not next_update_check or next_update_check < datetime.datetime.now():
|
||||
self.update_ytdlp()
|
||||
if next_check and next_check > datetime.datetime.now():
|
||||
return
|
||||
|
||||
next_update_check = datetime.datetime.now() + datetime.timedelta(days=self.ytdlp_update_interval)
|
||||
with open(path, "w") as f:
|
||||
f.write(next_update_check.isoformat())
|
||||
yt_dlp_updated = self.update_package("yt-dlp")
|
||||
bgutil_updated = self.update_package("bgutil-ytdlp-pot-provider")
|
||||
|
||||
def update_ytdlp(self):
|
||||
logger.info("Checking and updating yt-dlp...")
|
||||
logger.info(
|
||||
f"Tip: change the 'ytdlp_update_interval' setting to control how often yt-dlp is updated. Set to -1 to disable or 0 to enable on every run. Current setting: {self.ytdlp_update_interval}"
|
||||
)
|
||||
# Write the new timestamp
|
||||
with open(update_file, "w") as f:
|
||||
next_check = datetime.datetime.now() + datetime.timedelta(days=self.ytdlp_update_interval)
|
||||
f.write(next_check.isoformat())
|
||||
|
||||
if yt_dlp_updated or bgutil_updated:
|
||||
if os.environ.get("AUTO_ARCHIVER_ALLOW_RESTART", "1") != "1":
|
||||
logger.warning("yt-dlp or plugin was updated — please restart auto-archiver manually")
|
||||
else:
|
||||
logger.warning("yt-dlp or plugin was updated — restarting auto-archiver")
|
||||
logger.warning(" ======= RESTARTING ======= ")
|
||||
os.execv(sys.executable, [sys.executable] + sys.argv)
|
||||
|
||||
def update_package(self, package_name: str) -> bool:
|
||||
logger.info(f"Checking and updating {package_name}...")
|
||||
from importlib.metadata import version as get_version
|
||||
|
||||
old_version = get_version("yt-dlp")
|
||||
old_version = get_version(package_name)
|
||||
try:
|
||||
# try and update with pip (this works inside poetry environment and in a normal virtualenv)
|
||||
result = subprocess.run(["pip", "install", "--upgrade", "yt-dlp"], check=True, capture_output=True)
|
||||
result = subprocess.run(["pip", "install", "--upgrade", package_name], check=True, capture_output=True)
|
||||
if f"Successfully installed {package_name}" in result.stdout.decode():
|
||||
new_version = importlib.metadata.version(package_name)
|
||||
logger.info(f"{package_name} updated from {old_version} to {new_version}")
|
||||
return True
|
||||
logger.info(f"{package_name} already up to date")
|
||||
except Exception as e:
|
||||
logger.error(f"Error updating {package_name}: {e}")
|
||||
return False
|
||||
|
||||
if "Successfully installed yt-dlp" in result.stdout.decode():
|
||||
new_version = importlib.metadata.version("yt-dlp")
|
||||
logger.info(f"yt-dlp successfully (from {old_version} to {new_version})")
|
||||
importlib.reload(yt_dlp)
|
||||
def setup_po_tokens(self) -> None:
    """Decide whether to set up the Proof of Origin token generation script.

    Uses provider: https://github.com/Brainicism/bgutil-ytdlp-pot-provider.

    - ``"disabled"``: nothing is set up (a warning is logged).
    - ``"auto"`` outside Docker: nothing is set up (a hint is logged).
    - Anything else, or ``"auto"`` inside Docker: the script is set up.
    """
    method = self.bguils_po_token_method

    if method == "disabled":
        # This allows disabling of the PO Token generation script in the Docker implementation.
        logger.warning("Proof of Origin Token generation is disabled.")
        return

    running_in_docker = os.environ.get("RUNNING_IN_DOCKER")
    if not running_in_docker and method == "auto":
        logger.info(
            "Proof of Origin Token method not explicitly set. "
            "If you're running an external HTTP server separately, you can safely ignore this message. "
            "To reduce the likelihood of bot detection, enable one of the methods described in the documentation: "
            "https://auto-archiver.readthedocs.io/en/settings_page/installation/authentication.html#proof-of-origin-tokens"
        )
        return

    # Either running in Docker, or "script" method is set beyond this point
    self.setup_token_generation_script()
|
||||
|
||||
def setup_token_generation_script(self) -> None:
|
||||
"""This function sets up the Proof of Origin Token generation script method for
|
||||
bgutil-ytdlp-pot-provider if enabled or in Docker."""
|
||||
missing_tools = [tool for tool in ("node", "yarn", "npx") if shutil.which(tool) is None]
|
||||
if missing_tools:
|
||||
logger.error(
|
||||
f"Cannot set up PO Token script; missing required tools: {', '.join(missing_tools)}. "
|
||||
"Install these tools or run bgutils via Docker. "
|
||||
"See: https://github.com/Brainicism/bgutil-ytdlp-pot-provider"
|
||||
)
|
||||
return
|
||||
try:
|
||||
from importlib.metadata import version as get_version
|
||||
|
||||
plugin_version = get_version("bgutil-ytdlp-pot-provider")
|
||||
base_dir = os.path.expanduser("~/bgutil-ytdlp-pot-provider")
|
||||
server_dir = os.path.join(base_dir, "server")
|
||||
version_file = os.path.join(server_dir, ".VERSION")
|
||||
transpiled_script = os.path.join(server_dir, "build", "generate_once.js")
|
||||
|
||||
# Skip setup if version is correct and transpiled script exists
|
||||
if os.path.isfile(transpiled_script) and os.path.isfile(version_file):
|
||||
with open(version_file) as vf:
|
||||
if vf.read().strip() == plugin_version:
|
||||
logger.info("PO Token script already set up and up to date.")
|
||||
else:
|
||||
logger.info("yt-dlp already up to date")
|
||||
# Remove an outdated directory and pull a new version
|
||||
if os.path.exists(base_dir):
|
||||
shutil.rmtree(base_dir)
|
||||
os.makedirs(base_dir, exist_ok=True)
|
||||
|
||||
zip_url = (
|
||||
f"https://github.com/Brainicism/bgutil-ytdlp-pot-provider/archive/refs/tags/{plugin_version}.zip"
|
||||
)
|
||||
zip_path = os.path.join(base_dir, f"{plugin_version}.zip")
|
||||
logger.info(f"Downloading bgutils release zip for version {plugin_version}...")
|
||||
urlretrieve(zip_url, zip_path)
|
||||
with zipfile.ZipFile(zip_path, "r") as z:
|
||||
z.extractall(base_dir)
|
||||
os.remove(zip_path)
|
||||
|
||||
extracted_root = os.path.join(base_dir, f"bgutil-ytdlp-pot-provider-{plugin_version}")
|
||||
shutil.move(os.path.join(extracted_root, "server"), server_dir)
|
||||
shutil.rmtree(extracted_root)
|
||||
logger.info("Installing dependencies and transpiling PoT Generator script...")
|
||||
subprocess.run(["yarn", "install", "--frozen-lockfile"], cwd=server_dir, check=True)
|
||||
subprocess.run(["npx", "tsc"], cwd=server_dir, check=True)
|
||||
|
||||
with open(version_file, "w") as vf:
|
||||
vf.write(plugin_version)
|
||||
|
||||
script_path = os.path.join(server_dir, "build", "generate_once.js")
|
||||
if not os.path.exists(script_path):
|
||||
logger.error("generate_once.js not found after transpilation.")
|
||||
return
|
||||
|
||||
self.extractor_args.setdefault("youtubepot-bgutilscript", {})["script_path"] = script_path
|
||||
logger.info(f"PO Token script configured at: {script_path}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error updating yt-dlp: {e}")
|
||||
logger.error(f"Failed to set up PO Token script: {e}")
|
||||
|
||||
def suitable_extractors(self, url: str) -> Generator[str, None, None]:
|
||||
"""
|
||||
@@ -204,9 +303,9 @@ class GenericExtractor(Extractor):
|
||||
result.set_url(url)
|
||||
|
||||
if "description" in video_data and not result.get("content"):
|
||||
result.set_content(video_data["description"])
|
||||
result.set_content(video_data.pop("description"))
|
||||
# extract comments if enabled
|
||||
if self.comments:
|
||||
if self.comments and video_data.get("comments", []) is not None:
|
||||
result.set(
|
||||
"comments",
|
||||
[
|
||||
@@ -265,7 +364,12 @@ class GenericExtractor(Extractor):
|
||||
# this time download
|
||||
ydl.params["getcomments"] = self.comments
|
||||
# TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
|
||||
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
|
||||
try:
|
||||
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
|
||||
except MaxDownloadsReached: # proceed as normal once MaxDownloadsReached is raised
|
||||
pass
|
||||
logger.success(data)
|
||||
|
||||
if "entries" in data:
|
||||
entries = data.get("entries", [])
|
||||
if not len(entries):
|
||||
@@ -273,14 +377,33 @@ class GenericExtractor(Extractor):
|
||||
return False
|
||||
else:
|
||||
entries = [data]
|
||||
|
||||
result = Metadata()
|
||||
|
||||
def _helper_get_filename(entry: dict) -> str:
|
||||
entry_url = entry.get("url")
|
||||
|
||||
filename = ydl.prepare_filename(entry)
|
||||
base_filename, _ = os.path.splitext(filename) # '/get/path/to/file' ignore '.ext'
|
||||
directory = os.path.dirname(base_filename) # '/get/path/to'
|
||||
basename = os.path.basename(base_filename) # 'file'
|
||||
for f in os.listdir(directory):
|
||||
if (
|
||||
f.startswith(basename)
|
||||
or (entry_url and os.path.splitext(f)[0] in entry_url)
|
||||
and "video/" in (mimetypes.guess_type(f)[0] or "")
|
||||
):
|
||||
return os.path.join(directory, f)
|
||||
return False
|
||||
|
||||
for entry in entries:
|
||||
try:
|
||||
filename = ydl.prepare_filename(entry)
|
||||
if not os.path.exists(filename):
|
||||
filename = filename.split(".")[0] + ".mkv"
|
||||
filename = _helper_get_filename(entry)
|
||||
|
||||
if not filename or not os.path.exists(filename):
|
||||
# file was not downloaded or could not be retrieved, example: sensitive videos on YT without using cookies.
|
||||
continue
|
||||
|
||||
logger.debug(f"Using filename {filename} for entry {entry.get('id', 'unknown')}")
|
||||
|
||||
new_media = Media(filename)
|
||||
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
|
||||
@@ -299,6 +422,9 @@ class GenericExtractor(Extractor):
|
||||
result.add_media(new_media)
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing entry {entry}: {e}")
|
||||
if not len(result.media):
|
||||
logger.warning(f"No media found for entry {entry}, skipping.")
|
||||
return False
|
||||
|
||||
return self.add_metadata(data, info_extractor, url, result)
|
||||
|
||||
@@ -357,6 +483,13 @@ class GenericExtractor(Extractor):
|
||||
|
||||
dropin_submodule = self.dropin_for_name(info_extractor.ie_key())
|
||||
|
||||
def _helper_for_successful_extract_info(data, info_extractor, url, ydl):
|
||||
if data.get("is_live", False) and not self.livestreams:
|
||||
logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
|
||||
return False
|
||||
# it's a valid video, that the youtubdedl can download out of the box
|
||||
return self.get_metadata_for_video(data, info_extractor, url, ydl)
|
||||
|
||||
try:
|
||||
if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
|
||||
logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
|
||||
@@ -364,11 +497,12 @@ class GenericExtractor(Extractor):
|
||||
|
||||
# don't download since it can be a live stream
|
||||
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
|
||||
if data.get("is_live", False) and not self.livestreams:
|
||||
logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
|
||||
return False
|
||||
# it's a valid video, that the youtubdedl can download out of the box
|
||||
result = self.get_metadata_for_video(data, info_extractor, url, ydl)
|
||||
|
||||
result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
|
||||
|
||||
except MaxDownloadsReached:
|
||||
# yt-dlp raises an error when the max downloads limit is reached, and it shouldn't for our purposes, so we consider that a success
|
||||
result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
|
||||
|
||||
except Exception as e:
|
||||
if info_extractor.IE_NAME == "generic":
|
||||
@@ -422,6 +556,8 @@ class GenericExtractor(Extractor):
|
||||
"--write-subs" if self.subtitles else "--no-write-subs",
|
||||
"--write-auto-subs" if self.subtitles else "--no-write-auto-subs",
|
||||
"--live-from-start" if self.live_from_start else "--no-live-from-start",
|
||||
"--postprocessor-args",
|
||||
"ffmpeg:-bitexact", # ensure bitexact output to avoid mismatching hashes for same video
|
||||
]
|
||||
|
||||
# proxy handling
|
||||
|
||||
@@ -88,10 +88,7 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
|
||||
# ALLOW rules exist AND sheet name not explicitly allowed
|
||||
return False
|
||||
if len(self.block_worksheets) and sheet_name in self.block_worksheets:
|
||||
# BLOCK rules exist AND sheet name is blocked
|
||||
return False
|
||||
return True
|
||||
return not (self.block_worksheets and sheet_name in self.block_worksheets)
|
||||
|
||||
def missing_required_columns(self, gw: GWorksheet) -> list:
|
||||
missing = []
|
||||
@@ -101,7 +98,7 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
return missing
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
logger.warning(f"STARTED {item}")
|
||||
logger.info(f"STARTED {item}")
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
gw.set_cell(row, "status", "Archive in progress")
|
||||
|
||||
@@ -161,9 +158,8 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
|
||||
batch_if_valid("screenshot", "\n".join(screenshot.urls))
|
||||
|
||||
if thumbnail := item.get_first_image("thumbnail"):
|
||||
if hasattr(thumbnail, "urls"):
|
||||
batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")')
|
||||
if (thumbnail := item.get_first_image("thumbnail")) and hasattr(thumbnail, "urls"):
|
||||
batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")')
|
||||
|
||||
if browsertrix := item.get_media_by_id("browsertrix"):
|
||||
batch_if_valid("wacz", "\n".join(browsertrix.urls))
|
||||
|
||||
@@ -31,9 +31,11 @@
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Archives various types of Instagram content using the Instagrapi API.
|
||||
Archives Instagram content using a deployment of the [Instagrapi API](https://subzeroid.github.io/instagrapi/).
|
||||
|
||||
Requires setting up an Instagrapi API deployment and providing an access token and API endpoint.
|
||||
Requires either getting a token from a hosted [(paid) service](https://api.instagrapi.com/docs) and setting it in the configuration file.
|
||||
Alternatively, you can run your own server. We have a basic script you can use for this, which can be run locally or using Docker.
|
||||
For more information, read the [how to guide](https://auto-archiver.readthedocs.io/en/latest/how_to/run_instagrapi_server.html) on this.
|
||||
|
||||
### Features
|
||||
- Connects to an Instagrapi API deployment to fetch Instagram profiles, posts, stories, highlights, reels, and tagged content.
|
||||
|
||||
@@ -88,6 +88,9 @@ class InstagramTbotExtractor(Extractor):
|
||||
|
||||
if message:
|
||||
result.set_content(message).set_title(message[:128])
|
||||
elif result.is_empty():
|
||||
logger.debug(f"No media found for link {url=} for {self.name}: {message}")
|
||||
return False
|
||||
return result.success("insta-via-bot")
|
||||
|
||||
def _send_url_to_bot(self, url: str):
|
||||
@@ -104,13 +107,13 @@ class InstagramTbotExtractor(Extractor):
|
||||
message = ""
|
||||
time.sleep(3)
|
||||
# media is added before text by the bot so it can be used as a stop-logic mechanism
|
||||
while attempts < max(self.timeout - 3, 3) and (not message or not len(seen_media)):
|
||||
while attempts < max(self.timeout - 3, 15) and (not message or not len(seen_media)):
|
||||
attempts += 1
|
||||
time.sleep(1)
|
||||
for post in self.client.iter_messages(chat, min_id=since_id):
|
||||
since_id = max(since_id, post.id)
|
||||
# Skip known filler message:
|
||||
if post.message == "The bot receives information through https://hikerapi.com/p/hJqpppqi":
|
||||
if "The bot receives information through https://hikerapi.com/" in post.message:
|
||||
continue
|
||||
if post.media and post.id not in seen_media:
|
||||
filename_dest = os.path.join(tmp_dir, f"{chat.id}_{post.id}")
|
||||
|
||||
@@ -19,7 +19,7 @@
|
||||
},
|
||||
"session_file": {
|
||||
"default": "secrets/anon",
|
||||
"help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value.",
|
||||
"help": "Path of the file to save the telegram login session for future usage, '.session' will be appended to the provided path.",
|
||||
},
|
||||
"join_channels": {
|
||||
"default": True,
|
||||
|
||||
@@ -1,4 +1,10 @@
|
||||
import os
|
||||
import shutil
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from datetime import date
|
||||
|
||||
from telethon.sync import TelegramClient
|
||||
from telethon.errors import ChannelInvalidError
|
||||
from telethon.tl.functions.messages import ImportChatInviteRequest
|
||||
@@ -8,11 +14,9 @@ from telethon.errors.rpcerrorlist import (
|
||||
InviteRequestSentError,
|
||||
InviteHashExpiredError,
|
||||
)
|
||||
from loguru import logger
|
||||
|
||||
from tqdm import tqdm
|
||||
import re
|
||||
import time
|
||||
import os
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
@@ -31,10 +35,22 @@ class TelethonExtractor(Extractor):
|
||||
"""
|
||||
logger.info(f"SETUP {self.name} checking login...")
|
||||
|
||||
# in case the user already added '.session' to the session_file
|
||||
base_session_name = self.session_file.removesuffix(".session")
|
||||
base_session_filepath = f"{base_session_name}.session"
|
||||
|
||||
if self.session_file and not os.path.exists(base_session_filepath):
|
||||
logger.warning(
|
||||
f"SETUP - Session file {base_session_filepath} does not exist for {self.name}, creating an empty one."
|
||||
)
|
||||
Path(base_session_filepath).touch()
|
||||
|
||||
# make a copy of the session that is used exclusively with this archiver instance
|
||||
new_session_file = os.path.join("secrets/", f"telethon-{time.strftime('%Y-%m-%d')}{random_str(8)}.session")
|
||||
shutil.copy(self.session_file + ".session", new_session_file)
|
||||
self.session_file = new_session_file.replace(".session", "")
|
||||
self.session_file = os.path.join(
|
||||
os.path.dirname(base_session_filepath), f"telethon-{date.today().strftime('%Y-%m-%d')}{random_str(8)}"
|
||||
)
|
||||
logger.debug(f"Making a copy of the session file {base_session_filepath} to {self.session_file}.session")
|
||||
shutil.copy(base_session_filepath, f"{self.session_file}.session")
|
||||
|
||||
# initiate the client
|
||||
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
|
||||
@@ -87,8 +103,8 @@ class TelethonExtractor(Extractor):
|
||||
pbar.update()
|
||||
|
||||
def cleanup(self) -> None:
|
||||
logger.info(f"CLEANUP {self.name}.")
|
||||
session_file_name = self.session_file + ".session"
|
||||
logger.info(f"CLEANUP {self.name} - removing session file {self.session_file}.session")
|
||||
session_file_name = f"{self.session_file}.session"
|
||||
if os.path.exists(session_file_name):
|
||||
os.remove(session_file_name)
|
||||
|
||||
@@ -174,7 +190,7 @@ class TelethonExtractor(Extractor):
|
||||
if getattr(original_post, "grouped_id", None) is None:
|
||||
return [original_post] if getattr(original_post, "media", False) else []
|
||||
|
||||
search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 1)]
|
||||
search_ids = list(range(original_post.id - max_amp, original_post.id + max_amp + 1))
|
||||
posts = self.client.get_messages(chat, ids=search_ids)
|
||||
media = []
|
||||
for post in posts:
|
||||
|
||||
@@ -3,30 +3,38 @@
|
||||
"type": ["enricher"],
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "slugify", "tsp_client", "asn1crypto", "certvalidator", "certifi"],
|
||||
"python": ["loguru", "slugify", "cryptography", "rfc3161_client", "certifi"],
|
||||
},
|
||||
"configs": {
|
||||
"tsa_urls": {
|
||||
"default": [
|
||||
# [Adobe Approved Trust List] and [Windows Cert Store]
|
||||
"http://timestamp.digicert.com",
|
||||
"http://timestamp.identrust.com",
|
||||
# "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping
|
||||
# "https://timestamp.sectigo.com", # wait 15 seconds between each request.
|
||||
# [Adobe: European Union Trusted Lists].
|
||||
# "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request.
|
||||
# [Windows Cert Store]
|
||||
"http://timestamp.globalsign.com/tsa/r6advanced1",
|
||||
# [Adobe: European Union Trusted Lists] and [Windows Cert Store]
|
||||
# "http://ts.quovadisglobal.com/eu", # not valid for timestamping
|
||||
# "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain
|
||||
# "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain
|
||||
# "http://tsa.sep.bg", # self-signed certificate in certificate chain
|
||||
# "http://tsa.izenpe.com", #unable to get local issuer certificate
|
||||
# "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate
|
||||
"http://tss.accv.es:8318/tsa",
|
||||
],
|
||||
# See https://github.com/trailofbits/rfc3161-client/issues/46 for a list of valid TSAs
|
||||
# Full list of TSAs: https://gist.github.com/Manouchehri/fd754e402d98430243455713efada710
|
||||
"http://timestamp.identrust.com",
|
||||
"http://timestamp.ssl.trustwave.com",
|
||||
"http://zeitstempel.dfn.de",
|
||||
"http://ts.ssl.com",
|
||||
# "http://tsa.izenpe.com", # self-signed
|
||||
"http://tsa.lex-persona.com/tsa",
|
||||
# "http://ca.signfiles.com/TSAServer.aspx", # self-signed
|
||||
# "http://tsa.sinpe.fi.cr/tsaHttp/", # self-signed
|
||||
# "http://tsa.cra.ge/signserver/tsa?workerName=qtsa", # self-signed
|
||||
"http://tss.cnbs.gob.hn/TSS/HttpTspServer",
|
||||
"http://dss.nowina.lu/pki-factory/tsa/good-tsa",
|
||||
# "https://freetsa.org/tsr", # self-signed
|
||||
],
|
||||
"help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
|
||||
},
|
||||
"cert_authorities": {
|
||||
"default": None,
|
||||
"help": "Path to a file containing trusted Certificate Authorities (CAs) in PEM format. If empty, the default system authorities are used.",
|
||||
"type": "str",
|
||||
},
|
||||
"allow_selfsigned": {
|
||||
"default": False,
|
||||
"help": "Whether or not to allow and save self-signed Timestamping certificates. This allows for a greater range of timestamping servers to be used, \
|
||||
but they are not trusted authorities",
|
||||
"type": "bool"
|
||||
}
|
||||
},
|
||||
"description": """
|
||||
|
||||
@@ -1,15 +1,23 @@
|
||||
import os
|
||||
from loguru import logger
|
||||
from tsp_client import TSPSigner, SigningSettings, TSPVerifier
|
||||
from tsp_client.algorithms import DigestAlgorithm
|
||||
|
||||
from importlib.metadata import version
|
||||
from asn1crypto.cms import ContentInfo
|
||||
from certvalidator import CertificateValidator, ValidationContext
|
||||
from asn1crypto import pem
|
||||
import hashlib
|
||||
|
||||
from slugify import slugify
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
from rfc3161_client import (decode_timestamp_response,TimestampRequestBuilder,TimeStampResponse, VerifierBuilder)
|
||||
from rfc3161_client import VerificationError as Rfc3161VerificationError
|
||||
from rfc3161_client.base import HashAlgorithm
|
||||
from rfc3161_client.tsp import SignedData
|
||||
from cryptography import x509
|
||||
from cryptography.hazmat.primitives import serialization
|
||||
import certifi
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.version import __version__
|
||||
|
||||
|
||||
class TimestampingEnricher(Enricher):
|
||||
@@ -21,6 +29,25 @@ class TimestampingEnricher(Enricher):
|
||||
See https://gist.github.com/Manouchehri/fd754e402d98430243455713efada710 for list of timestamp authorities.
|
||||
"""
|
||||
|
||||
session = None
|
||||
|
||||
def setup(self):
    """
    Create the HTTP session that is reused for every timestamp request.

    The session is preconfigured with the timestamp-query / timestamp-reply
    content types and an Auto-Archiver User-Agent header.
    """
    default_headers = {
        "Content-Type": "application/timestamp-query",
        "User-Agent": f"Auto-Archiver {__version__}",
        "Accept": "application/timestamp-reply",
    }
    http_session = requests.Session()
    http_session.headers.update(default_headers)
    self.session = http_session
|
||||
|
||||
def cleanup(self) -> None:
    """
    Terminates the underlying network session.

    Does nothing when no session has been created.
    """
    if self.session:
        self.session.close()

# The method was originally published under the misspelled name `cleaup`.
# NOTE(review): other modules spell this hook `cleanup`, so the misspelled
# version was presumably never invoked -- confirm against the base class.
# The alias keeps any existing caller of the old name working.
cleaup = cleanup
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
logger.debug(f"RFC3161 timestamping existing files for {url=}")
|
||||
@@ -34,8 +61,8 @@ class TimestampingEnricher(Enricher):
|
||||
logger.warning(f"No hashes found in {url=}")
|
||||
return
|
||||
|
||||
tmp_dir = self.tmp_dir
|
||||
hashes_fn = os.path.join(tmp_dir, "hashes.txt")
|
||||
|
||||
hashes_fn = os.path.join(self.tmp_dir, "hashes.txt")
|
||||
|
||||
data_to_sign = "\n".join(hashes)
|
||||
with open(hashes_fn, "w") as f:
|
||||
@@ -43,62 +70,160 @@ class TimestampingEnricher(Enricher):
|
||||
hashes_media = Media(filename=hashes_fn)
|
||||
|
||||
timestamp_tokens = []
|
||||
from slugify import slugify
|
||||
|
||||
for tsa_url in self.tsa_urls:
|
||||
try:
|
||||
signing_settings = SigningSettings(tsp_server=tsa_url, digest_algorithm=DigestAlgorithm.SHA256)
|
||||
signer = TSPSigner()
|
||||
message = bytes(data_to_sign, encoding="utf8")
|
||||
# send TSQ and get TSR from the TSA server
|
||||
signed = signer.sign(message=message, signing_settings=signing_settings)
|
||||
# fail if there's any issue with the certificates, uses certifi list of trusted CAs
|
||||
TSPVerifier(certifi.where()).verify(signed, message=message)
|
||||
# download and verify timestamping certificate
|
||||
cert_chain = self.download_and_verify_certificate(signed)
|
||||
# continue with saving the timestamp token
|
||||
tst_fn = os.path.join(tmp_dir, f"timestamp_token_{slugify(tsa_url)}")
|
||||
with open(tst_fn, "wb") as f:
|
||||
f.write(signed)
|
||||
timestamp_tokens.append(Media(filename=tst_fn).set("tsa", tsa_url).set("cert_chain", cert_chain))
|
||||
message = bytes(data_to_sign, encoding='utf8')
|
||||
|
||||
logger.debug(f"Timestamping {url=} with {tsa_url=}")
|
||||
signed: TimeStampResponse = self.sign_data(tsa_url, message)
|
||||
|
||||
# fail if there's any issue with the certificates, uses certifi list of trusted CAs or the user-defined `cert_authorities`
|
||||
root_cert = self.verify_signed(signed, message)
|
||||
|
||||
if not root_cert:
|
||||
if self.allow_selfsigned:
|
||||
logger.warning(f"Allowing self-signed certificat from TSA {tsa_url=}")
|
||||
else:
|
||||
raise ValueError(f"No valid root certificate found for {tsa_url=}. Are you sure it's a trusted TSA? Or define an alternative trusted root with `cert_authorities`. (tried: {self.cert_authorities or certifi.where()})")
|
||||
|
||||
# save the timestamping certificate
|
||||
cert_chain = self.save_certificate(signed, root_cert)
|
||||
|
||||
timestamp_token_path = self.save_timestamp_token(signed.time_stamp_token(), tsa_url)
|
||||
timestamp_tokens.append(Media(filename=timestamp_token_path).set("tsa", tsa_url).set("cert_chain", cert_chain))
|
||||
except Exception as e:
|
||||
logger.warning(f"Error while timestamping {url=} with {tsa_url=}: {e}")
|
||||
|
||||
if len(timestamp_tokens):
|
||||
hashes_media.set("timestamp_authority_files", timestamp_tokens)
|
||||
hashes_media.set("certifi v", version("certifi"))
|
||||
hashes_media.set("tsp_client v", version("tsp_client"))
|
||||
hashes_media.set("certvalidator v", version("certvalidator"))
|
||||
hashes_media.set("rfc3161-client v", version("rfc3161_client"))
|
||||
hashes_media.set("cryptography v", version("cryptography"))
|
||||
to_enrich.add_media(hashes_media, id="timestamped_hashes")
|
||||
to_enrich.set("timestamped", True)
|
||||
logger.success(f"{len(timestamp_tokens)} timestamp tokens created for {url=}")
|
||||
else:
|
||||
logger.warning(f"No successful timestamps for {url=}")
|
||||
|
||||
def download_and_verify_certificate(self, signed: bytes) -> list[Media]:
|
||||
def save_timestamp_token(self, timestamp_token: bytes, tsa_url: str) -> str:
    """
    Write a timestamp token into `self.tmp_dir` and return the path of the written file.

    The file name embeds the slugified TSA URL.
    """
    token_filename = f"timestamp_token_{slugify(tsa_url)}"
    token_path = os.path.join(self.tmp_dir, token_filename)
    with open(token_path, "wb") as token_file:
        token_file.write(timestamp_token)
    return token_path
|
||||
|
||||
def verify_signed(self, timestamp_response: TimeStampResponse, message: bytes) -> x509.Certificate:
    """
    Verify a Signed Timestamp Response is trusted by a known Certificate Authority.

    The trusted roots are read from `self.cert_authorities` when set, otherwise
    from the certifi bundle. Each trusted root is tried on its own, together with
    the intermediate certificates embedded in the response, until one verifies.

    Args:
        timestamp_response (TimeStampResponse): The signed timestamp response.
        message (bytes): The message that was timestamped.

    Returns:
        x509.Certificate: The first trusted root certificate under which the
        response verifies, or None when no trusted root verifies it.

    Raises:
        ValueError: If the trusted root store contains no certificates, or if the
        response uses a hash algorithm other than SHA-256 or SHA-512.
    """
    trusted_root_path = self.cert_authorities or certifi.where()

    with open(trusted_root_path, "rb") as f:
        cert_authorities = x509.load_pem_x509_certificates(f.read())

    if not cert_authorities:
        raise ValueError(f"No trusted roots found in {trusted_root_path}.")

    timestamp_certs = self.tst_certs(timestamp_response)
    # tst_certs returns the chain leaf-first; the first and last entries are
    # dropped so that only the certificates in between are offered as intermediates
    intermediate_certs = timestamp_certs[1:-1]

    # re-hash the message with the same algorithm the TSA recorded in the response
    hash_algorithm = timestamp_response.tst_info.message_imprint.hash_algorithm
    if hash_algorithm == x509.ObjectIdentifier(value="2.16.840.1.101.3.4.2.3"):
        message_hash = hashlib.sha512(message).digest()
    elif hash_algorithm == x509.ObjectIdentifier(value="2.16.840.1.101.3.4.2.1"):
        message_hash = hashlib.sha256(message).digest()
    else:
        raise ValueError(f"Unsupported hash algorithm: {hash_algorithm}")

    for certificate in cert_authorities:
        # a fresh verifier per root, so a successful verification identifies the exact root
        builder = VerifierBuilder()
        builder.add_root_certificate(certificate)

        for intermediate_cert in intermediate_certs:
            builder.add_intermediate_certificate(intermediate_cert)

        verifier = builder.build()

        try:
            verifier.verify(timestamp_response, message_hash)
            return certificate
        except Rfc3161VerificationError:
            continue

    return None
|
||||
|
||||
def sign_data(self, tsa_url: str, bytes_data: bytes) -> TimeStampResponse:
    """
    Request a timestamp for `bytes_data` from the TSA at `tsa_url`.

    The request carries a nonce and is posted through `self.session` with a
    10 second timeout. The reply is decoded but not verified here.

    Raises:
        requests.RequestException: If the request fails or returns an error status.
        ValueError: If the reply cannot be decoded as a timestamp response.
    """
    # see https://github.com/sigstore/sigstore-python/blob/99948d5b80525a5a104e904ffea58169dc6e0629/sigstore/_internal/timestamp.py#L84-L121
    request_builder = TimestampRequestBuilder().data(bytes_data).nonce(nonce=True)
    tsq = request_builder.build()

    try:
        http_reply = self.session.post(tsa_url, data=tsq.as_bytes(), timeout=10)
        http_reply.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Error while sending request to {tsa_url=}: {e}")
        raise

    # Check that we can parse the response but do not *verify* it
    try:
        return decode_timestamp_response(http_reply.content)
    except ValueError as e:
        logger.error(f"Invalid timestamp response from server {tsa_url}: {e}")
        raise
|
||||
|
||||
def tst_certs(self, tsp_response: TimeStampResponse):
    """
    Return the certificates embedded in a timestamp response, ordered leaf first.

    The chain starts with the first certificate that is not the issuer of any
    embedded certificate, and each following entry is the certificate whose
    subject equals the issuer of the previous one.

    If the embedded certificates do not form one complete chain (no leaf can be
    identified, or an issuer is missing), the certificates that could not be
    placed are appended after the ordered part in their original order, so no
    certificate is dropped and the method always terminates.
    """
    signed_data: SignedData = tsp_response.signed_data
    certs = [x509.load_der_x509_certificate(c) for c in signed_data.certificates]
    if len(certs) <= 1:
        return certs

    # the leaf is the first certificate that did not issue any embedded certificate
    issuer_names = [c.issuer for c in certs]
    leaf = next((c for c in certs if c.subject not in issuer_names), None)
    if leaf is None:
        # no starting point can be identified: keep the order the TSA sent
        return certs

    ordered_certs = [leaf]
    remaining = [c for c in certs if c is not leaf]
    while remaining:
        wanted_subject = ordered_certs[-1].issuer
        issuer_cert = next((c for c in remaining if c.subject == wanted_subject), None)
        if issuer_cert is None:
            # chain is broken here; stop instead of searching forever
            break
        ordered_certs.append(issuer_cert)
        remaining = [c for c in remaining if c is not issuer_cert]

    return ordered_certs + remaining
|
||||
|
||||
def save_certificate(self, tsp_response: TimeStampResponse, verified_root_cert: x509.Certificate) -> list[Media]:
|
||||
# saves each certificate of the chain to a file and returns them as a list of Media
|
||||
tst = ContentInfo.load(signed)
|
||||
|
||||
trust_roots = []
|
||||
with open(certifi.where(), "rb") as f:
|
||||
for _, _, der_bytes in pem.unarmor(f.read(), multiple=True):
|
||||
trust_roots.append(der_bytes)
|
||||
context = ValidationContext(trust_roots=trust_roots)
|
||||
certificates = self.tst_certs(tsp_response)
|
||||
|
||||
certificates = tst["content"]["certificates"]
|
||||
first_cert = certificates[0].dump()
|
||||
intermediate_certs = []
|
||||
for i in range(1, len(certificates)): # cannot use list comprehension [1:]
|
||||
intermediate_certs.append(certificates[i].dump())
|
||||
|
||||
validator = CertificateValidator(first_cert, intermediate_certs=intermediate_certs, validation_context=context)
|
||||
path = validator.validate_usage({"digital_signature"}, extended_key_usage={"time_stamping"})
|
||||
if verified_root_cert:
|
||||
# add the verified root certificate (if there is one - self signed certs will have None here)
|
||||
certificates += [verified_root_cert]
|
||||
|
||||
cert_chain = []
|
||||
for cert in path:
|
||||
cert_fn = os.path.join(self.tmp_dir, f"{str(cert.serial_number)[:20]}.crt")
|
||||
for i, cert in enumerate(certificates):
|
||||
cert_fn = os.path.join(self.tmp_dir, f"{i+1} – {str(cert.serial_number)[:20]}.crt")
|
||||
with open(cert_fn, "wb") as f:
|
||||
f.write(cert.dump())
|
||||
cert_chain.append(Media(filename=cert_fn).set("subject", cert.subject.native["common_name"]))
|
||||
f.write(cert.public_bytes(encoding=serialization.Encoding.PEM))
|
||||
cert_chain.append(Media(filename=cert_fn).set("subject", cert.subject.get_attributes_for_oid(x509.NameOID.COMMON_NAME)[0].value))
|
||||
|
||||
return cert_chain
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
from .vk_extractor import VkExtractor
|
||||
@@ -1,37 +0,0 @@
|
||||
# Module manifest for the VKontakte extractor: declares its type, its
# dependencies, its configuration options and the user-facing description.
{
    "name": "VKontakte Extractor",
    "type": ["extractor"],
    "requires_setup": True,
    "depends": ["core", "utils"],
    "dependencies": {
        "python": ["loguru", "vk_url_scraper"],
    },
    "configs": {
        "username": {"required": True, "help": "valid VKontakte username"},
        "password": {"required": True, "help": "valid VKontakte password"},
        # the help text used to be a copy of the password help
        "session_file": {
            "default": "secrets/vk_config.v2.json",
            "help": "path to the session configuration file (.json) used for persistent VKontakte login",
        },
    },
    "description": """
The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages.
This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract
and download content. Note that VK videos are handled separately by the `YTDownloader`.

### Features
- Extracts text, timestamps, and metadata from VK `/wall` posts.
- Downloads associated images and attaches them to the resulting `Metadata` object.
- Processes multiple segments of VK URLs that contain mixed content (e.g., wall, photo).
- Outputs structured metadata and media using `Metadata` and `Media` objects.

### Setup
To use the `VkExtractor`, you must provide valid VKontakte login credentials and session information:
- **Username**: A valid VKontakte account username.
- **Password**: The corresponding password for the VKontakte account.
- **Session File**: Optional. Path to a session configuration file (`.json`) for persistent VK login.

Credentials can be set in the configuration file or directly via environment variables. Ensure you
have access to the VKontakte API by creating an account at [VKontakte](https://vk.com/).
""",
}
|
||||
@@ -1,43 +0,0 @@
|
||||
from loguru import logger
|
||||
from vk_url_scraper import VkScraper
|
||||
|
||||
from auto_archiver.utils.misc import dump_payload
|
||||
from auto_archiver.core import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
|
||||
class VkExtractor(Extractor):
    """
    Extracts the text and images of VK posts; VK videos are handled by YTDownloader.
    Currently only works for /wall posts
    """

    def setup(self) -> None:
        """Create the VkScraper client from the configured credentials and session file."""
        self.vks = VkScraper(self.username, self.password, session_file=self.session_file)

    def download(self, item: Metadata) -> Metadata:
        """
        Scrape the VK url of `item` and return a new Metadata holding its text and media.

        Returns False when the item is not a vk.com url or nothing could be scraped.
        """
        url = item.get_url()

        if "vk.com" not in item.netloc:
            return False

        # some urls can contain multiple wall/photo/... parts and all will be fetched
        scraped_posts = self.vks.scrape(url)
        if len(scraped_posts) == 0:
            return False
        logger.debug(f"VK: got {len(scraped_posts)} scraped instances")

        result = Metadata()
        # title and timestamp are taken from the first scrape that provides them
        for post in scraped_posts:
            if not result.get_title():
                result.set_title(post["text"])
            if not result.get_timestamp():
                result.set_timestamp(post["datetime"])

        result.set_content(dump_payload(scraped_posts))

        for media_path in self.vks.download_media(scraped_posts, self.tmp_dir):
            result.add_media(Media(media_path))

        return result.success("vk")
|
||||
@@ -40,27 +40,31 @@
|
||||
Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving.
|
||||
[Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format.
|
||||
|
||||
## Setup
|
||||
|
||||
**Docker**
|
||||
If you are using the Docker file to run Auto Archiver (recommended), then everything is set up and you can use WACZ out of the box!
|
||||
Otherwise, if you are using a local install of Auto Archiver (e.g. pip or dev install), then you will need to install Docker and run
|
||||
the docker daemon to be able to run the `browsertrix-crawler` tool.
|
||||
|
||||
**Browsertrix Profiles**
|
||||
A browsertrix profile is a custom browser profile (login information, browser extensions, etc.) that can be used to archive private or dynamic content.
|
||||
You can run the WACZ Enricher without a profile, but for more resilient archiving, it is recommended to create a profile. See the [Browsertrix documentation](https://crawler.docs.browsertrix.com/user-guide/browser-profiles/)
|
||||
for more information.
|
||||
|
||||
** Docker in Docker **
|
||||
If you are running Auto Archiver within a Docker container, you will need to enable Docker in Docker to run the `browsertrix-crawler` tool.
|
||||
This can be done by setting the `WACZ_ENABLE_DOCKER` environment variable to `1`.
|
||||
|
||||
## Features
|
||||
- Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`.
|
||||
- Supports custom profiles for archiving private or dynamic content.
|
||||
- Extracts media (images, videos, audio) and screenshots from the archive, optionally adding them to the enrichment pipeline.
|
||||
- Generates metadata from the archived page's content and structure (e.g., titles, text).
|
||||
|
||||
## Setup
|
||||
|
||||
### Using Docker
|
||||
If you are using the Auto Archiver [Docker image](https://auto-archiver.readthedocs.io/en/latest/installation/installation.html#installing-with-docker)
|
||||
to run Auto Archiver (recommended), then everything is set up and you can use WACZ out of the box!
|
||||
Otherwise, if you are using a local install of Auto Archiver (e.g. pip or dev install), then you will need to install Docker and run
|
||||
the docker daemon to be able to run the `browsertrix-crawler` tool.
|
||||
|
||||
### Browsertrix Profiles
|
||||
A browsertrix profile is a custom browser profile (login information, browser extensions, etc.) that can be used to archive private or dynamic content.
|
||||
You can run the WACZ Enricher without a profile, but for more resilient archiving, it is recommended to create a profile.
|
||||
See the [Browsertrix documentation](https://crawler.docs.browsertrix.com/user-guide/browser-profiles/) for more information on how to use the `create-login-profile` tool.
|
||||
|
||||
|
||||
|
||||
### Docker in Docker
|
||||
If you are running Auto Archiver within a Docker container, you will need to enable Docker in Docker to run the `browsertrix-crawler` tool.
|
||||
This can be done by setting the `WACZ_ENABLE_DOCKER` environment variable to `1`.
|
||||
|
||||
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -86,6 +86,12 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||
if self.docker_in_docker:
|
||||
cmd.extend(["--cwd", self.cwd_dind])
|
||||
|
||||
if self.auth_for_site(url):
|
||||
# there's an auth for this site, but browsertrix only supports username/password auth
|
||||
logger.warning(
|
||||
"The WACZ enricher / Browsertrix does not support using the 'authentication' information for logging in. You should consider creating a Browser Profile for WACZ archiving. More information: https://auto-archiver.readthedocs.io/en/latest/modules/autogen/extractor/wacz_extractor_enricher.html#browsertrix-profiles"
|
||||
)
|
||||
|
||||
# call docker if explicitly enabled or we are running on the host (not in docker)
|
||||
if self.use_docker:
|
||||
logger.debug(f"generating WACZ in Docker for {url=}")
|
||||
@@ -188,7 +194,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||
shutil.copyfileobj(infile, outfile)
|
||||
|
||||
# get media out of .warc
|
||||
counter = 0
|
||||
counter_warc_files = 0
|
||||
counter_screenshots = 0
|
||||
seen_urls = set()
|
||||
|
||||
with open(warc_filename, "rb") as warc_stream:
|
||||
@@ -197,12 +204,12 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||
if (
|
||||
record.rec_type == "resource" and record.content_type == "image/png" and self.extract_screenshot
|
||||
): # screenshots
|
||||
fn = os.path.join(tmp_dir, f"warc-file-{counter}.png")
|
||||
fn = os.path.join(tmp_dir, f"warc-file-{counter_screenshots}.png")
|
||||
with open(fn, "wb") as outf:
|
||||
outf.write(record.raw_stream.read())
|
||||
m = Media(filename=fn)
|
||||
to_enrich.add_media(m, "browsertrix-screenshot")
|
||||
counter += 1
|
||||
to_enrich.add_media(m, f"browsertrix-screenshot-{counter_screenshots}")
|
||||
counter_screenshots += 1
|
||||
if not self.extract_media:
|
||||
continue
|
||||
|
||||
@@ -225,7 +232,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||
|
||||
# create local file and add media
|
||||
ext = mimetypes.guess_extension(content_type)
|
||||
warc_fn = f"warc-file-{counter}{ext}"
|
||||
warc_fn = f"warc-file-{counter_screenshots}{ext}"
|
||||
fn = os.path.join(tmp_dir, warc_fn)
|
||||
|
||||
record_url_best_qual = UrlUtil.twitter_best_quality_url(record_url)
|
||||
@@ -250,6 +257,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||
continue
|
||||
|
||||
to_enrich.add_media(m, warc_fn)
|
||||
counter += 1
|
||||
counter_warc_files += 1
|
||||
seen_urls.add(record_url)
|
||||
logger.info(f"WACZ extract_media/extract_screenshot finished, found {counter} relevant media file(s)")
|
||||
logger.info(
|
||||
f"WACZ extract_media/extract_screenshot finished, found {counter_warc_files + counter_screenshots} relevant media file(s)"
|
||||
)
|
||||
|
||||
@@ -10,14 +10,31 @@ from typing import Dict, Tuple
|
||||
import hashlib
|
||||
|
||||
import pytest
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
from auto_archiver.core.module import ModuleFactory
|
||||
|
||||
# Test names inserted into this list will be run last. This is useful for expensive/costly tests
|
||||
# that you only want to run if everything else succeeds (e.g. API calls). The order here is important
|
||||
# what comes first will be run first (at the end of all other tests not mentioned)
|
||||
# format is the name of the module (python file) without the .py extension
|
||||
TESTS_TO_RUN_LAST = ["test_twitter_api_archiver"]
|
||||
TESTS_TO_RUN_LAST = ["test_generic_archiver", "test_twitter_api_archiver"]
|
||||
|
||||
|
||||
# don't check for ytdlp updates in tests
|
||||
@pytest.fixture(autouse=True)
def skip_check_for_update(mocker):
    """Patch `GenericExtractor.update_package` so that it returns False in every test."""
    patch_target = "auto_archiver.modules.generic_extractor.generic_extractor.GenericExtractor.update_package"
    mocker.patch(patch_target).return_value = False
|
||||
|
||||
|
||||
@pytest.fixture
def get_lazy_module():
    """Return a helper that looks a module up lazily, by name, through a new ModuleFactory."""

    def _get_lazy_module(module_name):
        factory = ModuleFactory()
        return factory.get_module_lazy(module_name)

    return _get_lazy_module
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -134,12 +151,21 @@ def unpickle():
|
||||
|
||||
@pytest.fixture
def mock_binary_dependencies(mocker):
    """
    Make `subprocess.run` report success and `shutil.which` find every binary.

    Returns the `shutil.which` mock so a test can adjust it.
    """
    mocker.patch("subprocess.run").return_value = mocker.Mock(returncode=0)
    # Mock all binary dependencies as available
    which_mock = mocker.patch("shutil.which", return_value="/usr/bin/fake_binary")
    return which_mock
|
||||
|
||||
|
||||
@pytest.fixture
def sample_media(tmp_path) -> Media:
    """Fixture creating a Media object backed by a small text file in pytest's temporary directory"""
    source_path = tmp_path / "source.txt"
    source_path.write_text("test content")
    media = Media(_key="subdir/test.txt", filename=str(source_path))
    return media
|
||||
|
||||
|
||||
@pytest.fixture
def sample_datetime():
    """Fixed, timezone-aware reference datetime: 2023-01-01 12:00 UTC."""
    return datetime(year=2023, month=1, day=1, hour=12, minute=0, tzinfo=timezone.utc)
|
||||
|
||||
@@ -1,6 +1,11 @@
|
||||
from auto_archiver.core import Extractor
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class ExampleExtractor(Extractor):
|
||||
def download(self, item):
|
||||
print("download")
|
||||
logger.info("download")
|
||||
|
||||
def cleanup(self):
|
||||
logger.info("cleanup")
|
||||
|
||||
@@ -1,27 +1,29 @@
|
||||
from auto_archiver.core import Extractor, Enricher, Feeder, Database, Storage, Formatter, Metadata
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class ExampleModule(Extractor, Enricher, Feeder, Database, Storage, Formatter):
|
||||
def download(self, item):
|
||||
print("download")
|
||||
logger.info("download")
|
||||
|
||||
def __iter__(self):
|
||||
yield Metadata().set_url("https://example.com")
|
||||
|
||||
def done(self, result):
|
||||
print("done")
|
||||
logger.info("done")
|
||||
|
||||
def enrich(self, to_enrich):
|
||||
print("enrich")
|
||||
logger.info("enrich")
|
||||
|
||||
def get_cdn_url(self, media):
|
||||
return "nice_url"
|
||||
|
||||
def save(self, item):
|
||||
print("save")
|
||||
logger.info("save")
|
||||
|
||||
def uploadf(self, file, key, **kwargs):
|
||||
print("uploadf")
|
||||
logger.info("uploadf")
|
||||
|
||||
def format(self, item):
|
||||
print("format")
|
||||
logger.info("format")
|
||||
|
||||
BIN
tests/data/timestamping/digicert.tsr
Normal file
BIN
tests/data/timestamping/digicert.tsr
Normal file
Binary file not shown.
BIN
tests/data/timestamping/rfc3161-client-issue-104.tsr
Normal file
BIN
tests/data/timestamping/rfc3161-client-issue-104.tsr
Normal file
Binary file not shown.
BIN
tests/data/timestamping/self_signed.tsr
Normal file
BIN
tests/data/timestamping/self_signed.tsr
Normal file
Binary file not shown.
Binary file not shown.
BIN
tests/data/timestamping/valid_timestamp.tsr
Normal file
BIN
tests/data/timestamping/valid_timestamp.tsr
Normal file
Binary file not shown.
215
tests/enrichers/test_timestamping_enricher.py
Normal file
215
tests/enrichers/test_timestamping_enricher.py
Normal file
@@ -0,0 +1,215 @@
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
from rfc3161_client import (
|
||||
TimeStampResponse,
|
||||
decode_timestamp_response,
|
||||
)
|
||||
import requests
|
||||
|
||||
from auto_archiver.modules.timestamping_enricher.timestamping_enricher import TimestampingEnricher
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
@pytest.fixture
def timestamp_response() -> TimeStampResponse:
    """Decoded timestamp response loaded from the `valid_timestamp.tsr` test file."""
    raw_response = Path("tests/data/timestamping/valid_timestamp.tsr").read_bytes()
    return decode_timestamp_response(raw_response)
|
||||
|
||||
|
||||
@pytest.fixture
def wrong_order_timestamp_response() -> TimeStampResponse:
    """Decoded timestamp response loaded from the `rfc3161-client-issue-104.tsr` test file."""
    raw_response = Path("tests/data/timestamping/rfc3161-client-issue-104.tsr").read_bytes()
    return decode_timestamp_response(raw_response)
|
||||
|
||||
|
||||
@pytest.fixture
def selfsigned_response() -> TimeStampResponse:
    """Decoded timestamp response loaded from the `self_signed.tsr` test file."""
    raw_response = Path("tests/data/timestamping/self_signed.tsr").read_bytes()
    return decode_timestamp_response(raw_response)
|
||||
|
||||
|
||||
@pytest.fixture
def digicert_response() -> bytes:
    """
    Raw bytes of the `digicert.tsr` test file.

    Unlike the other response fixtures this one is not decoded: the bytes are
    meant to be served as HTTP response content so that decoding happens (and
    fails) inside the code under test.
    """
    with open("tests/data/timestamping/digicert.tsr", "rb") as f:
        return f.read()
|
||||
|
||||
|
||||
@pytest.fixture
def filehash():
    """Hex digest string used as the media hash in the timestamping tests."""
    digest = "4b7b4e39f12b8c725e6e603e6d4422500316df94211070682ef10260ff5759ef"
    return digest
|
||||
|
||||
|
||||
@pytest.mark.download
def test_enriching(setup_module, sample_media):
    """Run `enrich` with the module's default configuration (marked as a download test)."""
    # tests the current TSAs set as default in the __manifest__ to make sure they are all still working
    enricher: TimestampingEnricher = setup_module("timestamping_enricher")

    # test the enrich method
    sample_media.set("hash", "4b7b4e39f12b8c725e6e603e6d4422500316df94211070682ef10260ff5759ef")
    item = Metadata().set_url("https://example.com")
    item.add_media(sample_media)
    enricher.enrich(item)
|
||||
|
||||
|
||||
def test_full_enriching_selfsigned(setup_module, sample_media, mocker, selfsigned_response, filehash):
    """A self-signed TSA response only yields timestamp media once `allow_selfsigned` is enabled."""
    mocker.patch("requests.sessions.Session.post").return_value.status_code = 200
    mocker.patch(
        "auto_archiver.modules.timestamping_enricher.timestamping_enricher.decode_timestamp_response",
        return_value=selfsigned_response,
    )

    enricher: TimestampingEnricher = setup_module(
        "timestamping_enricher", {"tsa_urls": ["http://timestamp.identrust.com"]}
    )
    item = Metadata().set_url("https://example.com")
    sample_media.set("hash", filehash)
    item.add_media(sample_media)

    enricher.enrich(item)
    assert len(item.media) == 1  # doesn't allow self-signed

    # set self-signed on tsp
    enricher.allow_selfsigned = True
    enricher.enrich(item)
    assert len(item.media) == 2
|
||||
|
||||
|
||||
def test_full_enriching(setup_module, sample_media, mocker, timestamp_response, filehash):
    """With a single mocked TSA, `enrich` adds the hashes file, one token and a three-certificate chain."""
    mocker.patch("requests.sessions.Session.post").return_value.status_code = 200
    mocker.patch(
        "auto_archiver.modules.timestamping_enricher.timestamping_enricher.decode_timestamp_response",
        return_value=timestamp_response,
    )

    tsp: TimestampingEnricher = setup_module("timestamping_enricher", {"tsa_urls": ["http://timestamp.identrust.com"]})
    item = Metadata().set_url("https://example.com")
    sample_media.set("hash", filehash)
    item.add_media(sample_media)
    tsp.enrich(item)

    assert item.get("timestamped") is True
    assert len(item.media) == 2  # the original 'sample_media' and the new 'timestamp_media'

    hashes_media = item.media[1]
    assert hashes_media.filename == f"{tsp.tmp_dir}/hashes.txt"
    assert Path(hashes_media.filename).read_text() == filehash

    # we only have one authority file because we only used one TSA
    authority_files = hashes_media.get("timestamp_authority_files")
    assert len(authority_files) == 1
    token_media = authority_files[0]
    assert Path(token_media.filename).read_bytes() == timestamp_response.time_stamp_token()

    cert_chain = token_media.get("cert_chain")
    assert len(cert_chain) == 3
    expected_serials = ["85078758028491331763", "85078371663472981624", "13298821034946342390"]
    for position, serial in enumerate(expected_serials, start=1):
        assert cert_chain[position - 1].filename == f"{tsp.tmp_dir}/{position} – {serial}.crt"
|
||||
|
||||
|
||||
def test_full_enriching_multiple_tsa(setup_module, sample_media, mocker, timestamp_response, filehash):
    """With two mocked TSAs, `enrich` stores one token and certificate chain per TSA."""
    mocker.patch("requests.sessions.Session.post").return_value.status_code = 200
    mocker.patch(
        "auto_archiver.modules.timestamping_enricher.timestamping_enricher.decode_timestamp_response",
        return_value=timestamp_response,
    )

    tsa_urls = ["http://example.com/timestamp1", "http://example.com/timestamp2"]
    tsp: TimestampingEnricher = setup_module("timestamping_enricher", {"tsa_urls": tsa_urls})
    item = Metadata().set_url("https://example.com")
    sample_media.set("hash", filehash)
    item.add_media(sample_media)
    tsp.enrich(item)

    assert item.get("timestamped") is True
    assert len(item.media) == 2  # the original 'sample_media' and the new 'timestamp_media'

    authority_files = item.media[1].get("timestamp_authority_files")
    assert len(authority_files) == 2
    for token_media in authority_files:
        assert Path(token_media.filename).read_bytes() == timestamp_response.time_stamp_token()
        assert len(token_media.get("cert_chain")) == 3
|
||||
|
||||
|
||||
def test_fails_for_digicert(setup_module, mocker, digicert_response):
    """
    Digicert TSRs are not compliant with RFC 3161.
    See https://github.com/trailofbits/rfc3161-client/issues/104#issuecomment-2621960840
    """
    # serve the stored Digicert bytes as the content of a successful HTTP response
    mocker.patch("requests.sessions.Session.post", return_value=requests.Response())
    mocker.patch("requests.Response.raise_for_status")
    mocker.patch("requests.Response.content", new_callable=mocker.PropertyMock, return_value=digicert_response)

    tsp: TimestampingEnricher = setup_module("timestamping_enricher")
    payload = b"4b7b4e39f12b8c725e6e603e6d4422500316df94211070682ef10260ff5759ef"

    with pytest.raises(ValueError) as excinfo:
        tsp.sign_data("http://timestamp.digicert.com", payload)
    assert "ASN.1 parse error: ParseError" in str(excinfo.value)
|
||||
|
||||
|
||||
@pytest.mark.download
def test_download_tsr(setup_module):
    """Sign data against the IdenTrust TSA, then verify the response and save its certificates.

    NOTE(review): nothing is mocked here and the test carries the ``download`` mark,
    so it presumably needs network access - confirm before running offline.
    """
    enricher: TimestampingEnricher = setup_module("timestamping_enricher")
    payload = b"4b7b4e39f12b8c725e6e603e6d4422500316df94211070682ef10260ff5759ef"

    response: TimeStampResponse = enricher.sign_data("http://timestamp.identrust.com", payload)
    assert isinstance(response, TimeStampResponse)

    root_cert = enricher.verify_signed(response, payload)
    assert root_cert.subject.rfc4514_string() == "CN=IdenTrust Commercial Root CA 1,O=IdenTrust,C=US"

    # test downloading the cert
    saved_chain = enricher.save_certificate(response, root_cert)
    assert len(saved_chain) == 3
|
||||
|
||||
|
||||
def test_verify_save(setup_module, timestamp_response):
    """Verify the ``timestamp_response`` fixture and check the saved certificate chain.

    Asserts the subject of the verified root certificate, that three certificates
    are saved, and the exact filename of each one inside the module's ``tmp_dir``.
    """
    enricher: TimestampingEnricher = setup_module("timestamping_enricher")
    signed_payload = b"4b7b4e39f12b8c725e6e603e6d4422500316df94211070682ef10260ff5759ef"

    root_cert = enricher.verify_signed(timestamp_response, signed_payload)
    assert root_cert.subject.rfc4514_string() == "CN=IdenTrust Commercial Root CA 1,O=IdenTrust,C=US"

    saved_chain = enricher.save_certificate(timestamp_response, root_cert)
    assert len(saved_chain) == 3

    # Expected names use an en dash ("–") between the position and the number.
    expected_names = (
        "1 – 85078758028491331763.crt",
        "2 – 85078371663472981624.crt",
        "3 – 13298821034946342390.crt",
    )
    assert [saved.filename for saved in saved_chain] == [f"{enricher.tmp_dir}/{name}" for name in expected_names]
|
||||
|
||||
|
||||
def test_order_crt_correctly(setup_module, wrong_order_timestamp_response):
    """Check that ``tst_certs`` returns the two certificates in the expected order.

    reference: https://github.com/trailofbits/rfc3161-client/issues/104#issuecomment-2711244010
    """
    enricher: TimestampingEnricher = setup_module("timestamping_enricher")

    # get the certificates, make sure the reordering is working:
    certs = enricher.tst_certs(wrong_order_timestamp_response)
    assert len(certs) == 2

    subjects = [cert.subject.rfc4514_string() for cert in certs]
    assert subjects == [
        "CN=TrustID Timestamp Authority,O=IdenTrust,C=US",
        "CN=TrustID Timestamping CA 3,O=IdenTrust,C=US",
    ]
|
||||
|
||||
|
||||
def test_invalid_tsa_invalid_response(setup_module, mocker):
    """An ``HTTPError`` raised by ``raise_for_status`` must propagate out of ``sign_data``."""
    mocker.patch("requests.sessions.Session.post", return_value=requests.Response())
    mocker.patch(
        "requests.Response.raise_for_status",
        side_effect=requests.exceptions.HTTPError("404 Client Error"),
    )
    enricher = setup_module("timestamping_enricher")

    with pytest.raises(requests.exceptions.HTTPError, match="404 Client Error"):
        enricher.sign_data("http://bellingcat.com/page-not-found/", b"my-message")
|
||||
|
||||
|
||||
def test_fail_on_selfsigned_cert(setup_module, selfsigned_response):
    """``verify_signed`` is expected to return ``None`` for the ``selfsigned_response`` fixture."""
    enricher = setup_module("timestamping_enricher")
    assert enricher.verify_signed(selfsigned_response, b"my-message") is None
|
||||
@@ -119,4 +119,4 @@ def test_extract_media(wacz_enricher, metadata, tmp_path, mocker) -> None:
|
||||
metadata.add_media(Media("something.wacz"), "browsertrix")
|
||||
wacz_enricher.extract_media_from_wacz(metadata, str(wacz_file))
|
||||
assert len(metadata.media) == 2
|
||||
assert metadata.media[1].properties.get("id") == "browsertrix-screenshot"
|
||||
assert metadata.media[1].properties.get("id") == "browsertrix-screenshot-0"
|
||||
|
||||
@@ -25,5 +25,5 @@ class TestExtractorBase(object):
|
||||
else:
|
||||
assert status == test_response.status
|
||||
|
||||
assert title == test_response.get_title()
|
||||
assert timestamp, test_response.get("timestamp")
|
||||
assert title in test_response.get_title()
|
||||
assert timestamp == test_response.get("timestamp")
|
||||
|
||||
@@ -29,6 +29,7 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
"proxy": None,
|
||||
"cookies_from_browser": False,
|
||||
"cookie_file": None,
|
||||
"pot_provider": False,
|
||||
}
|
||||
|
||||
def test_load_dropin(self):
|
||||
@@ -36,7 +37,7 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
package = "auto_archiver.modules.generic_extractor"
|
||||
assert self.extractor.dropin_for_name("bluesky", package=package)
|
||||
|
||||
# test loading dropings via filepath
|
||||
# test loading dropins via filepath
|
||||
path = os.path.join(dirname(dirname(__file__)), "data/")
|
||||
assert self.extractor.dropin_for_name("dropin", additional_paths=[path])
|
||||
|
||||
@@ -121,7 +122,7 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
== "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
|
||||
)
|
||||
assert len(result.media) == 2
|
||||
assert Path(result.media[0].filename).name == "J---aiyznGQ.webm"
|
||||
assert "J---aiyznGQ" in Path(result.media[0].filename).name
|
||||
assert Path(result.media[1].filename).name == "hqdefault.jpg"
|
||||
|
||||
@pytest.mark.download
|
||||
@@ -218,7 +219,7 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
post = self.extractor.download(make_item(url))
|
||||
self.assertValidResponseMetadata(
|
||||
post,
|
||||
"Bellingcat - This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services",
|
||||
"Bellingcat - This month's Bellingchat Premium is with @KolinaKoltai",
|
||||
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc),
|
||||
)
|
||||
|
||||
@@ -291,3 +292,42 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
post = self.extractor.download(make_item(url))
|
||||
assert "Bellingcat researcher Kolina Koltai delves deeper into Clothoff" in post.get("content")
|
||||
assert post.get_title() == "Bellingcat"
|
||||
|
||||
|
||||
class TestGenericExtractorPoToken:
    """Checks when ``setup_po_tokens`` calls ``setup_token_generation_script``.

    Each test sets ``bguils_po_token_method`` (and either ``in_docker`` or the
    process environment), runs ``setup_po_tokens`` and inspects the mocked script setup.
    """

    @pytest.fixture
    def extractor(self, mocker):
        """Return a ``GenericExtractor`` whose ``setup_token_generation_script`` is a mock."""
        instance = GenericExtractor()
        instance.extractor_args = {}
        instance.setup_token_generation_script = mocker.Mock()
        return instance

    def test_po_token_disabled_does_not_call_setup(self, extractor):
        """Method "disabled": the script setup is not called even with ``in_docker`` set to True."""
        extractor.bguils_po_token_method = "disabled"
        extractor.in_docker = True

        extractor.setup_po_tokens()

        setup_script = extractor.setup_token_generation_script
        setup_script.assert_not_called()

    def test_po_token_default_in_docker_calls_setup(self, extractor, mocker):
        """Method "auto" with ``RUNNING_IN_DOCKER=1`` in the environment: script setup runs once."""
        extractor.bguils_po_token_method = "auto"
        mocker.patch.dict(os.environ, {"RUNNING_IN_DOCKER": "1"})

        extractor.setup_po_tokens()

        setup_script = extractor.setup_token_generation_script
        setup_script.assert_called_once()

    def test_po_token_default_local_does_not_call_setup(self, extractor, caplog, mocker):
        """Method "auto" with an empty environment: no script setup, and a notice is logged."""
        extractor.bguils_po_token_method = "auto"
        # clears env vars for this test
        mocker.patch.dict(os.environ, {}, clear=True)

        extractor.setup_po_tokens()

        setup_script = extractor.setup_token_generation_script
        setup_script.assert_not_called()
        assert "Proof of Origin Token method not explicitly set" in caplog.text

    def test_po_token_script_always_calls_setup(self, extractor):
        """Method "script": script setup runs once per call, whatever ``in_docker`` is."""
        extractor.bguils_po_token_method = "script"
        for in_docker in (False, True):
            # Start each round from a clean call count (a no-op on the first pass).
            extractor.setup_token_generation_script.reset_mock()
            extractor.in_docker = in_docker
            extractor.setup_po_tokens()
            extractor.setup_token_generation_script.assert_called_once()
|
||||
|
||||
@@ -68,6 +68,12 @@ def test_download_invalid(extractor, metadata_sample, mocker):
|
||||
assert extractor.download(metadata_sample) is False
|
||||
|
||||
|
||||
def test_fails_with_empty_response(extractor, metadata_sample, mocker):
    """``download`` must return ``False`` when ``_process_messages`` yields an empty string."""
    bot_reply = (mocker.MagicMock(), 101)
    mocker.patch.object(extractor, "_send_url_to_bot", return_value=bot_reply)
    mocker.patch.object(extractor, "_process_messages", return_value="")

    outcome = extractor.download(metadata_sample)
    assert outcome is False
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Requires authentication.")
|
||||
class TestInstagramTbotExtractorReal(TestExtractorBase):
|
||||
# To run these tests set the TELEGRAM_API_ID and TELEGRAM_API_HASH environment variables, and ensure the session file exists.
|
||||
|
||||
26
tests/extractors/test_telethon_extractor.py
Normal file
26
tests/extractors/test_telethon_extractor.py
Normal file
@@ -0,0 +1,26 @@
|
||||
import os
|
||||
from datetime import date
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def mock_client_setup(mocker):
    """Patch telethon's ``AuthMethods.start`` for every test in this module (autouse)."""
    start_target = "telethon.client.auth.AuthMethods.start"
    mocker.patch(start_target)
|
||||
|
||||
|
||||
def test_setup_fails_clear_session_file(get_lazy_module, tmp_path, mocker):
    """Loading the module with a failing client ``start`` must raise and leave session files on disk.

    After the failed load, the configured session file must exist, the instance's
    ``session_file`` must contain ``telethon-<today>``, and ``<session_file>.session``
    must exist as well.
    """
    mocker.patch("telethon.client.auth.AuthMethods.start", side_effect=Exception("Test exception"))

    # make sure the default setup file is created
    session_path = tmp_path / "test.session"
    module_config = {"telethon_extractor": {"session_file": str(session_path), "api_id": 123, "api_hash": "ABC"}}

    lazy_module = get_lazy_module("telethon_extractor")
    with pytest.raises(Exception):
        lazy_module.load(module_config)

    assert session_path.exists()
    instance_session = lazy_module._instance.session_file
    assert f"telethon-{date.today().strftime('%Y-%m-%d')}" in instance_session
    assert os.path.exists(instance_session + ".session")
|
||||
@@ -1,77 +0,0 @@
|
||||
import pytest
|
||||
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.modules.vk_extractor import VkExtractor
|
||||
|
||||
|
||||
@pytest.fixture
def mock_vk_scraper(mocker):
    """Fixture to mock VkScraper."""
    patched_scraper = mocker.patch("auto_archiver.modules.vk_extractor.vk_extractor.VkScraper")
    return patched_scraper
|
||||
|
||||
|
||||
@pytest.fixture
def vk_extractor(setup_module, mock_vk_scraper) -> VkExtractor:
    """Fixture to initialize VkExtractor with mocked VkScraper."""
    extractor = setup_module(
        "vk_extractor",
        {
            "username": "name",
            "password": "password123",
            "session_file": "secrets/vk_config.v2.json",
        },
    )
    # Use the instance that the patched VkScraper class returns when called.
    extractor.vks = mock_vk_scraper.return_value
    return extractor
|
||||
|
||||
|
||||
def test_netloc(vk_extractor, metadata):
    """``download`` must return ``False`` for the ``metadata`` fixture's default URL."""
    # metadata url set as: "https://example.com/"
    outcome = vk_extractor.download(metadata)
    assert outcome is False
|
||||
|
||||
|
||||
def test_vk_url_but_scrape_returns_empty(vk_extractor, metadata):
    """A vk.com URL whose scrape returns nothing makes ``download`` return ``False``."""
    metadata.set_url("https://vk.com/valid-wall")
    vk_extractor.vks.scrape.return_value = []

    outcome = vk_extractor.download(metadata)

    assert outcome is False
    assert metadata.netloc == "vk.com"
    # The scraper must have been queried exactly once, with the metadata URL.
    vk_extractor.vks.scrape.assert_called_once_with(metadata.get_url())
|
||||
|
||||
|
||||
def test_successful_scrape_and_download(vk_extractor, metadata, mocker):
    """Two scraped posts and two downloaded files must produce a successful result.

    Title and timestamp are expected to come from the first scraped post, the
    second post's text must appear in the content, and one media entry per
    downloaded filename must be attached.
    """
    scraped_posts = [
        {"text": "Post Title", "datetime": "2023-01-01T00:00:00", "id": 1},
        {"text": "Another Post", "datetime": "2023-01-02T00:00:00", "id": 2},
    ]
    media_files = ["image1.jpg", "image2.png"]
    vk_extractor.vks.scrape.return_value = scraped_posts
    vk_extractor.vks.download_media.return_value = media_files
    metadata.set_url("https://vk.com/valid-wall")

    outcome = vk_extractor.download(metadata)

    # Test metadata
    assert outcome.is_success()
    assert outcome.status == "vk: success"
    assert outcome.get_title() == "Post Title"
    assert outcome.get_timestamp() == "2023-01-01T00:00:00+00:00"
    assert "Another Post" in outcome.metadata["content"]

    # Test Media objects
    assert len(outcome.media) == 2
    first_media, second_media = outcome.media
    assert first_media.filename == "image1.jpg"
    assert second_media.filename == "image2.png"
    vk_extractor.vks.download_media.assert_called_once_with(scraped_posts, vk_extractor.tmp_dir)
|
||||
|
||||
|
||||
def test_adds_first_title_and_timestamp(vk_extractor):
    """Title and timestamp of the result must come from the first scraped entry.

    Uses a fresh ``Metadata`` carrying only a vk.com URL and no downloaded media.
    """
    # The URL is set once here; the original repeated the identical set_url call.
    metadata = Metadata().set_url("https://vk.com/no-metadata")
    mock_scrapes = [
        {"text": "value", "datetime": "2023-01-01T00:00:00"},
        {"text": "value2", "datetime": "2023-01-02T00:00:00"},
    ]
    vk_extractor.vks.scrape.return_value = mock_scrapes
    vk_extractor.vks.download_media.return_value = []

    result = vk_extractor.download(metadata)

    assert result.get_title() == "value"
    # formatted timestamp
    assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
    assert result.is_success()
|
||||
@@ -237,3 +237,23 @@ def test_wrong_step_type(test_args, caplog):
|
||||
with pytest.raises(SetupError) as err:
|
||||
orchestrator.setup(args)
|
||||
assert "Module 'example_extractor' is not a feeder" in str(err.value)
|
||||
|
||||
|
||||
def test_load_failed_extractor_cleanup(test_args, mocker, caplog):
    """When a module's ``setup`` raises, the setup error and a 'cleanup' message must be logged."""
    orchestrator = ArchivingOrchestrator()

    # hack to set up the paths so we can patch properly
    orchestrator.module_factory.setup_paths([TEST_MODULES])

    # patch example_module.setup to throw an exception
    mocker.patch(
        "auto_archiver.modules.example_extractor.example_extractor.ExampleExtractor.setup",
        side_effect=Exception("Test exception"),
    )

    cli_args = test_args + ["--extractors", "example_extractor"]
    with pytest.raises(Exception):
        orchestrator.setup(cli_args)

    logged = caplog.text
    assert "Error during setup of modules: Test exception" in logged
    # make sure the 'cleanup' is called
    assert "cleanup" in logged
|
||||
|
||||
Reference in New Issue
Block a user