mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 03:18:28 +03:00
Merge branch 'main' into feat/yt-dlp-pots
# Conflicts: # src/auto_archiver/modules/generic_extractor/__manifest__.py
This commit is contained in:
6
.github/workflows/docker-publish.yaml
vendored
6
.github/workflows/docker-publish.yaml
vendored
@@ -11,7 +11,7 @@ on:
|
||||
|
||||
env:
|
||||
# Use docker.io for Docker Hub if empty
|
||||
REGISTRY: ghcr.io
|
||||
REGISTRY: docker.io
|
||||
# github.repository as <account>/<repo>
|
||||
IMAGE_NAME: ${{ github.repository }}
|
||||
|
||||
@@ -45,10 +45,12 @@ jobs:
|
||||
images: bellingcat/auto-archiver
|
||||
|
||||
- name: Build and push Docker image
|
||||
uses: docker/build-push-action@v2
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
platforms: linux/amd64,linux/arm64
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:cache
|
||||
cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:cache,mode=max
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -33,3 +33,5 @@ dist*
|
||||
docs/_build/
|
||||
docs/source/autoapi/
|
||||
docs/source/modules/autogen/
|
||||
scripts/settings_page.html
|
||||
.vite
|
||||
|
||||
@@ -9,6 +9,7 @@ build:
|
||||
os: ubuntu-22.04
|
||||
tools:
|
||||
python: "3.10"
|
||||
nodejs: "22"
|
||||
jobs:
|
||||
post_install:
|
||||
- pip install poetry
|
||||
@@ -17,6 +18,11 @@ build:
|
||||
# See https://github.com/readthedocs/readthedocs.org/pull/11152/
|
||||
- VIRTUAL_ENV=$READTHEDOCS_VIRTUALENV_PATH poetry install --with docs
|
||||
|
||||
# generate the config editor page. Schema then HTML
|
||||
- VIRTUAL_ENV=$READTHEDOCS_VIRTUALENV_PATH poetry run python scripts/generate_settings_schema.py
|
||||
# install node dependencies and build the settings
|
||||
- cd scripts/settings && npm install && npm run build && yes | cp dist/index.html ../../docs/source/installation/settings_base.html && cd ../..
|
||||
|
||||
|
||||
sphinx:
|
||||
configuration: docs/source/conf.py
|
||||
|
||||
15
Dockerfile
15
Dockerfile
@@ -7,13 +7,24 @@ ENV RUNNING_IN_DOCKER=1 \
|
||||
PYTHONFAULTHANDLER=1 \
|
||||
PATH="/root/.local/bin:$PATH"
|
||||
|
||||
|
||||
ARG TARGETARCH
|
||||
|
||||
# Installing system dependencies
|
||||
RUN add-apt-repository ppa:mozillateam/ppa && \
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends gcc ffmpeg fonts-noto exiftool && \
|
||||
apt-get install -y --no-install-recommends firefox-esr && \
|
||||
ln -s /usr/bin/firefox-esr /usr/bin/firefox && \
|
||||
wget https://github.com/mozilla/geckodriver/releases/download/v0.35.0/geckodriver-v0.35.0-linux64.tar.gz && \
|
||||
ln -s /usr/bin/firefox-esr /usr/bin/firefox
|
||||
|
||||
ARG GECKODRIVER_VERSION=0.36.0
|
||||
|
||||
RUN if [ $(uname -m) = "aarch64" ]; then \
|
||||
GECKODRIVER_ARCH=linux-aarch64; \
|
||||
else \
|
||||
GECKODRIVER_ARCH=linux64; \
|
||||
fi && \
|
||||
wget https://github.com/mozilla/geckodriver/releases/download/v${GECKODRIVER_VERSION}/geckodriver-v${GECKODRIVER_VERSION}-${GECKODRIVER_ARCH}.tar.gz && \
|
||||
tar -xvzf geckodriver* -C /usr/local/bin && \
|
||||
chmod +x /usr/local/bin/geckodriver && \
|
||||
rm geckodriver-v* && \
|
||||
|
||||
@@ -31,4 +31,5 @@ docker_development
|
||||
testing
|
||||
docs
|
||||
release
|
||||
settings_page
|
||||
```
|
||||
@@ -13,3 +13,8 @@
|
||||
manual release to docker hub
|
||||
* `docker image tag auto-archiver bellingcat/auto-archiver:latest`
|
||||
* `docker push bellingcat/auto-archiver`
|
||||
|
||||
|
||||
### Building the Settings Page
|
||||
|
||||
The Settings page is built as part of the python-publish workflow and packaged within the app.
|
||||
31
docs/source/development/settings_page.md
Normal file
31
docs/source/development/settings_page.md
Normal file
@@ -0,0 +1,31 @@
|
||||
# Configuration Editor
|
||||
|
||||
The [configuration editor](../installation/config_editor.md) is an easy-to-use UI for users to edit their auto-archiver settings.
|
||||
|
||||
The single-file app is built using React and vite. To get started developing the package, follow these steps:
|
||||
|
||||
1. Make sure you have Node v22 installed.
|
||||
|
||||
```{note} Tip: if you don't have node installed:
|
||||
|
||||
Use `nvm` to manage your node installations. Use:
|
||||
`curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.1/install.sh | bash` to install `nvm` and then `nvm i 22` to install Node v22
|
||||
```
|
||||
|
||||
2. Generate the `schema.json` file for the currently installed modules using `python scripts/generate_settings_schema.py`
|
||||
3. Go to the settings folder `cd scripts/settings/` and build your environment with `npm i`
|
||||
4. Run a development version of the page with `npm run dev` and then open localhost:5173.
|
||||
5. Build a release version of the page with `npm run build`
|
||||
|
||||
A release version creates a single-file app called `dist/index.html`. This file should be copied to `docs/source/installation/settings_base.html` so that it can be integrated into the sphinx docs.
|
||||
|
||||
```{note}
|
||||
|
||||
The single-file app dist/index.html does not include any `<html>` or `<head>` tags as it is designed to be built into a RTD docs page. Edit `index.html` in the settings folder if you wish to modify the built page.
|
||||
```
|
||||
|
||||
## Readthedocs Integration
|
||||
|
||||
The configuration editor is built as part of the RTD deployment (see `.readthedocs.yaml` file). This command is run every time RTD is built:
|
||||
|
||||
`cd scripts/settings && npm install && npm run build && yes | cp dist/index.html ../../docs/source/installation/settings_base.html && cd ../..`
|
||||
@@ -46,7 +46,7 @@ First, we need to install an extension in our browser to export the cookies for
|
||||
|
||||
**2. Export the cookies**
|
||||
|
||||
```{note} See the note [here](../installation/authentication.md#recommendations-for-authentication) on why you shouldn't use your own personal account for achiving.
|
||||
```{note} See the note [here](../installation/authentication.md#recommendations-for-authentication) on why you shouldn't use your own personal account for archiving.
|
||||
```
|
||||
|
||||
Once the extension is installed in your preferred browser, login to Twitter in this browser, and then activate the extension and export the cookies. You can choose to export all your cookies for your browser, or just cookies for this specific site. In the image below, we're only exporting cookies for Twitter/x.com:
|
||||
|
||||
@@ -8,18 +8,25 @@ This guide explains how to set up Google Sheets to process URLs automatically an
|
||||
|
||||
### 1. Setting up your Google Sheet
|
||||
|
||||
Any Google sheet must have at least *one* column, with the name 'link' (you can change this name afterwards). This is the column with the URLs that you want the Auto Archiver to archive. Your sheet can have many other columns that the Auto Archiver can use, and you can also include any other columns for your own personal use.
|
||||
Any Google sheet must have at least *one* column, with the name 'link' (you can change this name afterwards). This is the column with the URLs that you want the Auto Archiver to archive.
|
||||
Your sheet can have many other columns that the Auto Archiver can use, and you can also include any additional columns for your own personal use. The order of the columns does not matter; each column name just needs to be correctly assigned to its corresponding value in the configuration file.
|
||||
|
||||
We recommend copying [this template Google Sheet](https://docs.google.com/spreadsheets/d/1NJZo_XZUBKTI1Ghlgi4nTPVvCfb0HXAs6j5tNGas72k/edit?usp=sharing) as a starting point for your project.
|
||||
We recommend copying [this template Google Sheet](https://docs.google.com/spreadsheets/d/1NJZo_XZUBKTI1Ghlgi4nTPVvCfb0HXAs6j5tNGas72k/edit?usp=sharing) as a starting point for your project, as this matches the default column names.
|
||||
|
||||
Here's an overview of all the columns, and what a complete sheet would look like.
|
||||
|
||||
Inputs:
|
||||
**Inputs:**
|
||||
|
||||
* **Link** *(required)*: the URL of the post to archive
|
||||
These are processed by the Gsheet Feeder and passed to the Auto Archiver.
|
||||
|
||||
* **Link** *(required)*: the URL of the post that is to be archived
|
||||
* **Destination folder**: custom folder for archived file (regardless of storage)
|
||||
|
||||
Outputs:
|
||||
**Outputs:**
|
||||
|
||||
These are updated by the Gsheet DB module during the archiving process.
|
||||
Note the required columns are only required if you are using the Gsheet DB module as well as the feeder.
|
||||
|
||||
* **Archive status** *(required)*: Status of archive operation
|
||||
* **Archive location**: URL of archived post
|
||||
* **Archive date**: Date archived
|
||||
@@ -33,9 +40,11 @@ Outputs:
|
||||
* **WACZ**: Link to a WACZ web archive of post
|
||||
* **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive
|
||||
|
||||
For example, this is a spreadsheet configured with all of the columns for the auto archiver and a few URLs to archive. (Note that the column names are not case sensitive.)
|
||||
For example, this is a spreadsheet configured with all of the columns for the auto archiver and a few URLs to archive.
|
||||
In this example the Gsheet Feeder and Gsheet DB are being used, and the archive is in progress.
|
||||
(Note that the column names are not case sensitive.)
|
||||
|
||||

|
||||

|
||||
|
||||
We'll change the name of the 'Destination Folder' column in step 3.
|
||||
|
||||
@@ -51,43 +60,47 @@ Once you've downloaded the file, save it to `secrets/service_account.json`
|
||||
|
||||
Now that you've set up your Google sheet, and you've set up the service account so Auto Archiver can access the sheet, the final step is to set your configuration.
|
||||
|
||||
First, make sure you have `gsheet_feeder` set in the `steps.feeders` section of your config. If you wish to store the results of the archiving process back in your Google sheet, make sure to also set the `ghseet_db` settig in the `steps.databases` section. Here's how this might look:
|
||||
First, make sure you have `gsheet_feeder_db` set in the `steps.feeders` section of your config. If you wish to store the results of the archiving process back in your Google sheet, make sure to also set the `gsheet_feeder_db` setting in the `steps.databases` section. Here's how this might look:
|
||||
|
||||
```{code} yaml
|
||||
steps:
|
||||
feeders:
|
||||
- gsheet_feeder
|
||||
- gsheet_feeder_db
|
||||
...
|
||||
databases:
|
||||
- gsheet_db # optional, if you also want to store the results in the Google sheet
|
||||
- gsheet_feeder_db # optional, if you also want to store the results in the Google sheet and track the status of active archivals.
|
||||
...
|
||||
```
|
||||
|
||||
Next, set up the `gsheet_feeder` configuration settings in the 'Configurations' part of the config `orchestration.yaml` file. Open up he file, and set the `gsheet_feeder.sheet` setting or the `gsheet_feeder.sheet_id` setting. The `sheet` should be the name of your sheet, as it shows in the top left of the sheet. For example, the sheet [here](https://docs.google.com/spreadsheets/d/1NJZo_XZUBKTI1Ghlgi4nTPVvCfb0HXAs6j5tNGas72k/edit?gid=0#gid=0) is called 'Public Auto Archiver template'.
|
||||
Next, set up the `gsheet_feeder_db` configuration settings in the 'Configurations' part of the config `orchestration.yaml` file. Open up the file, and set the `gsheet_feeder_db.sheet` setting or the `gsheet_feeder_db.sheet_id` setting. The `sheet` should be the name of your sheet, as it shows in the top left of the sheet.
|
||||
For example, the sheet [here](https://docs.google.com/spreadsheets/d/1NJZo_XZUBKTI1Ghlgi4nTPVvCfb0HXAs6j5tNGas72k/edit?gid=0#gid=0) is called 'Public Auto Archiver template'.
|
||||
|
||||
Here's how this might look:
|
||||
|
||||
```{code} yaml
|
||||
...
|
||||
gsheet_feeder:
|
||||
gsheet_feeder_db:
|
||||
sheet: 'My Awesome Sheet'
|
||||
...
|
||||
```
|
||||
|
||||
You can also pass these settings directly on the command line without having to edit the file. Here's an example of how to do that (using docker):
|
||||
|
||||
`docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver:dockerize --gsheet_feeder.sheet "Auto archive test 2023-2"`.
|
||||
`docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver:dockerize --gsheet_feeder_db.sheet "My Awesome Sheet 2"`.
|
||||
|
||||
Here, the sheet name has been overridden/specified in the command line invocation.
|
||||
|
||||
### 3a. (Optional) Changing the column names
|
||||
|
||||
In step 1, we said we would change the name of the 'Destination Folder'. Perhaps you don't like this name, or already have a sheet with a different name. In our example here, we want to name this column 'Save Folder'. To do this, we need to edit the `ghseet_feeder.column` setting in the configuration file. For more information on this setting, see the [Gsheet Feeder docs](../modules/autogen/feeder/gsheet_feeder.md#configuration-options). We will first copy the default settings from the Gsheet Feeder docs for the 'column' settings, and then edit the 'Destination Folder' section to rename it 'Save Folder'. Our final configuration section looks like:
|
||||
In step 1, we said we would change the name of the 'Destination Folder'. Perhaps you don't like this name, or already have a sheet with a different name. In our example here, we want to name this column 'Save Folder'. To do this, we need to edit the `gsheet_feeder_db.column` setting in the configuration file.
|
||||
For more information on this setting, see the [Gsheet Feeder Database docs](../modules/autogen/feeder/gsheet_feeder_db.md#configuration-options). We will first copy the default settings from the Gsheet Feeder docs for the 'column' settings, and then edit the 'Destination Folder' section to rename it 'Save Folder'. Our final configuration section looks like:
|
||||
|
||||
```{code} yaml
|
||||
...
|
||||
gsheet_feeder:
|
||||
gsheet_feeder_db:
|
||||
sheet: 'My Awesome Sheet'
|
||||
header: 1
|
||||
service_account: secrets/service_account.json
|
||||
columns:
|
||||
url: link
|
||||
status: archive status
|
||||
@@ -103,20 +116,44 @@ gsheet_feeder:
|
||||
pdq_hash: perceptual hashes
|
||||
wacz: wacz
|
||||
replaywebpage: replaywebpage
|
||||
|
||||
```
|
||||
## 4. Running the Auto Archiver
|
||||
### Feeding the URLs to the Auto Archiver
|
||||
|
||||
## Viewing the Results after archiving
|
||||
The URLs to be archived should be added to the Google Sheet, and optionally a folder value. Leave all the other configured columns empty (but you may add additional columns for your own use, as long as they don't conflict with the column names mapped in the configuration file).
|
||||
The Auto Archiver will archive any URLs which have an empty 'status' column.
|
||||
|
||||
With the `ghseet_db` installed, once you start running the Auto Archiver, it will updates the "Archive status" column.
|
||||
### Viewing the Results after archiving
|
||||
|
||||

|
||||
With the `gsheet_feeder_db` installed, once you start running the Auto Archiver, it will update the "Archive status" column.
|
||||
The status will be set to "Archive in progress" once the archival starts. If the archival is stopped during a run, either manually or because an error is raised, the status value should be cleared.
|
||||
|
||||

|
||||
|
||||
The links are downloaded and archived, and the spreadsheet is updated to the following:
|
||||
|
||||

|
||||

|
||||
|
||||
Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
|
||||
Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder_db.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
|
||||
|
||||
The "archive location" link contains the path of the archived file, in local storage, S3, or in Google Drive.
|
||||
|
||||

|
||||

|
||||
|
||||
### Troubleshooting
|
||||
|
||||
**Hanging Archival in progress status**
|
||||
|
||||
Occasionally system crashes or other unexpected events can cause the Auto Archiver to exit without cleaning up the status value.
|
||||
If you are sure that all archival processes have stopped but you still see "Archive in progress" in the status column, you can manually clear the status column to allow the Auto Archiver to retry that archival on the next run.
|
||||
|
||||
**Nothing archived status**
|
||||
|
||||
Sometimes this means the tool is genuinely unable to extract the content at this point in time, but sometimes it can be resolved with different configurations.
|
||||
Try:
|
||||
- Turning on additional 'extractor' types in the configuration file (this can appear as 'no archiver' in the status column).
|
||||
- Changing credentials or refreshing session files for extractors which require them
|
||||
- Check if the extractors can accept any additional configurations such as adding a cookie file.
|
||||
|
||||
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
# Upgrading to v0.13
|
||||
# Upgrading from v0.12
|
||||
|
||||
```{note} This how-to is only relevant for people who used Auto Archiver before February 2025 (versions prior to 0.13).
|
||||
|
||||
If you are new to Auto Archiver, then you are already using the latest configuration format and this how-to is not relevant for you.
|
||||
```
|
||||
|
||||
Version 0.13 of Auto Archiver has breaking changes in the configuration format, which means earlier configuration formats will not work without slight modifications.
|
||||
Versions 0.13+ of Auto Archiver have breaking changes in the configuration format, which means earlier configuration formats will not work without slight modifications.
|
||||
|
||||
## How do I know if I need to update my configuration format?
|
||||
|
||||
@@ -22,15 +22,13 @@ your configuration file or on the command line (using --feeders)
|
||||
```{code} yaml
|
||||
|
||||
steps:
|
||||
feeder: gsheet_feeder
|
||||
feeder: cli_feeder
|
||||
...
|
||||
```
|
||||
|
||||
## Updating your configuration file
|
||||
The next two sections outline the two methods you have for updating your file.
|
||||
|
||||
To update your configuration file, you can either:
|
||||
|
||||
### 1. Manually edit the configuration file and change the values.
|
||||
## 1. Manually edit the configuration file and change the values.
|
||||
|
||||
This is recommended if you want to keep all your old settings. Follow the steps below to change the relevant settings:
|
||||
|
||||
@@ -75,28 +73,49 @@ The names of the actual modules have also changed, so for any extractor modules
|
||||
- `wayback_archiver_enricher` → `wayback_extractor_enricher`
|
||||
- `vk_archiver` → `vk_extractor`
|
||||
|
||||
Additionally, the `youtube_archiver` has been renamed to `generic_extractor` as it is considered the default/fallback extractor. Read more about the [generic extractor](../modules/autogen/extractor/generic_extractor.md).
|
||||
|
||||
#### c) Module Renaming
|
||||
|
||||
|
||||
The `youtube_archiver` has been renamed to `generic_extractor` as it is considered the default/fallback extractor. Read more about the [generic extractor](../modules/autogen/extractor/generic_extractor.md).
|
||||
|
||||
The `atlos` modules have been merged into one, as have the `gsheets` feeder and database.
|
||||
|
||||
- `atlos_feeder` → `atlos_feeder_db_storage`
|
||||
- `atlos_storage` → `atlos_feeder_db_storage`
|
||||
- `atlos_db` → `atlos_feeder_db_storage`
|
||||
- `gsheet_feeder` → `gsheet_feeder_db`
|
||||
- `gsheet_db` → `gsheet_feeder_db`
|
||||
|
||||
|
||||
Example:
|
||||
```{code} yaml
|
||||
steps:
|
||||
feeders:
|
||||
- gsheet_feeder_db # formerly gsheet_feeder
|
||||
...
|
||||
archivers:
|
||||
- telethon_archiver
|
||||
- youtube_archiver
|
||||
- vk_archiver
|
||||
|
||||
# renaming 'archiver' to 'extractor', and renaming the youtube_archiver the above config will become:
|
||||
steps:
|
||||
extractors: # formerly 'archivers'
|
||||
- telethon_extractor # formerly telethon_archiver
|
||||
- generic_extractor # formerly youtube_archiver
|
||||
- vk_extractor # formerly vk_archiver
|
||||
databases:
|
||||
- gsheet_feeder_db # formerly gsheet_db
|
||||
...
|
||||
extractors:
|
||||
- telethon_extractor
|
||||
- vk_extractor
|
||||
- generic_extractor
|
||||
|
||||
```
|
||||
|
||||
#### c) Redundant / Obsolete Modules
|
||||
```{note}
|
||||
|
||||
Don't forget to also rename the configuration settings. For example:
|
||||
|
||||
```{code} yaml
|
||||
gsheet_feeder_db: # formerly gsheet_feeder
|
||||
service_account: secrets/service_account.json
|
||||
sheet: My Google Sheet
|
||||
...
|
||||
```
|
||||
|
||||
#### d) Redundant / Obsolete Modules
|
||||
|
||||
With v0.13 of Auto Archiver, the following modules have been removed and their features have been built in to the generic_extractor. You should remove them from the 'steps' section of your configuration file:
|
||||
|
||||
@@ -104,7 +123,7 @@ With v0.13 of Auto Archiver, the following modules have been removed and their f
|
||||
* `tiktok_archiver` - use the `generic_extractor` to extract TikTok videos.
|
||||
|
||||
|
||||
### 2. Auto-generate a new config, then copy over your settings.
|
||||
## 2. Auto-generate a new config, then copy over your settings.
|
||||
|
||||
Using this method, you can have Auto Archiver auto-generate a configuration file for you, then you can copy over the desired settings from your old config file. This is probably the easiest method and quickest to setup, but it may require some trial and error as you copy over your settings.
|
||||
|
||||
|
||||
5
docs/source/installation/config_editor.md
Normal file
5
docs/source/installation/config_editor.md
Normal file
@@ -0,0 +1,5 @@
|
||||
# Configuration Editor
|
||||
|
||||
```{raw} html
|
||||
:file: settings.html
|
||||
```
|
||||
48685
docs/source/installation/settings.html
Normal file
48685
docs/source/installation/settings.html
Normal file
File diff suppressed because one or more lines are too long
@@ -6,6 +6,7 @@
|
||||
|
||||
installation.md
|
||||
configurations.md
|
||||
config_editor.md
|
||||
authentication.md
|
||||
requirements.md
|
||||
config_cheatsheet.md
|
||||
|
||||
52
scripts/generate_settings_schema.py
Normal file
52
scripts/generate_settings_schema.py
Normal file
@@ -0,0 +1,52 @@
|
||||
import json
|
||||
import os
|
||||
import io
|
||||
|
||||
from ruamel.yaml import YAML
|
||||
|
||||
from auto_archiver.core.module import ModuleFactory
|
||||
from auto_archiver.core.consts import MODULE_TYPES
|
||||
from auto_archiver.core.config import EMPTY_CONFIG
|
||||
|
||||
class SchemaEncoder(json.JSONEncoder):
    """JSON encoder that additionally serialises ``set``/``frozenset`` values as lists.

    Sets have no stable iteration order (string hashing is randomised per
    process), so the elements are sorted where possible to keep the generated
    JSON identical from one run to the next.
    """

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        Args:
            obj: the value the standard encoder could not serialise.

        Returns:
            A list of the elements when ``obj`` is a set or frozenset.

        Raises:
            TypeError: for any other unsupported type (raised by the base class).
        """
        if isinstance(obj, (set, frozenset)):
            try:
                return sorted(obj)
            except TypeError:
                # Elements are not mutually comparable (e.g. mixed str/int):
                # fall back to plain iteration order rather than failing.
                return list(obj)
        return super().default(obj)
|
||||
|
||||
# Get available modules
module_factory = ModuleFactory()
available_modules = module_factory.available_modules()

# Categorize modules by type. A module may declare several types in its
# manifest, in which case it is listed under each of them.
modules_by_type = {}
for module in available_modules:
    for module_type in module.manifest.get('type', []):
        modules_by_type.setdefault(module_type, []).append(module)

# Order by the position of the module's first type in MODULE_TYPES; within a
# type, modules with `requires_setup` set sort first (False < True).
all_modules_ordered_by_type = sorted(
    available_modules,
    key=lambda x: (MODULE_TYPES.index(x.type[0]), not x.requires_setup),
)

yaml: YAML = YAML()

# Dump the empty config into an in-memory buffer, then keep it as text so it
# can be embedded in the JSON schema.
config_buffer = io.BytesIO()
yaml.dump(EMPTY_CONFIG, config_buffer)
config_string = config_buffer.getvalue().decode('utf-8')

output_schema = {
    'modules': {
        module.name: {
            'name': module.name,
            'display_name': module.display_name,
            'manifest': module.manifest,
            'configs': module.configs or None,
        }
        for module in all_modules_ordered_by_type
    },
    # .get(): a module type with no available modules yields an empty list
    # instead of aborting the whole generation with a KeyError.
    'steps': {
        f"{module_type}s": [module.name for module in modules_by_type.get(module_type, [])]
        for module_type in MODULE_TYPES
    },
    'configs': [m.name for m in all_modules_ordered_by_type if m.configs],
    'module_types': MODULE_TYPES,
    'empty_config': config_string,
}

# Write the schema next to the settings app sources so the front-end can import it.
current_file_dir = os.path.dirname(os.path.abspath(__file__))
output_file = os.path.join(current_file_dir, 'settings/src/schema.json')
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(output_schema, file, indent=4, cls=SchemaEncoder)
|
||||
24
scripts/settings/.gitignore
vendored
Normal file
24
scripts/settings/.gitignore
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
# Logs
|
||||
logs
|
||||
*.log
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
pnpm-debug.log*
|
||||
lerna-debug.log*
|
||||
|
||||
node_modules
|
||||
dist
|
||||
dist-ssr
|
||||
*.local
|
||||
|
||||
# Editor directories and files
|
||||
.vscode/*
|
||||
!.vscode/extensions.json
|
||||
.idea
|
||||
.DS_Store
|
||||
*.suo
|
||||
*.ntvs*
|
||||
*.njsproj
|
||||
*.sln
|
||||
*.sw?
|
||||
3
scripts/settings/index.html
Normal file
3
scripts/settings/index.html
Normal file
@@ -0,0 +1,3 @@
|
||||
|
||||
<div id="root"></div>
|
||||
<script type="module" src="/src/main.tsx"></script>
|
||||
3743
scripts/settings/package-lock.json
generated
Normal file
3743
scripts/settings/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
31
scripts/settings/package.json
Normal file
31
scripts/settings/package.json
Normal file
@@ -0,0 +1,31 @@
|
||||
{
|
||||
"name": "material-ui-vite-ts",
|
||||
"private": true,
|
||||
"version": "5.0.0",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "vite",
|
||||
"build": "vite build",
|
||||
"preview": "vite preview"
|
||||
},
|
||||
"dependencies": {
|
||||
"@dnd-kit/core": "^6.3.1",
|
||||
"@dnd-kit/sortable": "^10.0.0",
|
||||
"@emotion/react": "latest",
|
||||
"@emotion/styled": "latest",
|
||||
"@mui/icons-material": "latest",
|
||||
"@mui/material": "latest",
|
||||
"react": "19.0.0",
|
||||
"react-dom": "19.0.0",
|
||||
"react-markdown": "^10.0.0",
|
||||
"yaml": "^2.7.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/react": "latest",
|
||||
"@types/react-dom": "latest",
|
||||
"@vitejs/plugin-react": "latest",
|
||||
"typescript": "latest",
|
||||
"vite": "latest",
|
||||
"vite-plugin-singlefile": "^2.1.0"
|
||||
}
|
||||
}
|
||||
450
scripts/settings/src/App.tsx
Normal file
450
scripts/settings/src/App.tsx
Normal file
@@ -0,0 +1,450 @@
|
||||
import * as React from 'react';
|
||||
import { useEffect, useState, useRef } from 'react';
|
||||
import Container from '@mui/material/Container';
|
||||
import Typography from '@mui/material/Typography';
|
||||
import Box from '@mui/material/Box';
|
||||
import FileUploadIcon from '@mui/icons-material/FileUpload';
|
||||
//
|
||||
import {
|
||||
DndContext,
|
||||
closestCenter,
|
||||
KeyboardSensor,
|
||||
PointerSensor,
|
||||
useSensor,
|
||||
useSensors,
|
||||
DragOverlay
|
||||
} from "@dnd-kit/core";
|
||||
import {
|
||||
arrayMove,
|
||||
SortableContext,
|
||||
sortableKeyboardCoordinates,
|
||||
rectSortingStrategy
|
||||
} from "@dnd-kit/sortable";
|
||||
|
||||
import type { DragStartEvent, DragEndEvent, UniqueIdentifier } from "@dnd-kit/core";
|
||||
|
||||
|
||||
import { Module } from './types';
|
||||
|
||||
import { modules, steps, module_types, empty_config } from './schema.json';
|
||||
import {
|
||||
Stack,
|
||||
Button,
|
||||
} from '@mui/material';
|
||||
import Grid from '@mui/material/Grid2';
|
||||
|
||||
import { parseDocument, Document, YAMLSeq, YAMLMap, Scalar } from 'yaml'
|
||||
import StepCard from './StepCard';
|
||||
|
||||
|
||||
/**
 * Drop zone / file picker used to load an existing orchestration YAML file.
 *
 * The chosen file is parsed with `parseDocument`. When it parses without
 * errors, contains a `steps` mapping and does not use any of the old (v0.12)
 * step names, the parsed document is passed to `setYamlFile`. In every other
 * case an error message is shown inside the drop zone and `setYamlFile` is
 * not called.
 */
function FileDrop({ setYamlFile }: { setYamlFile: React.Dispatch<React.SetStateAction<Document>> }) {

  const [showError, setShowError] = useState(false);
  const [label, setLabel] = useState<React.ReactNode>(<>Drag and drop your orchestration.yaml file here, or click to select a file.</>);

  // Step names from the old (v0.12) format, mapped to their current names.
  const replacements: Record<string, string> = {
    feeder: 'feeders',
    formatter: 'formatters',
    archivers: 'extractors',
  };

  // Show `message` in the drop zone, styled as an error.
  function showFailure(message: React.ReactNode) {
    setShowError(true);
    setLabel(message);
  }

  function openYAMLFile(event: React.ChangeEvent<HTMLInputElement>) {
    const input = event.target;
    const file = input.files?.[0];
    // Reset the input so that picking the same file again (e.g. after fixing
    // it on disk) still fires onChange. The File object obtained above stays usable.
    input.value = '';
    if (!file) {
      // The selection dialog was cancelled: nothing to do.
      return;
    }
    // File.type can be an empty string when the browser cannot determine a
    // MIME type, so fall back to the file extension.
    const looksLikeYaml = file.type.indexOf('yaml') !== -1 || /\.ya?ml$/i.test(file.name);
    if (!looksLikeYaml) {
      showFailure(<>Invalid type, only YAML files are accepted.</>);
      return;
    }

    const reader = new FileReader();
    reader.onerror = function () {
      showFailure(<>Unable to read the file. Please try again.</>);
    };
    reader.onload = function (e) {
      const contents = e.target ? e.target.result : '';
      try {
        const parsed = parseDocument(contents as string);
        if (parsed.errors.length > 0) {
          // not a valid yaml file
          showFailure(<>Invalid file. Make sure your Orchestration is a valid YAML file with a 'steps' section in it.</>);
          return;
        }

        // do some basic validation of 'steps': it must exist and be a mapping
        // (i.e. expose `get`), otherwise the lookups below would throw.
        const steps = parsed.get('steps') as { get?: (key: string) => unknown } | null | undefined;
        if (!steps || typeof steps.get !== 'function') {
          showFailure(<>Invalid file. Your orchestration file must have a 'steps' section in it.</>);
          return;
        }

        const oldStepType = Object.keys(replacements).find(
          (stepType) => steps.get!(stepType) !== undefined
        );
        if (oldStepType !== undefined) {
          showFailure(<>Invalid file. Your orchestration file appears to be in the old (v0.12) format with a '{oldStepType}' section.<br/>You should manually update your orchestration file first (hint: {oldStepType} → {replacements[oldStepType]})</>);
          return;
        }

        // Only report success once every check has passed.
        setShowError(false);
        setLabel(<>File loaded successfully.</>);
        setYamlFile(parsed);
      } catch (err) {
        console.error(err);
        // Surface the failure instead of leaving a stale label on screen.
        showFailure(<>Invalid file. Make sure your Orchestration is a valid YAML file with a 'steps' section in it.</>);
      }
    };
    reader.readAsText(file);
  }

  return (
    <>
      <div
        style={{
          position: 'relative',
          width: '100%',
          border: 'dashed',
          borderRadius:'5px',
          textAlign: 'center',
          borderWidth: '1px',
          padding: '20px' }}
        onDragEnter={(e) => {
          e.currentTarget.style.backgroundColor = 'var(--mui-palette-LinearProgress-infoBg)';
        }}
        onDragLeave={(e) => {
          e.currentTarget.style.backgroundColor = '';
        }}
        onDrop={(e) => {
          e.currentTarget.style.backgroundColor = '';
        }}
      >
        <FileUploadIcon style={{ fontSize: 50 }} />
        {/* Invisible input stretched over the whole drop zone: it receives both clicks and dropped files. */}
        <input style={{
          opacity: 0,
          position: 'absolute',
          top: 0,
          left: 0,
          width: '100%',
          height: '100%',
          cursor: 'pointer',
        }}
          type="file" id="file"
          accept=".yaml,.yml"
          onChange={openYAMLFile} />
        <Typography variant="body1" color={showError ? 'error' : ''} >
          {label}
        </Typography>
      </div>
    </>
  );
}
|
||||
|
||||
/**
 * Renders the selectable, drag-sortable list of modules for one step type.
 *
 * @param stepType          Plural step key (e.g. 'feeders') used to index `enabledModules`.
 * @param setEnabledModules State setter; receives a new object with this step's list replaced.
 * @param enabledModules    Map of step type -> ordered list of `[moduleName, enabled]` tuples.
 * @param configValues      Per-module config values, passed through to each StepCard.
 */
function ModuleTypes({ stepType, setEnabledModules, enabledModules, configValues }: { stepType: string, setEnabledModules: any, enabledModules: any, configValues: any }) {
  const [showError, setShowError] = useState<boolean>(false);
  const [activeId, setActiveId] = useState<UniqueIdentifier>();
  const [items, setItems] = useState<string[]>([]);

  // Mirror the module names (in their current order) into the id list the
  // sortable context works with.
  useEffect(() => {
    setItems(enabledModules[stepType].map(([name]: [string, boolean]) => name));
  }, [enabledModules, stepType]);

  const toggleModule = (event: any) => {
    const name = event.target.id;
    const checked = event.target.checked;

    // 'feeders' and 'formatters' may only have one enabled module: count what
    // would be enabled after this toggle and flag the error if it exceeds one.
    if (stepType === 'feeders' || stepType === 'formatters') {
      const checkedModules = enabledModules[stepType].filter(([m, enabled]: [string, boolean]) => {
        return (m !== name && enabled) || (checked && m === name);
      });
      setShowError(checkedModules.length > 1);
    } else {
      setShowError(false);
    }

    const newEnabledModules = { ...enabledModules };
    newEnabledModules[stepType] = enabledModules[stepType].map(([m, enabled]: [string, boolean]) => {
      return (m === name) ? [m, checked] : [m, enabled];
    });
    setEnabledModules(newEnabledModules);
  };

  const sensors = useSensors(
    useSensor(PointerSensor),
    useSensor(KeyboardSensor, {
      coordinateGetter: sortableKeyboardCoordinates
    })
  );

  const handleDragStart = (event: DragStartEvent) => {
    setActiveId(event.active.id);
  };

  const handleDragEnd = (event: DragEndEvent) => {
    setActiveId(undefined);
    const { active, over } = event;

    // Dropped outside every sortable item, or back onto itself: keep the order.
    if (!over || active.id === over.id) {
      return;
    }

    const oldIndex = items.indexOf(active.id as string);
    const newIndex = items.indexOf(over.id as string);
    if (oldIndex === -1 || newIndex === -1) {
      return;
    }

    const newArray = arrayMove(items, oldIndex, newIndex);

    // Apply the new order to the step list. Sort a copy: the array inside
    // `enabledModules` is React state and must not be mutated in place.
    const newEnabledModules = { ...enabledModules };
    newEnabledModules[stepType] = [...enabledModules[stepType]].sort((a: [string, boolean], b: [string, boolean]) => {
      return newArray.indexOf(a[0]) - newArray.indexOf(b[0]);
    });
    setEnabledModules(newEnabledModules);
  };

  return (
    <>
      <Box sx={{ my: 4 }}>
        <Typography id={stepType} variant="h6" style={{ textTransform: 'capitalize' }} >
          {stepType}
        </Typography>
        <Typography variant="body1" >
          Select the <a href={`https://auto-archiver.readthedocs.io/en/latest/modules/${stepType.slice(0,-1)}.html`} target="_blank" rel="noopener noreferrer">{stepType}</a> you wish to enable. Drag to reorder.
        </Typography>
      </Box>
      {showError ? <Typography variant="body1" color="error" >Only one {stepType.slice(0,-1)} can be enabled at a time.</Typography> : null}

      <DndContext
        sensors={sensors}
        collisionDetection={closestCenter}
        onDragEnd={handleDragEnd}
        onDragStart={handleDragStart}
      >
        <Grid container spacing={1} key={stepType}>
          <SortableContext items={items} strategy={rectSortingStrategy}>
            {items.map((name: string) => {
              let m: Module = modules[name];
              return (
                <StepCard key={name} type={stepType} module={m} toggleModule={toggleModule} enabledModules={enabledModules} configValues={configValues} />
              );
            })}
            <DragOverlay>
              {activeId ? (
                <div
                  style={{
                    width: "100%",
                    height: "100%",
                    backgroundColor: "grey",
                    opacity: 0.1,
                  }}
                ></div>
              ) : null}
            </DragOverlay>
          </SortableContext>
        </Grid>
      </DndContext>
    </>
  );
}
|
||||
|
||||
|
||||
/**
 * Root component of the settings editor.
 *
 * Holds three pieces of state: the (optionally uploaded) YAML document, the
 * ordered `[moduleName, enabled]` lists per step type, and the per-module
 * config values. `saveSettings` merges the latter two back into the YAML
 * document and either copies the result to the clipboard or downloads it.
 */
export default function App() {
  const [yamlFile, setYamlFile] = useState<Document>(new Document());
  const [enabledModules, setEnabledModules] = useState<{ [key: string]: any }>(Object.fromEntries(Object.keys(steps).map(type => [type, steps[type].map((name: string) => [name, false])])));
  const [configValues, setConfigValues] = useState<{
    [key: string]: {
      [key: string]: any
    }
  }>(
    Object.keys(modules).reduce((acc, module) => {
      acc[module] = {};
      return acc;
    }, {} as { [key: string]: { [key: string]: any } })
  );

  /**
   * Writes the current selection and config values into the YAML document.
   *
   * @param copy When true the YAML text is copied to the clipboard; otherwise
   *             it is offered as an 'orchestration.yaml' download.
   */
  const saveSettings = function (copy: boolean = false) {
    const stepsConfig = enabledModules;

    // Use the uploaded document when it has content, otherwise start from the
    // bundled empty template.
    const finalYamlFile: Document = (!yamlFile || yamlFile.contents == null)
      ? parseDocument(empty_config as string)
      : yamlFile;

    // set the steps
    module_types.forEach((type: string) => {
      const stepType = type + 's';
      let existingSteps = finalYamlFile.getIn(['steps', stepType]) as YAMLSeq;
      if (!existingSteps || !Array.isArray(existingSteps.items)) {
        // The loaded file has no usable list for this step type: create an
        // empty sequence so the add/delete/sort logic below has a target.
        existingSteps = finalYamlFile.createNode([]) as YAMLSeq;
        finalYamlFile.setIn(['steps', stepType], existingSteps);
      }

      (stepsConfig[stepType] || []).forEach(([name, enabled]: [string, boolean]) => {
        // Items are either Scalar nodes (parsed) or raw strings (added below).
        const index = existingSteps.items.findIndex((item: any) => {
          return (item.value || item) === name;
        });
        const stepItem = finalYamlFile.getIn(['steps', stepType], true) as YAMLSeq;

        if (enabled && index === -1) {
          finalYamlFile.addIn(['steps', stepType], name);
          // Remove the "commented-out" entry for this module, if any.
          stepItem.commentBefore = stepItem.commentBefore?.replace("\n - " + name, '');
          stepItem.comment = stepItem.comment?.replace("\n - " + name, '');
        } else if (!enabled && index !== -1) {
          // Drop the entry and leave it behind as a comment line instead.
          finalYamlFile.deleteIn(['steps', stepType, index]);
          // `commentBefore` may be unset; `+=` would then produce "undefined…".
          stepItem.commentBefore = (stepItem.commentBefore ?? '') + "\n - " + name;
          finalYamlFile.setIn(['steps', stepType], stepItem);
        }
      });

      // sort the items into the order chosen in the UI
      existingSteps.items.sort((a: any, b: any) => {
        return (stepsConfig[stepType].findIndex((val: [string, boolean]) => { return val[0] === (a.value || a) }) -
          stepsConfig[stepType].findIndex((val: [string, boolean]) => { return val[0] === (b.value || b) }));
      });
      // An empty list is written in flow style (`[]`), a populated one in block style.
      existingSteps.flow = existingSteps.items.length ? false : true;
    });

    // set all other settings: merge each module's config values into its
    // existing top-level section, or create the section when it has values.
    Object.keys(configValues).forEach((module_name: string) => {
      const existingConfig = finalYamlFile.get(module_name, true) as YAMLMap;
      if (existingConfig) {
        Object.keys(configValues[module_name]).forEach((config_name: string) => {
          const existingConfigYAML = existingConfig.get(config_name, true) as Scalar;
          if (existingConfigYAML) {
            // Update the node in place so its attached comments are retained.
            existingConfigYAML.value = configValues[module_name][config_name];
            existingConfig.set(config_name, existingConfigYAML);
          } else {
            existingConfig.set(config_name, configValues[module_name][config_name]);
          }
        });
        finalYamlFile.set(module_name, existingConfig);
      } else {
        if (configValues[module_name] && Object.keys(configValues[module_name]).length > 0) {
          finalYamlFile.set(module_name, configValues[module_name]);
        }
      }
    });

    const serialized = String(finalYamlFile);
    if (copy) {
      navigator.clipboard.writeText(serialized).then(() => {
        alert("Settings copied to clipboard.");
      }).catch((err) => {
        // Clipboard access can be denied (permissions, insecure context).
        console.error(err);
        alert("Could not copy settings to clipboard.");
      });
    } else {
      // offer the file for download
      const blob = new Blob([serialized], { type: 'application/x-yaml' });
      const url = URL.createObjectURL(blob);
      const a = document.createElement('a');
      a.href = url;
      a.download = 'orchestration.yaml';
      a.click();
      // Release the blob once the click has been dispatched.
      setTimeout(() => URL.revokeObjectURL(url), 0);
    }
  }

  // On mount: seed the config values with every declared default.
  useEffect(() => {
    const newConfigValues: { [key: string]: { [key: string]: any } } = {};
    Object.keys(modules).forEach((module: string) => {
      const configs = modules[module].configs;
      if (!configs) {
        return;
      }
      newConfigValues[module] = {};
      Object.keys(configs).forEach((config: string) => {
        const config_args = configs[config];
        if (config_args.default !== undefined) {
          newConfigValues[module][config] = config_args.default;
        }
      });
    });
    setConfigValues(newConfigValues);
  }, []);

  // When a YAML document is loaded: derive the enabled modules (and their
  // order) and the config values from it.
  useEffect(() => {
    if (!yamlFile || yamlFile.contents == null) {
      return;
    }

    const settings = yamlFile.toJS();
    // `steps` (or an individual step list) may be absent from the file.
    const stepSettings = settings['steps'] || {};

    const newEnabledModules = Object.fromEntries(Object.keys(steps).map((type: string) => {
      const configured: string[] = stepSettings[type] || [];
      return [type, steps[type].map((name: string) => {
        return [name, configured.indexOf(name) !== -1];
      }).sort((a: [string, boolean], b: [string, boolean]) => {
        // Configured modules first, in file order; the rest keep their order.
        const aIndex = configured.indexOf(a[0]);
        const bIndex = configured.indexOf(b[0]);
        if (aIndex === -1 && bIndex === -1) {
          return 0;
        }
        if (bIndex === -1) {
          return -1;
        }
        if (aIndex === -1) {
          return 1;
        }
        return aIndex - bIndex;
      })];
    }).sort((a, b) => {
      // Keys here are plural ('feeders') while module_types holds the
      // singular form, so strip the trailing 's' before looking them up.
      return module_types.indexOf(a[0].slice(0, -1)) - module_types.indexOf(b[0].slice(0, -1));
    }));
    setEnabledModules(newEnabledModules);

    // set the config values: everything in the file except 'steps'
    const newConfigValues = settings;
    delete newConfigValues['steps'];

    setConfigValues(Object.keys(modules).reduce((acc, module) => {
      acc[module] = newConfigValues[module] || {};
      return acc;
    }, {} as { [key: string]: { [key: string]: any } }));
  }, [yamlFile]);

  return (
    <Container maxWidth="lg">
      <Box sx={{ my: 4 }}>
        <Box sx={{ my: 4 }}>
          <Typography variant="h5" >
            1. Select your orchestration.yaml settings file.
          </Typography>
          <Typography variant="body1">Or skip this step to start from scratch</Typography>
          <FileDrop setYamlFile={setYamlFile} />
        </Box>
        <Box sx={{ my: 4 }}>
          <Typography variant="h5" >
            2. Choose the Modules you wish to enable/disable
          </Typography>
          {Object.keys(steps).map((stepType: string) => {
            return (
              <Box key={stepType} sx={{ my: 4 }}>
                <ModuleTypes stepType={stepType} setEnabledModules={setEnabledModules} enabledModules={enabledModules} configValues={configValues} />
              </Box>
            );
          })}
        </Box>
        <Box sx={{ my: 4 }}>
          <Typography variant="h5" >
            3. Configure your Enabled Modules
          </Typography>
          <Typography variant="body1" >
            Next to each module you've enabled, you can click 'Configure' to set the module's settings.
          </Typography>
        </Box>
        <Box sx={{ my: 4 }}>
          <Typography variant="h5" >
            4. Save your settings
          </Typography>
          <Stack direction="row" spacing={2} sx={{ my: 2 }}>
            <Button variant="contained" color="primary" onClick={() => saveSettings(true)}>Copy Settings to Clipboard</Button>
            <Button variant="contained" color="primary" onClick={() => saveSettings()}>Save Settings to File</Button>
          </Stack>
        </Box>
      </Box>
    </Container>
  );
}
|
||||
258
scripts/settings/src/StepCard.tsx
Normal file
258
scripts/settings/src/StepCard.tsx
Normal file
@@ -0,0 +1,258 @@
|
||||
import { useState } from "react";
|
||||
import { useSortable } from "@dnd-kit/sortable";
|
||||
import ReactMarkdown from 'react-markdown';
|
||||
|
||||
import { CSS } from "@dnd-kit/utilities";
|
||||
|
||||
import {
|
||||
Card,
|
||||
CardActions,
|
||||
CardHeader,
|
||||
Button,
|
||||
Dialog,
|
||||
DialogTitle,
|
||||
DialogContent,
|
||||
Box,
|
||||
IconButton,
|
||||
Checkbox,
|
||||
Select,
|
||||
MenuItem,
|
||||
FormControl,
|
||||
FormControlLabel,
|
||||
FormHelperText,
|
||||
TextField,
|
||||
Stack,
|
||||
Typography,
|
||||
InputAdornment,
|
||||
} from '@mui/material';
|
||||
import Grid from '@mui/material/Grid2';
|
||||
import DragIndicatorIcon from '@mui/icons-material/DragIndicator';
|
||||
import Visibility from '@mui/icons-material/Visibility';
|
||||
import VisibilityOff from '@mui/icons-material/VisibilityOff';
|
||||
import HelpIconOutlined from '@mui/icons-material/HelpOutline';
|
||||
import { Module, Config } from "./types";
|
||||
|
||||
|
||||
// Extends the global String type with a `capitalize()` helper that
// upper-cases the first character and leaves the rest untouched.
declare global {
  interface String {
    capitalize(): string;
  }
}
String.prototype.capitalize = function (this: string) {
  const head = this.charAt(0);
  const tail = this.slice(1);
  return `${head.toUpperCase()}${tail}`;
};
|
||||
|
||||
/**
 * A single draggable module card: an enable checkbox, a help dialog showing
 * the module's manifest description, and (for enabled modules that declare
 * configs) a 'Configure' button opening the config dialog.
 *
 * @param type           Plural step key used to look the module up in `enabledModules`.
 * @param module         The module this card represents.
 * @param toggleModule   Click handler for the enable checkbox.
 * @param enabledModules Map of step type -> list of `[moduleName, enabled]` tuples.
 * @param configValues   Per-module config values, passed to the config dialog.
 */
const StepCard = ({
  type,
  module,
  toggleModule,
  enabledModules,
  configValues
}: {
  type: string,
  module: Module,
  toggleModule: any,
  enabledModules: any,
  configValues: any
}) => {
  const {
    attributes,
    listeners,
    setNodeRef,
    transform,
    transition,
    isDragging
  } = useSortable({ id: module.name });

  // Positioning supplied by dnd-kit while this card is being dragged/sorted.
  const style = {
    transform: CSS.Transform.toString(transform),
    transition,
    zIndex: isDragging ? "100" : "auto",
    opacity: isDragging ? 0.3 : 1
  };

  const name = module.name;
  const [helpOpen, setHelpOpen] = useState(false);
  const [configOpen, setConfigOpen] = useState(false);
  // Treat a module that is missing from the list as disabled instead of throwing.
  const enabled = enabledModules[type]?.find((m: any) => m[0] === name)?.[1] ?? false;
  // The CLI feeder never gets a config dialog.
  const configurable = Boolean(module.configs) && name !== 'cli_feeder';

  return (
    <Grid ref={setNodeRef} size={{ xs: 6, sm: 4, md: 3 }} style={style}>
      <Card >
        <CardHeader
          title={
            <FormControlLabel
              style={{paddingRight: '0 !important'}}
              control={<Checkbox title="Check to enable this module" sx={{paddingTop:0, paddingBottom:0}} id={name} onClick={toggleModule} checked={enabled} />}
              label={module.display_name} />
          }
        />
        <CardActions>
          <Box sx={{ justifyContent: 'space-between', display: 'flex', width: '100%' }}>
            <Box>
              <IconButton title="Module information" size="small" onClick={() => setHelpOpen(true)}>
                <HelpIconOutlined />
              </IconButton>
              {enabled && configurable ? (
                <Button size="small" onClick={() => setConfigOpen(true)}>Configure</Button>
              ) : null}
            </Box>
            <IconButton size="small" title="Drag to reorder" sx={{ cursor: 'grab' }} {...listeners} {...attributes}>
              <DragIndicatorIcon/>
            </IconButton>
          </Box>
        </CardActions>
      </Card>
      <Dialog
        open={helpOpen}
        onClose={() => setHelpOpen(false)}
        maxWidth="lg"
      >
        <DialogTitle>
          {module.display_name}
        </DialogTitle>
        <DialogContent>
          <ReactMarkdown>
            {module.manifest.description.split("\n").map((line: string) => line.trim()).join("\n")}
          </ReactMarkdown>
        </DialogContent>
      </Dialog>
      {configurable && <ConfigPanel module={module} open={configOpen} setOpen={setConfigOpen} configValues={configValues} />}
    </Grid>
  )
}
|
||||
|
||||
/**
 * Renders the input for one config option of a module and writes edits
 * straight into `configValues[module.name]`.
 *
 * The control depends on the option: a checkbox for type 'bool', a select
 * when `choices` is defined, a multi-line JSON editor for type 'json_loader',
 * and a text field otherwise (masked, with a visibility toggle, when the
 * option name suggests a secret or its type is 'password').
 *
 * @param config_value Key of the option within `module.configs`.
 * @param module       The module the option belongs to.
 * @param configValues Per-module config values; mutated in place on change.
 */
function ConfigField({ config_value, module, configValues }: { config_value: any, module: Module, configValues: any }) {
  const [showPassword, setShowPassword] = useState(false);
  const handleClickShowPassword = () => setShowPassword((show) => !show);

  const handleMouseDownPassword = (event: React.MouseEvent<HTMLButtonElement>) => {
    event.preventDefault();
  };

  const handleMouseUpPassword = (event: React.MouseEvent<HTMLButtonElement>) => {
    event.preventDefault();
  };

  function setConfigValue(config: any, value: any) {
    configValues[module.name][config] = value;
  }

  const config_args: Config = module.configs[config_value];
  const config_name: string = config_value.replace(/_/g, " ");
  const config_display_name = config_name.capitalize();
  // `??` instead of `||`: an explicit false / 0 / '' must not be replaced by the default.
  const value = configValues[module.name]?.[config_value] ?? config_args.default;
  // Element id built from the module *name*; interpolating the module object
  // itself would yield "[object Object].<option>".
  const field_id = `${module.name}.${config_value}`;

  const config_value_lower = config_value.toLowerCase();
  const is_password = config_value_lower.includes('password') ||
    config_value_lower.includes('secret') ||
    config_value_lower.includes('token') ||
    config_value_lower.includes('key') ||
    config_value_lower.includes('api_hash') ||
    config_args.type === 'password';

  const text_input_type = is_password ? 'password' : (config_args.type === 'int' ? 'number' : 'text');

  return (
    <Box>
      <Typography variant='body1' style={{ fontWeight: 'bold' }}>{config_display_name} {config_args.required && (`(required)`)} </Typography>
      <FormControl size="small">
        {config_args.type === 'bool' ?
          <FormControlLabel control={
            <Checkbox defaultChecked={Boolean(value)} size="small" id={field_id}
              onChange={(e) => {
                setConfigValue(config_value, e.target.checked);
              }}
            />} label={config_args.help.capitalize()}
          />
          :
          (
            config_args.choices !== undefined ?
              // Uncontrolled, like the other inputs: edits are stored by
              // mutation (no re-render), so a controlled `value` would keep
              // showing the old choice.
              <Select size="small" id={field_id}
                defaultValue={value ?? ''}
                onChange={(e) => {
                  setConfigValue(config_value, e.target.value);
                }}
              >
                {config_args.choices.map((choice: any) => {
                  return (
                    <MenuItem key={`${field_id}.${choice}`}
                      value={choice}>{choice}</MenuItem>
                  );
                })}
              </Select>
              :
              (config_args.type === 'json_loader' ?
                <TextField multiline size="small" id={field_id} defaultValue={JSON.stringify(value, null, 2)} rows={6} onChange={
                  (e) => {
                    // Only store the value once the text parses as JSON;
                    // intermediate (invalid) input is logged and ignored.
                    try {
                      const val = JSON.parse(e.target.value);
                      setConfigValue(config_value, val);
                    } catch (err) {
                      console.log(err);
                    }
                  }
                } />
                :
                <TextField size="small" id={field_id} defaultValue={value} type={showPassword ? 'text' : text_input_type}
                  onChange={(e) => {
                    setConfigValue(config_value, e.target.value);
                  }}
                  required={config_args.required}
                  slotProps={ is_password ? {
                    input: { endAdornment: (
                      <InputAdornment position="end">
                        <IconButton
                          aria-label="toggle password visibility"
                          onClick={handleClickShowPassword}
                          onMouseDown={handleMouseDownPassword}
                          onMouseUp={handleMouseUpPassword}
                        >
                          {showPassword ? <VisibilityOff /> : <Visibility />}
                        </IconButton>
                      </InputAdornment>
                    )}
                  } : {}}
                />
              )
          )
        }
        {config_args.type !== 'bool' && (
          <FormHelperText >{config_args.help.capitalize()}</FormHelperText>
        )}
      </FormControl>
    </Box>
  )
}
|
||||
|
||||
/**
 * Dialog listing one ConfigField per option declared in `module.configs`.
 *
 * @param module       Module whose options are shown; its display name is the title.
 * @param open         Whether the dialog is visible.
 * @param setOpen      Called with `false` when the dialog requests to close.
 * @param configValues Per-module config values, handed to every field.
 */
function ConfigPanel({ module, open, setOpen, configValues }: { module: Module, open: boolean, setOpen: any, configValues: any }) {
  const closePanel = () => setOpen(false);
  const fields = Object.keys(module.configs).map((optionKey: any) => (
    <ConfigField key={optionKey} config_value={optionKey} module={module} configValues={configValues} />
  ));

  return (
    <Dialog open={open} onClose={closePanel} maxWidth="lg">
      <DialogTitle>
        {module.display_name}
      </DialogTitle>
      <DialogContent>
        <Stack direction="column" spacing={1}>
          {fields}
        </Stack>
      </DialogContent>
    </Dialog>
  );
}
|
||||
|
||||
export default StepCard;
|
||||
44
scripts/settings/src/main.tsx
Normal file
44
scripts/settings/src/main.tsx
Normal file
@@ -0,0 +1,44 @@
|
||||
import * as React from 'react';
|
||||
import * as ReactDOM from 'react-dom/client';
|
||||
import { ThemeProvider } from '@mui/material/styles';
|
||||
import { CssBaseline } from '@mui/material';
|
||||
import App from './App';
|
||||
import { createTheme } from '@mui/material/styles';
|
||||
import { red } from '@mui/material/colors';
|
||||
import { useState, useEffect } from 'react';
|
||||
|
||||
/**
 * Wraps <App /> in an MUI theme whose light/dark mode follows the 'theme'
 * entry in localStorage, re-read whenever the `data-theme` attribute of the
 * document element changes.
 */
function RootApp() {
  const [mode, setMode] = useState('light');

  useEffect(() => {
    const syncMode = () => setMode(window.localStorage.getItem('theme') || 'light');
    syncMode();

    // Register the observer once (not on every render) and disconnect it on
    // unmount so observers do not pile up.
    const observer = new MutationObserver(syncMode);
    observer.observe(document.documentElement, {attributes: true, attributeFilter: ['data-theme']});
    return () => observer.disconnect();
  }, []);

  // A custom theme for this app; only rebuilt when the mode changes.
  const theme = React.useMemo(() => createTheme({
    palette: {
      mode: mode === 'light' ? 'light' : 'dark',
    },
    cssVariables: true
  }), [mode]);

  return (
    <ThemeProvider theme={theme}>
      <CssBaseline />
      <App />
    </ThemeProvider>
  );
}
|
||||
|
||||
// Mount the application into the #root element of the host page.
const rootElement = document.getElementById('root')!;
const reactRoot = ReactDOM.createRoot(rootElement);
reactRoot.render(
  <React.StrictMode>
    <RootApp />
  </React.StrictMode>,
);
|
||||
|
||||
2118
scripts/settings/src/schema.json
Normal file
2118
scripts/settings/src/schema.json
Normal file
File diff suppressed because it is too large
Load Diff
21
scripts/settings/src/types.d.ts
vendored
Normal file
21
scripts/settings/src/types.d.ts
vendored
Normal file
@@ -0,0 +1,21 @@
|
||||
/** Description of a single config option of a module. */
export interface Config {
  name: string;
  description: string;
  // Type label of the option (the UI checks for 'bool', 'int', 'json_loader'
  // and 'password'); may be absent. `string?` is not valid TypeScript.
  type?: string | null;
  default: any;
  help: string;
  // Allowed values; only present for options rendered as a select.
  choices?: string[];
  required?: boolean;
}
|
||||
|
||||
/** Manifest data of a module; only the description is declared here. */
interface Manifest {
  description: string;
}

/** A module entry: its identifiers, manifest and config options keyed by option name. */
export interface Module {
  name: string;
  description: string;
  configs: Record<string, Config>;
  manifest: Manifest;
  display_name: string;
}
|
||||
21
scripts/settings/tsconfig.json
Normal file
21
scripts/settings/tsconfig.json
Normal file
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "ESNext",
|
||||
"useDefineForClassFields": true,
|
||||
"lib": ["DOM", "DOM.Iterable", "ESNext"],
|
||||
"allowJs": false,
|
||||
"skipLibCheck": true,
|
||||
"esModuleInterop": false,
|
||||
"allowSyntheticDefaultImports": true,
|
||||
"strict": true,
|
||||
"forceConsistentCasingInFileNames": true,
|
||||
"module": "ESNext",
|
||||
"moduleResolution": "Node",
|
||||
"resolveJsonModule": true,
|
||||
"isolatedModules": true,
|
||||
"noEmit": true,
|
||||
"jsx": "react-jsx"
|
||||
},
|
||||
"include": ["src"],
|
||||
"references": [{ "path": "./tsconfig.node.json" }]
|
||||
}
|
||||
9
scripts/settings/tsconfig.node.json
Normal file
9
scripts/settings/tsconfig.node.json
Normal file
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"composite": true,
|
||||
"module": "ESNext",
|
||||
"moduleResolution": "Node",
|
||||
"allowSyntheticDefaultImports": true
|
||||
},
|
||||
"include": ["vite.config.ts"]
|
||||
}
|
||||
12
scripts/settings/vite.config.ts
Normal file
12
scripts/settings/vite.config.ts
Normal file
@@ -0,0 +1,12 @@
|
||||
import { defineConfig } from 'vite';
|
||||
import react from '@vitejs/plugin-react';
|
||||
import { viteSingleFile } from "vite-plugin-singlefile"
|
||||
|
||||
// Vite build configuration, see https://vite.dev/config/
// React plugin plus the single-file plugin; output is unminified with source maps.
const plugins = [react(), viteSingleFile()];

const build = {
  minify: false,
  sourcemap: true,
};

export default defineConfig({ plugins, build });
|
||||
@@ -105,8 +105,8 @@ class BaseModule(ABC):
|
||||
for key in self.authentication.keys():
|
||||
if key in site or site in key:
|
||||
logger.debug(f"Could not find exact authentication information for site '{site}'. \
|
||||
did find information for '{key}' which is close, is this what you meant? \
|
||||
If so, edit your authentication settings to make sure it exactly matches.")
|
||||
did find information for '{key}' which is close, is this what you meant? \
|
||||
If so, edit your authentication settings to make sure it exactly matches.")
|
||||
|
||||
def get_ytdlp_cookiejar(args):
|
||||
import yt_dlp
|
||||
|
||||
@@ -80,7 +80,10 @@ class ModuleFactory:
|
||||
|
||||
available = self.available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
|
||||
if not available:
|
||||
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
|
||||
message = f"Module '{module_name}' not found. Are you sure it's installed/exists?"
|
||||
if 'archiver' in module_name:
|
||||
message += f" Did you mean {module_name.replace('archiver', 'extractor')}?"
|
||||
raise IndexError(message)
|
||||
return available[0]
|
||||
|
||||
def available_modules(self, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
|
||||
|
||||
@@ -15,6 +15,7 @@ from copy import copy
|
||||
|
||||
from rich_argparse import RichHelpFormatter
|
||||
from loguru import logger
|
||||
import requests
|
||||
|
||||
from .metadata import Metadata, Media
|
||||
from auto_archiver.version import __version__
|
||||
@@ -72,10 +73,20 @@ class ArchivingOrchestrator:
|
||||
|
||||
self.basic_parser = parser
|
||||
return parser
|
||||
|
||||
def check_steps(self, config):
    """Validate that every module type has at least one step configured.

    For each entry of MODULE_TYPES the plural key (e.g. 'feeders') is looked
    up under ``config['steps']``. When it is missing or empty, a SetupError is
    raised; the message points at the pre-0.13.0 key name when that old key
    ('feeder', 'formatter' or 'archivers') is present instead.

    :param config: configuration mapping containing a 'steps' mapping.
    :raises SetupError: if any module type has no steps configured.
    """
    steps = config['steps']
    for module_type in MODULE_TYPES:
        if steps.get(f"{module_type}s", []):
            continue

        # Old singular key still in use. The membership test keeps the
        # "old key present" condition applying to both types; with
        # `a or b and c` an empty feeders list always took this branch.
        if module_type in ('feeder', 'formatter') and steps.get(module_type):
            raise SetupError(
                f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. "
                f"Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n"
            )

        # 'archivers' was renamed to 'extractors'.
        if module_type == 'extractor' and steps.get('archivers'):
            raise SetupError(
                "As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. "
                "Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n"
            )

        raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
|
||||
|
||||
def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
|
||||
|
||||
|
||||
# modules parser to get the overridden 'steps' values
|
||||
modules_parser = argparse.ArgumentParser(
|
||||
add_help=False,
|
||||
@@ -100,6 +111,7 @@ class ArchivingOrchestrator:
|
||||
# but should we add them? Or should we just add them to the 'complete' parser?
|
||||
|
||||
if is_valid_config(yaml_config):
|
||||
self.check_steps(yaml_config)
|
||||
# only load the modules enabled in config
|
||||
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
|
||||
enabled_modules = []
|
||||
@@ -115,10 +127,6 @@ class ArchivingOrchestrator:
|
||||
simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
|
||||
self.add_individual_module_args(simple_modules, parser)
|
||||
|
||||
# for simple mode, we use the cli_feeder and any modules that don't require setup
|
||||
if not yaml_config['steps']['feeders']:
|
||||
yaml_config['steps']['feeders'] = ['cli_feeder']
|
||||
|
||||
# add them to the config
|
||||
for module in simple_modules:
|
||||
for module_type in module.type:
|
||||
@@ -171,9 +179,6 @@ class ArchivingOrchestrator:
|
||||
if not parser:
|
||||
parser = self.parser
|
||||
|
||||
# allow passing URLs directly on the command line
|
||||
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
|
||||
|
||||
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
|
||||
(token, username etc.) that extractors can use to log into \
|
||||
a website. If passing this on the command line, use a JSON string. \
|
||||
@@ -193,7 +198,11 @@ class ArchivingOrchestrator:
|
||||
modules = self.module_factory.available_modules()
|
||||
|
||||
for module in modules:
|
||||
|
||||
if module.name == 'cli_feeder':
|
||||
# special case. For the CLI feeder, allow passing URLs directly on the command line without setting --cli_feeder.urls=
|
||||
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
|
||||
continue
|
||||
|
||||
if not module.configs:
|
||||
# this module has no configs, don't show anything in the help
|
||||
# (TODO: do we want to show something about this module though, like a description?)
|
||||
@@ -277,36 +286,16 @@ class ArchivingOrchestrator:
|
||||
raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
|
||||
|
||||
for module in modules_to_load:
|
||||
if module == 'cli_feeder':
|
||||
# cli_feeder is a pseudo module, it just takes the command line args for [URLS]
|
||||
urls = self.config['urls']
|
||||
if not urls:
|
||||
raise SetupError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
|
||||
|
||||
def feed(self) -> Generator[Metadata]:
|
||||
for url in urls:
|
||||
logger.debug(f"Processing URL: '{url}'")
|
||||
yield Metadata().set_url(url)
|
||||
|
||||
pseudo_module = type('CLIFeeder', (Feeder,), {
|
||||
'name': 'cli_feeder',
|
||||
'display_name': 'CLI Feeder',
|
||||
'__iter__': feed
|
||||
|
||||
})()
|
||||
|
||||
pseudo_module.__iter__ = feed
|
||||
step_items.append(pseudo_module)
|
||||
continue
|
||||
|
||||
if module in invalid_modules:
|
||||
continue
|
||||
|
||||
loaded_module = None
|
||||
try:
|
||||
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
|
||||
except (KeyboardInterrupt, Exception) as e:
|
||||
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
|
||||
if module_type == 'extractor' and loaded_module.name == module:
|
||||
if loaded_module and module_type == 'extractor':
|
||||
loaded_module.cleanup()
|
||||
raise e
|
||||
|
||||
@@ -348,7 +337,23 @@ class ArchivingOrchestrator:
|
||||
yaml_config = self.load_config(basic_config.config_file)
|
||||
|
||||
return self.setup_complete_parser(basic_config, yaml_config, unused_args)
|
||||
|
||||
def check_for_updates(self):
    """Warn when the version published on PyPI differs from the running one.

    Fetches the auto-archiver package metadata from PyPI and compares its
    version string with ``__version__``. The check is best-effort: a network
    failure or unexpected response is logged at debug level and ignored, so
    running offline does not abort setup.
    """
    try:
        # Bounded wait so an unreachable PyPI cannot hang start-up.
        response = requests.get("https://pypi.org/pypi/auto-archiver/json", timeout=10)
        response.raise_for_status()
        latest_version = response.json()['info']['version']
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        logger.debug(f"Could not check for updates: {e}")
        return

    # check version compared to current version
    if latest_version == __version__:
        return

    if os.environ.get('RUNNING_IN_DOCKER'):
        update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
    else:
        update_cmd = "`pip install --upgrade auto-archiver`"
    logger.warning("")
    logger.warning("********* IMPORTANT: UPDATE AVAILABLE ********")
    logger.warning(f"A new version of auto-archiver is available (v{latest_version}, you have {__version__})")
    logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
    logger.warning("")
|
||||
|
||||
|
||||
def setup(self, args: list):
|
||||
"""
|
||||
Function to configure all setup of the orchestrator: setup configs and load modules.
|
||||
@@ -356,6 +361,8 @@ class ArchivingOrchestrator:
|
||||
This method should only ever be called once
|
||||
"""
|
||||
|
||||
self.check_for_updates()
|
||||
|
||||
if self.setup_finished:
|
||||
logger.warning("The `setup_config()` function should only ever be run once. \
|
||||
If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
from .atlos_db import AtlosDb
|
||||
@@ -1,38 +0,0 @@
|
||||
# Manifest for the Atlos database module.
{
    "name": "Atlos Database",
    "type": ["database"],
    "entry_point": "atlos_db::AtlosDb",
    "requires_setup": True,
    # The module imports loguru and requests; it needs no external binaries.
    "dependencies": {
        "python": ["loguru", "requests"],
        "bin": [],
    },
    "configs": {
        "api_token": {
            "default": None,
            "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
            "required": True,
            "type": "str",
        },
        "atlos_url": {
            "default": "https://platform.atlos.org",
            "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
            "type": "str"
        },
    },
    "description": """
Handles integration with the Atlos platform for managing archival results.

### Features
- Outputs archival results to the Atlos API for storage and tracking.
- Updates failure status with error details when archiving fails.
- Processes and formats metadata, including ISO formatting for datetime fields.
- Skips processing for items without an Atlos ID.

### Setup
Required configs:
- atlos_url: Base URL for the Atlos API.
- api_token: Authentication token for API access.
"""
,
}
|
||||
@@ -1,66 +0,0 @@
|
||||
from typing import Union
|
||||
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Database
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
class AtlosDb(Database):
    """
    Database module that reports archiving outcomes back to Atlos.
    """

    def failed(self, item: Metadata, reason: str) -> None:
        """Report a failed archival attempt, with its reason, to Atlos."""
        atlos_id = item.metadata.get("atlos_id")
        if not atlos_id:
            # Nothing to update when the item did not originate from Atlos.
            logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
            return

        endpoint = f"{self.atlos_url}/api/v2/source_material/metadata/{atlos_id}/auto_archiver"
        auth_headers = {"Authorization": f"Bearer {self.api_token}"}
        payload = {"metadata": {"processed": True, "status": "error", "error": reason}}

        response = requests.post(endpoint, headers=auth_headers, json=payload)
        response.raise_for_status()
        logger.info(
            f"Stored failure for {item.get_url()} (ID {atlos_id}) on Atlos: {reason}"
        )

    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
        """Look up a previously archived copy of the item; this module never finds one."""
        return False

    def _process_metadata(self, item: Metadata) -> dict:
        """Return the item's metadata with every value exposing `isoformat`
        (e.g. datetimes) replaced by its ISO string."""
        serialisable = {}
        for key, value in item.metadata.items():
            if hasattr(value, "isoformat"):
                serialisable[key] = value.isoformat()
            else:
                serialisable[key] = value
        return serialisable

    def done(self, item: Metadata, cached: bool = False) -> None:
        """Report a successful archival result, including its metadata, to Atlos."""
        atlos_id = item.metadata.get("atlos_id")
        if not atlos_id:
            logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
            return

        endpoint = f"{self.atlos_url}/api/v2/source_material/metadata/{atlos_id}/auto_archiver"
        auth_headers = {"Authorization": f"Bearer {self.api_token}"}
        payload = {
            "metadata": {
                "processed": True,
                "status": "success",
                "results": self._process_metadata(item),
            }
        }

        response = requests.post(endpoint, headers=auth_headers, json=payload)
        response.raise_for_status()

        logger.info(
            f"Stored success for {item.get_url()} (ID {atlos_id}) on Atlos"
        )
|
||||
@@ -1 +0,0 @@
|
||||
from .atlos_feeder import AtlosFeeder
|
||||
@@ -1,34 +0,0 @@
|
||||
# Manifest for the Atlos feeder module: declares its type, dependencies,
# configuration options and user-facing description.
{
    "name": "Atlos Feeder",
    "type": ["feeder"],
    "requires_setup": True,
    "dependencies": {
        "python": ["loguru", "requests"],
    },
    "configs": {
        # No default is given: the token must be supplied by the user.
        "api_token": {
            "type": "str",
            "required": True,
            "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
        },
        "atlos_url": {
            "default": "https://platform.atlos.org",
            "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
            "type": "str"
        },
    },
    "description": """
    AtlosFeeder: A feeder module that integrates with the Atlos API to fetch source material URLs for archival.

    ### Features
    - Connects to the Atlos API to retrieve a list of source material URLs.
    - Filters source materials based on visibility, processing status, and metadata.
    - Converts filtered source materials into `Metadata` objects with the relevant `atlos_id` and URL.
    - Iterates through paginated results using a cursor for efficient API interaction.

    ### Notes
    - Requires an Atlos API endpoint and a valid API token for authentication.
    - Ensures only unprocessed, visible, and ready-to-archive URLs are returned.
    - Handles pagination transparently when retrieving data from the Atlos API.
    """
}
|
||||
@@ -1,42 +0,0 @@
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Feeder
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
class AtlosFeeder(Feeder):
    """Feeder that yields the archivable source material URLs of an Atlos instance."""

    def __iter__(self) -> Metadata:
        """
        Page through `/api/v2/source_material` and yield one `Metadata` per
        item that has a source URL, is not yet marked as processed by the
        auto archiver, is visible, and is neither processing nor pending.

        Raises:
            requests.HTTPError: if Atlos answers with an error status.
        """
        # Get all the urls from the Atlos API
        cursor = None
        while True:
            response = requests.get(
                f"{self.atlos_url}/api/v2/source_material",
                headers={"Authorization": f"Bearer {self.api_token}"},
                params={"cursor": cursor},
            )
            # Check the status before decoding: an error response may not be
            # JSON, and decoding first would hide the real HTTP error.
            response.raise_for_status()
            data = response.json()
            cursor = data["next"]

            for item in data["results"]:
                if (
                    item["source_url"] not in [None, ""]
                    and (
                        item["metadata"]
                        .get("auto_archiver", {})
                        .get("processed", False)
                        != True
                    )
                    and item["visibility"] == "visible"
                    and item["status"] not in ["processing", "pending"]
                ):
                    yield Metadata().set_url(item["source_url"]).set(
                        "atlos_id", item["id"]
                    )

            # Stop on an empty page or when Atlos reports no further cursor.
            if len(data["results"]) == 0 or cursor is None:
                break
|
||||
@@ -0,0 +1 @@
|
||||
from .atlos_feeder_db_storage import AtlosFeederDbStorage
|
||||
@@ -0,0 +1,46 @@
|
||||
# Manifest for the combined Atlos feeder / database / storage module.
{
    "name": "Atlos Feeder Database Storage",
    "type": ["feeder", "database", "storage"],
    "entry_point": "atlos_feeder_db_storage::AtlosFeederDbStorage",
    "requires_setup": True,
    "dependencies": {
        "python": ["loguru", "requests"],
    },
    "configs": {
        "api_token": {
            "type": "str",
            "required": True,
            "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
        },
        "atlos_url": {
            "default": "https://platform.atlos.org",
            "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
            "type": "str"
        },
    },
    "description": """
    A module that integrates with the Atlos API to fetch source material URLs for archival, upload extracted media, and store archival results.

    [Atlos](https://www.atlos.org/) is a visual investigation and archiving platform designed for investigative research, journalism, and open-source intelligence (OSINT).
    It helps users organize, analyze, and store media from various sources, making it easier to track and investigate digital evidence.

    To get started create a new project and obtain an API token from the settings page. You can group events into Atlos's 'incidents'.
    Here you can add 'source material' by URL, and the Atlos feeder will fetch these URLs for archival.

    You can use Atlos only as a 'feeder', however you can also implement the 'database' and 'storage' features to store the media files in Atlos which is recommended.
    The Auto Archiver will retain the Atlos ID for each item, ensuring that the media and database outputs are uploaded back into the relevant media item.


    ### Features
    - Connects to the Atlos API to retrieve a list of source material URLs.
    - Iterates through the URLs from all source material items which are unprocessed, visible, and ready to archive.
    - If the storage option is selected, it will store the media files alongside the original source material item in Atlos.
    - If the database option is selected, it will output the results to the media item, as well as updating failure status with error details when archiving fails.
    - Skips storage/database upload for items without an Atlos ID - this means you must use the Atlos feeder so that each item has the Atlos ID to store the results with.

    ### Notes
    - Requires an Atlos account with a project and a valid API token for authentication.
    - Ensures only unprocessed, visible, and ready-to-archive URLs are returned.
    - Fetches any media items within an Atlos project, regardless of separation into incidents.
    """
}
|
||||
@@ -0,0 +1,153 @@
|
||||
import hashlib
|
||||
import os
|
||||
from typing import IO, Iterator, Optional, Union
|
||||
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Database, Feeder, Media, Metadata, Storage
|
||||
from auto_archiver.utils import calculate_file_hash
|
||||
|
||||
|
||||
class AtlosFeederDbStorage(Feeder, Database, Storage):
    """
    Atlos integration that can act as feeder, database and storage at once.

    All API calls go through one `requests.Session` created in `setup()` and
    authenticate with `self.api_token` against `self.atlos_url`.
    """

    def setup(self) -> None:
        """Create the persistent HTTP session and store it on `self.session`."""
        self.session = requests.Session()

    def _get(self, endpoint: str, params: Optional[dict] = None) -> dict:
        """
        Wrapper for GET requests to the Atlos API.

        Args:
            endpoint: Path appended to `self.atlos_url` (e.g. "/api/v2/source_material").
            params: Optional query parameters.

        Returns:
            The decoded JSON body.

        Raises:
            requests.HTTPError: if the response has an error status.
        """
        url = f"{self.atlos_url}{endpoint}"
        response = self.session.get(
            url, headers={"Authorization": f"Bearer {self.api_token}"}, params=params
        )
        response.raise_for_status()
        return response.json()

    def _post(
        self,
        endpoint: str,
        json: Optional[dict] = None,
        params: Optional[dict] = None,
        files: Optional[dict] = None,
    ) -> dict:
        """
        Wrapper for POST requests to the Atlos API.

        Args:
            endpoint: Path appended to `self.atlos_url`.
            json: Optional JSON body.
            params: Optional query parameters.
            files: Optional multipart file payload.

        Returns:
            The decoded JSON body.

        Raises:
            requests.HTTPError: if the response has an error status.
        """
        url = f"{self.atlos_url}{endpoint}"
        response = self.session.post(
            url,
            headers={"Authorization": f"Bearer {self.api_token}"},
            json=json,
            params=params,
            files=files,
        )
        response.raise_for_status()
        return response.json()

    # ! Atlos Module - Feeder Methods

    def __iter__(self) -> Iterator[Metadata]:
        """Iterate over unprocessed, visible source materials from Atlos."""
        cursor = None
        while True:
            data = self._get("/api/v2/source_material", params={"cursor": cursor})
            cursor = data.get("next")
            results = data.get("results", [])
            for item in results:
                if (
                    item.get("source_url") not in [None, ""]
                    and not item.get("metadata", {}).get("auto_archiver", {}).get("processed", False)
                    and item.get("visibility") == "visible"
                    and item.get("status") not in ["processing", "pending"]
                ):
                    yield Metadata().set_url(item["source_url"]).set("atlos_id", item["id"])
            # Stop on an empty page or when no further cursor is returned.
            if not results or cursor is None:
                break

    # ! Atlos Module - Database Methods

    def failed(self, item: Metadata, reason: str) -> None:
        """Mark an item as failed in Atlos, if the ID exists."""
        atlos_id = item.metadata.get("atlos_id")
        if not atlos_id:
            logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
            return
        self._post(
            f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
            json={"metadata": {"processed": True, "status": "error", "error": reason}},
        )
        logger.info(f"Stored failure for {item.get_url()} (ID {atlos_id}) on Atlos: {reason}")

    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
        """Check whether the item was archived before; this module always answers False."""
        return False

    def _process_metadata(self, item: Metadata) -> dict:
        """Process metadata for storage on Atlos. Values exposing `isoformat`
        (e.g. datetime objects) are converted to ISO format strings."""
        return {
            k: v.isoformat() if hasattr(v, "isoformat") else v
            for k, v in item.metadata.items()
        }

    def done(self, item: Metadata, cached: bool = False) -> None:
        """Mark an item as successfully archived in Atlos."""
        atlos_id = item.metadata.get("atlos_id")
        if not atlos_id:
            logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
            return
        self._post(
            f"/api/v2/source_material/metadata/{atlos_id}/auto_archiver",
            json={
                "metadata": {
                    "processed": True,
                    "status": "success",
                    "results": self._process_metadata(item),
                }
            },
        )
        logger.info(f"Stored success for {item.get_url()} (ID {atlos_id}) on Atlos")

    # ! Atlos Module - Storage Methods

    def get_cdn_url(self, _media: Media) -> str:
        """Return the base Atlos URL as the CDN URL."""
        return self.atlos_url

    def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool:
        """
        Upload a media file to Atlos if it has not been uploaded already.

        Returns:
            True when the file is already present on Atlos or was uploaded,
            False when no metadata or no Atlos ID is available.

        Raises:
            requests.HTTPError: if an Atlos API call returns an error status.
        """
        if metadata is None:
            logger.error(f"No metadata provided for {media.filename}")
            return False

        atlos_id = metadata.get("atlos_id")
        if not atlos_id:
            logger.error(f"No Atlos ID found in metadata; can't store {media.filename} in Atlos.")
            return False

        # SHA-256 is computed here so it can be compared with the
        # `file_hash_sha256` values Atlos reports for existing artifacts.
        media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096)

        # Check whether the media has already been uploaded
        source_material = self._get(f"/api/v2/source_material/{atlos_id}")["result"]
        existing_media = [
            artifact.get("file_hash_sha256")
            for artifact in source_material.get("artifacts", [])
        ]
        if media_hash in existing_media:
            logger.info(f"{media.filename} with SHA256 {media_hash} already uploaded to Atlos")
            return True

        # Upload the media to the Atlos API
        with open(media.filename, "rb") as file_obj:
            self._post(
                f"/api/v2/source_material/upload/{atlos_id}",
                # NOTE(review): the title sent is `media.properties`, while the log
                # line below reports `media.key` as the title - confirm which is intended.
                params={"title": media.properties},
                files={"file": (os.path.basename(media.filename), file_obj)},
            )
        logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
        return True

    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
        """Upload a file-like object; not implemented (returns None despite the annotation)."""
        pass
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
from .atlos_storage import AtlosStorage
|
||||
@@ -1,32 +0,0 @@
|
||||
# Manifest for the Atlos storage module.
{
    "name": "Atlos Storage",
    "type": ["storage"],
    "requires_setup": True,
    # The module talks to Atlos over HTTP with requests; it does not use boto3.
    "dependencies": {
        "python": ["loguru", "requests"],
        "bin": []
    },
    "description": """
    Stores media files in [Atlos](https://www.atlos.org/).

    ### Features
    - Uploads each media file to the Atlos source material identified by the item's `atlos_id`.
    - Skips files whose SHA-256 hash is already attached to that source material.

    ### Notes
    - Requires setup with Atlos credentials (`api_token`, `atlos_url`).
    - Items without an `atlos_id` in their metadata cannot be stored.
    """,
    "configs": {
        "api_token": {
            "default": None,
            "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
            "required": True,
            "type": "str"
        },
        "atlos_url": {
            "default": "https://platform.atlos.org",
            "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
            "type": "str"
        },
    }
}
|
||||
@@ -1,66 +0,0 @@
|
||||
import hashlib
|
||||
import os
|
||||
from typing import IO, Optional
|
||||
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Media, Metadata
|
||||
from auto_archiver.core import Storage
|
||||
|
||||
|
||||
class AtlosStorage(Storage):
    """Storage module that uploads media files to their Atlos source material."""

    def get_cdn_url(self, _media: Media) -> str:
        """Return the base Atlos URL as the location of any stored media."""
        # It's not always possible to provide an exact URL, because it's
        # possible that the media once uploaded could have been copied to
        # another project.
        return self.atlos_url

    def _hash(self, media: Media) -> str:
        """Return the SHA-256 hex digest of the media file, read in 4096-byte chunks."""
        # Hash the media file using sha-256. We don't use the existing auto archiver
        # hash because there's no guarantee that the configurer is using sha-256, which
        # is how Atlos hashes files.
        sha256 = hashlib.sha256()
        with open(media.filename, "rb") as f:
            while True:
                buf = f.read(4096)
                if not buf:
                    break
                sha256.update(buf)
        return sha256.hexdigest()

    def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool:
        """
        Upload the media file to Atlos unless a file with the same SHA-256 is already there.

        Returns:
            True when the file is already on Atlos or was uploaded, False when
            no metadata or no Atlos ID is available.

        Raises:
            requests.HTTPError: if the upload request returns an error status.
        """
        # `metadata` defaults to None; without it there is no Atlos ID to look up.
        atlos_id = metadata.get("atlos_id") if metadata is not None else None
        if atlos_id is None:
            logger.error(f"No Atlos ID found in metadata; can't store {media.filename} on Atlos")
            return False

        media_hash = self._hash(media)

        # Check whether the media has already been uploaded
        source_material = requests.get(
            f"{self.atlos_url}/api/v2/source_material/{atlos_id}",
            headers={"Authorization": f"Bearer {self.api_token}"},
        ).json()["result"]
        existing_media = [x["file_hash_sha256"] for x in source_material.get("artifacts", [])]
        if media_hash in existing_media:
            logger.info(f"{media.filename} with SHA256 {media_hash} already uploaded to Atlos")
            return True

        # Upload the media to the Atlos API; the context manager guarantees the
        # file handle is closed even when the request fails.
        with open(media.filename, "rb") as file_obj:
            requests.post(
                f"{self.atlos_url}/api/v2/source_material/upload/{atlos_id}",
                headers={"Authorization": f"Bearer {self.api_token}"},
                params={
                    "title": media.properties
                },
                files={"file": (os.path.basename(media.filename), file_obj)},
            ).raise_for_status()

        logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")

        return True

    # must be implemented even if unused
    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
|
||||
23
src/auto_archiver/modules/cli_feeder/__manifest__.py
Normal file
23
src/auto_archiver/modules/cli_feeder/__manifest__.py
Normal file
@@ -0,0 +1,23 @@
|
||||
# Manifest for the command line feeder.
# Each key must appear only once: in a dict literal a repeated key silently
# replaces the earlier value.
{
    'name': 'Command Line Feeder',
    'type': ['feeder'],
    'entry_point': 'cli_feeder::CLIFeeder',
    'requires_setup': False,
    'configs': {
        'urls': {
            'default': None,
            'help': 'URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml',
        },
    },
    'description': """
The Command Line Feeder is the default enabled feeder for the Auto Archiver. It allows you to pass URLs directly to the orchestrator from the command line
without the need to specify any additional configuration or command line arguments:

`auto-archiver --feeder cli_feeder -- "https://example.com/1/,https://example.com/2/"`

You can pass multiple URLs by separating them with a space. The URLs will be processed in the order they are provided.

`auto-archiver --feeder cli_feeder -- https://example.com/1/ https://example.com/2/`
""",
}
|
||||
21
src/auto_archiver/modules/cli_feeder/cli_feeder.py
Normal file
21
src/auto_archiver/modules/cli_feeder/cli_feeder.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core.feeder import Feeder
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
|
||||
class CLIFeeder(Feeder):
    """Feeder that hands the URLs from the `urls` config entry to the orchestrator."""

    def setup(self) -> None:
        """Store the configured URLs and fail early when none were provided."""
        self.urls = self.config['urls']
        if self.urls:
            return
        raise ValueError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")

    def __iter__(self) -> Metadata:
        """Yield one `Metadata` per configured URL, with its "folder" context set to "cli"."""
        urls = self.config['urls']
        for url in urls:
            logger.debug(f"Processing {url}")
            entry = Metadata().set_url(url)
            entry.set_context("folder", "cli")
            yield entry

        total = len(urls)
        logger.success(f"Processed {total} URL(s)")
|
||||
@@ -10,7 +10,7 @@ class ConsoleDb(Database):
|
||||
"""
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
logger.warning(f"STARTED {item}")
|
||||
logger.info(f"STARTED {item}")
|
||||
|
||||
def failed(self, item: Metadata, reason:str) -> None:
|
||||
logger.error(f"FAILED {item}: {reason}")
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
},
|
||||
'entry_point': 'csv_db::CSVDb',
|
||||
"configs": {
|
||||
"csv_file": {"default": "db.csv", "help": "CSV file name"}
|
||||
"csv_file": {"default": "db.csv", "help": "CSV file name to save metadata to"},
|
||||
},
|
||||
"description": """
|
||||
Handles exporting archival results to a CSV file.
|
||||
|
||||
@@ -28,6 +28,13 @@ the broader archiving framework.
|
||||
metadata objects. Some dropins are included in this generic_archiver by default, but
|
||||
custom dropins can be created to handle additional websites and passed to the archiver
|
||||
via the command line using the `--dropins` option (TODO!).
|
||||
|
||||
### Auto-Updates
|
||||
|
||||
The Generic Extractor will also automatically check for updates to `yt-dlp` (every 5 days by default).
|
||||
This can be configured using the `ytdlp_update_interval` setting (or disabled by setting it to -1).
|
||||
If you are having issues with the extractor, you can review the version of `yt-dlp` being used with `yt-dlp --version`.
|
||||
|
||||
""",
|
||||
"configs": {
|
||||
"subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"},
|
||||
@@ -69,5 +76,10 @@ via the command line using the `--dropins` option (TODO!).
|
||||
"help": "Additional arguments to pass to the yt-dlp extractor. See https://github.com/yt-dlp/yt-dlp/blob/master/README.md#extractor-arguments.",
|
||||
"type": "json_loader",
|
||||
},
|
||||
"ytdlp_update_interval": {
|
||||
"default": 5,
|
||||
"help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.",
|
||||
"type": "int",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -1,7 +1,11 @@
|
||||
import datetime, os, yt_dlp, pysubs2
|
||||
import datetime, os
|
||||
import importlib
|
||||
import subprocess
|
||||
from typing import Generator, Type
|
||||
|
||||
import yt_dlp
|
||||
from yt_dlp.extractor.common import InfoExtractor
|
||||
import pysubs2
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -11,6 +15,44 @@ from auto_archiver.core import Metadata, Media
|
||||
class GenericExtractor(Extractor):
|
||||
_dropins = {}
|
||||
|
||||
def setup(self):
|
||||
# check for file .ytdlp-update in the secrets folder
|
||||
if self.ytdlp_update_interval < 0:
|
||||
return
|
||||
|
||||
use_secrets = os.path.exists('secrets')
|
||||
path = os.path.join('secrets' if use_secrets else '', '.ytdlp-update')
|
||||
next_update_check = None
|
||||
if os.path.exists(path):
|
||||
with open(path, "r") as f:
|
||||
next_update_check = datetime.datetime.fromisoformat(f.read())
|
||||
|
||||
if not next_update_check or next_update_check < datetime.datetime.now():
|
||||
self.update_ytdlp()
|
||||
|
||||
next_update_check = datetime.datetime.now() + datetime.timedelta(days=self.ytdlp_update_interval)
|
||||
with open(path, "w") as f:
|
||||
f.write(next_update_check.isoformat())
|
||||
|
||||
def update_ytdlp(self):
    """
    Upgrade yt-dlp with pip and reload the `yt_dlp` package when a new version was installed.

    This is best-effort: any failure (version lookup, pip invocation, reload)
    is logged as an error and never raised to the caller.
    """
    logger.info("Checking and updating yt-dlp...")
    logger.info(f"Tip: change the 'ytdlp_update_interval' setting to control how often yt-dlp is updated. Set to -1 to disable or 0 to enable on every run. Current setting: {self.ytdlp_update_interval}")
    from importlib.metadata import version as get_version
    try:
        # Looked up inside the try block so a metadata lookup failure is logged
        # like any other update error instead of aborting setup.
        old_version = get_version("yt-dlp")

        # try and update with pip (this works inside poetry environment and in a normal virtualenv)
        result = subprocess.run(["pip", "install", "--upgrade", "yt-dlp"], check=True, capture_output=True)

        if "Successfully installed yt-dlp" in result.stdout.decode():
            new_version = get_version("yt-dlp")
            logger.info(f"yt-dlp successfully updated (from {old_version} to {new_version})")
            # Reload so this process picks up the newly installed package code.
            importlib.reload(yt_dlp)
        else:
            logger.info("yt-dlp already up to date")

    except Exception as e:
        logger.error(f"Error updating yt-dlp: {e}")
|
||||
|
||||
def suitable_extractors(self, url: str) -> Generator[str, None, None]:
|
||||
"""
|
||||
Returns a list of valid extractors for the given URL"""
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
from .gsheet_db import GsheetsDb
|
||||
@@ -1,38 +0,0 @@
|
||||
# Manifest for the Google Sheets database module.
{
    "name": "Google Sheets Database",
    "type": ["database"],
    "entry_point": "gsheet_db::GsheetsDb",
    "requires_setup": True,
    "dependencies": {
        "python": ["loguru", "gspread", "slugify"],
    },
    "configs": {
        "allow_worksheets": {
            "default": set(),
            # The help text names the sibling option by its real key, `block_worksheets`.
            "help": "(CSV) only worksheets whose name is included in allow are included (overrides block_worksheets), leave empty so all are allowed",
        },
        "block_worksheets": {
            "default": set(),
            "help": "(CSV) explicitly block some worksheets from being processed",
        },
        "use_sheet_names_in_stored_paths": {
            "default": True,
            "type": "bool",
            "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
        }
    },
    "description": """
    GsheetsDatabase:
    Handles integration with Google Sheets for tracking archival tasks.

### Features
- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
- Skips redundant updates for empty or invalid data fields.

### Notes
- Currently works only with metadata provided by GsheetFeeder.
- Requires configuration of a linked Google Sheet and appropriate API credentials.
"""
}
|
||||
@@ -1,114 +0,0 @@
|
||||
from typing import Union, Tuple
|
||||
from urllib.parse import quote
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Database
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.modules.gsheet_feeder import GWorksheet
|
||||
from auto_archiver.utils.misc import get_current_timestamp
|
||||
|
||||
|
||||
class GsheetsDb(Database):
    """
    NB: only works if GsheetFeeder is used.
    could be updated in the future to support non-GsheetFeeder metadata
    """

    def started(self, item: Metadata) -> None:
        """Mark the item's row as "Archive in progress"."""
        logger.warning(f"STARTED {item}")
        gw, row = self._retrieve_gsheet(item)
        gw.set_cell(row, "status", "Archive in progress")

    def failed(self, item: Metadata, reason: str) -> None:
        """Write the failure reason into the item's status cell (best-effort)."""
        logger.error(f"FAILED {item}")
        self._safe_status_update(item, f"Archive failed {reason}")

    def aborted(self, item: Metadata) -> None:
        """Clear the item's status cell after an aborted run (best-effort)."""
        logger.warning(f"ABORTED {item}")
        self._safe_status_update(item, "")

    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
        """check if the given item has been archived already"""
        return False

    def done(self, item: Metadata, cached: bool = False) -> None:
        """archival result ready - should be saved to DB"""
        logger.success(f"DONE {item.get_url()}")
        gw, row = self._retrieve_gsheet(item)
        # self._safe_status_update(item, 'done')

        cell_updates = []
        row_values = gw.get_row(row)

        def batch_if_valid(col, val, final_value=None):
            # Queue an update only when there is a value, the column exists and
            # the cell is currently empty, so existing content is never overwritten.
            final_value = final_value or val
            try:
                if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
                    cell_updates.append((row, col, final_value))
            except Exception as e:
                logger.error(f"Unable to batch {col}={final_value} due to {e}")

        status_message = item.status
        if cached:
            status_message = f"[cached] {status_message}"
        # The status cell is always written, regardless of its current content.
        cell_updates.append((row, "status", status_message))

        media: Media = item.get_final_media()
        if hasattr(media, "urls"):
            batch_if_valid("archive", "\n".join(media.urls))
        # `True` is only the truthiness gate here; the written value is the timestamp.
        batch_if_valid("date", True, get_current_timestamp())
        batch_if_valid("title", item.get_title())
        batch_if_valid("text", item.get("content", ""))
        batch_if_valid("timestamp", item.get_timestamp())
        if media:
            batch_if_valid("hash", media.get("hash", "not-calculated"))

        # merge all pdq hashes into a single string, if present
        pdq_hashes = []
        all_media = item.get_all_media()
        for m in all_media:
            if pdq := m.get("pdq_hash"):
                pdq_hashes.append(pdq)
        if len(pdq_hashes):
            batch_if_valid("pdq_hash", ",".join(pdq_hashes))

        if (screenshot := item.get_media_by_id("screenshot")) and hasattr(
            screenshot, "urls"
        ):
            batch_if_valid("screenshot", "\n".join(screenshot.urls))

        if thumbnail := item.get_first_image("thumbnail"):
            if hasattr(thumbnail, "urls"):
                batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")')

        if browsertrix := item.get_media_by_id("browsertrix"):
            batch_if_valid("wacz", "\n".join(browsertrix.urls))
            batch_if_valid(
                "replaywebpage",
                "\n".join(
                    [
                        f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}"
                        for wacz in browsertrix.urls
                    ]
                ),
            )

        gw.batch_set_cell(cell_updates)

    def _safe_status_update(self, item: Metadata, new_status: str) -> None:
        """Set the item's status cell, logging (not raising) any failure."""
        try:
            gw, row = self._retrieve_gsheet(item)
            gw.set_cell(row, "status", new_status)
        except Exception as e:
            logger.debug(f"Unable to update sheet: {e}")

    def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
        """
        Return the worksheet wrapper and row number stored in the item's "gsheet" context.

        Raises:
            ValueError: if the item carries no "gsheet" context, i.e. it was not
                produced by GsheetFeeder.
        """
        gsheet = item.get_context("gsheet")
        if not gsheet:
            # Fail with an explicit error instead of returning unbound locals.
            message = f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder."
            logger.error(message)
            raise ValueError(message)

        gw: GWorksheet = gsheet.get("worksheet")
        row: int = gsheet.get("row")
        return gw, row
|
||||
@@ -1,2 +0,0 @@
|
||||
from .gworksheet import GWorksheet
|
||||
from .gsheet_feeder import GsheetsFeeder
|
||||
@@ -1,95 +0,0 @@
|
||||
"""
|
||||
GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver.
|
||||
|
||||
This reads data from Google Sheets and filters rows based on user-defined rules.
|
||||
The filtered rows are processed into `Metadata` objects.
|
||||
|
||||
### Key properties
|
||||
- Validates the sheet's structure and filters rows based on input configurations.
|
||||
- Ensures only rows with valid URLs and unprocessed statuses are included.
|
||||
"""
|
||||
import os
|
||||
import gspread
|
||||
|
||||
from loguru import logger
|
||||
from slugify import slugify
|
||||
|
||||
from auto_archiver.core import Feeder
|
||||
from auto_archiver.core import Metadata
|
||||
from . import GWorksheet
|
||||
|
||||
|
||||
class GsheetsFeeder(Feeder):
    """Feeder that yields one ``Metadata`` per unprocessed row of a Google Sheet.

    Worksheets are filtered by the ``allow_worksheets`` / ``block_worksheets``
    configs and must expose both a 'url' and a 'status' column. Only rows with
    a non-empty URL and an empty status are yielded.
    """

    def setup(self) -> None:
        """Create the gspread client and check that a sheet is configured.

        Raises:
            ValueError: if neither ``sheet`` nor ``sheet_id`` is set.
        """
        self.gsheets_client = gspread.service_account(filename=self.service_account)
        # TODO mv to validators
        if not self.sheet and not self.sheet_id:
            raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.")

    def open_sheet(self):
        """Open the workbook by name when ``sheet`` is set, otherwise by ``sheet_id``."""
        if self.sheet:
            return self.gsheets_client.open(self.sheet)
        else:  # self.sheet_id
            return self.gsheets_client.open_by_key(self.sheet_id)

    def __iter__(self) -> Metadata:
        """Yield a ``Metadata`` for every eligible row of every eligible worksheet."""
        sh = self.open_sheet()
        for ii, worksheet in enumerate(sh.worksheets()):
            if not self.should_process_sheet(worksheet.title):
                logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
                continue
            logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
            gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
            if len(missing_cols := self.missing_required_columns(gw)):
                logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
                continue

            # process and yield metadata here:
            yield from self._process_rows(gw)
            logger.success(f'Finished worksheet {worksheet.title}')

    def _process_rows(self, gw: GWorksheet):
        """Yield a ``Metadata`` for each row below the header with a URL and empty status."""
        for row in range(1 + self.header, gw.count_rows() + 1):
            url = gw.get_cell(row, 'url').strip()
            if not len(url):
                continue
            original_status = gw.get_cell(row, 'status')
            # when the first read says the status is empty, read it again with
            # fresh=True before deciding to archive the row
            status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
            # TODO: custom status parser(?) aka should_retry_from_status
            if status not in ['', None]:
                continue

            # All checks done - archival process starts here
            m = Metadata().set_url(url)
            self._set_context(m, gw, row)
            yield m

    def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
        """Attach the sheet location and, if present, the storage folder to ``m``.

        Sets the ``gsheet`` context to ``{"row": row, "worksheet": gw}``. When
        the row has a non-empty 'folder' cell, also sets the ``folder`` context,
        optionally suffixed with the workbook and worksheet names.

        Returns:
            The same ``m`` instance, as the signature declares.
        """
        m.set_context("gsheet", {"row": row, "worksheet": gw})

        # read the folder cell once instead of twice
        raw_folder = gw.get_cell_or_default(row, 'folder', "")
        folder = '' if raw_folder is None else slugify(raw_folder.strip())
        if len(folder):
            if self.use_sheet_names_in_stored_paths:
                # `sheet` is unset when the workbook is configured through
                # `sheet_id` only; fall back to the id instead of slugifying None
                workbook = self.sheet or self.sheet_id or ""
                m.set_context("folder", os.path.join(folder, slugify(workbook), slugify(gw.wks.title)))
            else:
                m.set_context("folder", folder)
        return m

    def should_process_sheet(self, sheet_name: str) -> bool:
        """Return True unless the allow/block rules exclude ``sheet_name``."""
        if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
            # ALLOW rules exist AND sheet name not explicitly allowed
            return False
        if len(self.block_worksheets) and sheet_name in self.block_worksheets:
            # BLOCK rules exist AND sheet name is blocked
            return False
        return True

    def missing_required_columns(self, gw: GWorksheet) -> list:
        """Return the subset of ['url', 'status'] that ``gw`` does not have."""
        return [required_col for required_col in ['url', 'status'] if not gw.col_exists(required_col)]
|
||||
2
src/auto_archiver/modules/gsheet_feeder_db/__init__.py
Normal file
2
src/auto_archiver/modules/gsheet_feeder_db/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
from .gworksheet import GWorksheet
|
||||
from .gsheet_feeder_db import GsheetsFeederDB
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "Google Sheets Feeder",
|
||||
"type": ["feeder"],
|
||||
"entry_point": "gsheet_feeder::GsheetsFeeder",
|
||||
"name": "Google Sheets Feeder Database",
|
||||
"type": ["feeder", "database"],
|
||||
"entry_point": "gsheet_feeder_db::GsheetsFeederDB",
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "gspread", "slugify"],
|
||||
@@ -12,7 +12,9 @@
|
||||
"default": None,
|
||||
"help": "the id of the sheet to archive (alternative to 'sheet' config)",
|
||||
},
|
||||
"header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
|
||||
"header": {"default": 1,
|
||||
"type": "int",
|
||||
"help": "index of the header row (starts at 1)", "type": "int"},
|
||||
"service_account": {
|
||||
"default": "secrets/service_account.json",
|
||||
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
|
||||
@@ -51,10 +53,23 @@
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
"type": "bool",
|
||||
},
|
||||
"allow_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) explicitly block some worksheets from being processed",
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": True,
|
||||
"type": "bool",
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
}
|
||||
},
|
||||
"description": """
|
||||
GsheetsFeeder
|
||||
A Google Sheets-based feeder for the Auto Archiver.
|
||||
GsheetsFeederDatabase
|
||||
A Google Sheets-based feeder and optional database for the Auto Archiver.
|
||||
|
||||
This reads data from Google Sheets and filters rows based on user-defined rules.
|
||||
The filtered rows are processed into `Metadata` objects.
|
||||
@@ -64,11 +79,16 @@
|
||||
- Processes only worksheets allowed by the `allow_worksheets` and `block_worksheets` configurations.
|
||||
- Ensures only rows with valid URLs and unprocessed statuses are included for archival.
|
||||
- Supports organizing stored files into folder paths based on sheet and worksheet names.
|
||||
- If the database is enabled, this updates the Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
|
||||
- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
|
||||
- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
|
||||
- Skips redundant updates for empty or invalid data fields.
|
||||
|
||||
### Setup
|
||||
- Requires a Google Service Account JSON file for authentication, which by default is read from `secrets/service_account.json` (override the path with the `service_account` config).
|
||||
To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).
|
||||
- Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.
|
||||
- Customize the column names in your Google sheet using the `columns` configuration.
|
||||
- The Google Sheet can be used solely as a feeder or as a feeder and database, but note you can't currently feed into the database from an alternate feeder.
|
||||
""",
|
||||
}
|
||||
196
src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py
Normal file
196
src/auto_archiver/modules/gsheet_feeder_db/gsheet_feeder_db.py
Normal file
@@ -0,0 +1,196 @@
|
||||
"""
|
||||
GsheetsFeederDB: A Google Sheets-based feeder and database for the Auto Archiver.
|
||||
|
||||
This reads data from Google Sheets and filters rows based on user-defined rules.
|
||||
The filtered rows are processed into `Metadata` objects.
|
||||
|
||||
### Key properties
|
||||
- validates the sheet's structure and filters rows based on input configurations.
|
||||
- Ensures only rows with valid URLs and unprocessed statuses are included.
|
||||
"""
|
||||
import os
|
||||
from typing import Tuple, Union
|
||||
from urllib.parse import quote
|
||||
|
||||
import gspread
|
||||
from loguru import logger
|
||||
from slugify import slugify
|
||||
|
||||
from auto_archiver.core import Feeder, Database, Media
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.modules.gsheet_feeder_db import GWorksheet
|
||||
from auto_archiver.utils.misc import calculate_file_hash, get_current_timestamp
|
||||
|
||||
|
||||
class GsheetsFeederDB(Feeder, Database):
    """Google Sheets feeder that also records archival results in the sheet.

    As a feeder, iterating the instance yields one ``Metadata`` per row that
    has a URL and an empty status. As a database, ``started`` / ``failed`` /
    ``aborted`` / ``done`` update the row the item came from; that row is
    found through the ``gsheet`` context written by ``_set_context``.
    """

    def setup(self) -> None:
        """Create the gspread client and check that a sheet is configured.

        Raises:
            ValueError: if neither ``sheet`` nor ``sheet_id`` is set.
        """
        self.gsheets_client = gspread.service_account(filename=self.service_account)
        # TODO mv to validators
        if not self.sheet and not self.sheet_id:
            raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.")

    def open_sheet(self):
        """Open the workbook by name when ``sheet`` is set, otherwise by ``sheet_id``."""
        if self.sheet:
            return self.gsheets_client.open(self.sheet)
        else:  # self.sheet_id
            return self.gsheets_client.open_by_key(self.sheet_id)

    def __iter__(self) -> Metadata:
        """Yield a ``Metadata`` for every eligible row of every eligible worksheet."""
        sh = self.open_sheet()
        for ii, worksheet in enumerate(sh.worksheets()):
            if not self.should_process_sheet(worksheet.title):
                logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
                continue
            logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
            gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
            if len(missing_cols := self.missing_required_columns(gw)):
                logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
                continue

            # process and yield metadata here:
            yield from self._process_rows(gw)
            logger.success(f'Finished worksheet {worksheet.title}')

    def _process_rows(self, gw: GWorksheet):
        """Yield a ``Metadata`` for each row below the header with a URL and empty status."""
        for row in range(1 + self.header, gw.count_rows() + 1):
            url = gw.get_cell(row, 'url').strip()
            if not len(url):
                continue
            original_status = gw.get_cell(row, 'status')
            # when the first read says the status is empty, read it again with
            # fresh=True before deciding to archive the row
            status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
            # TODO: custom status parser(?) aka should_retry_from_status
            if status not in ['', None]:
                continue

            # All checks done - archival process starts here
            m = Metadata().set_url(url)
            self._set_context(m, gw, row)
            yield m

    def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
        """Attach the sheet location and, if present, the storage folder to ``m``.

        Sets the ``gsheet`` context to ``{"row": row, "worksheet": gw}``. When
        the row has a non-empty 'folder' cell, also sets the ``folder`` context,
        optionally suffixed with the workbook and worksheet names.

        Returns:
            The same ``m`` instance, as the signature declares.
        """
        # TODO: Check folder value not being recognised
        m.set_context("gsheet", {"row": row, "worksheet": gw})

        # read the folder cell once instead of twice
        raw_folder = gw.get_cell_or_default(row, 'folder', "")
        folder = '' if raw_folder is None else slugify(raw_folder.strip())
        if len(folder):
            if self.use_sheet_names_in_stored_paths:
                # `sheet` is unset when the workbook is configured through
                # `sheet_id` only; fall back to the id instead of slugifying None
                workbook = self.sheet or self.sheet_id or ""
                m.set_context("folder", os.path.join(folder, slugify(workbook), slugify(gw.wks.title)))
            else:
                m.set_context("folder", folder)
        return m

    def should_process_sheet(self, sheet_name: str) -> bool:
        """Return True unless the allow/block rules exclude ``sheet_name``."""
        if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:
            # ALLOW rules exist AND sheet name not explicitly allowed
            return False
        if len(self.block_worksheets) and sheet_name in self.block_worksheets:
            # BLOCK rules exist AND sheet name is blocked
            return False
        return True

    def missing_required_columns(self, gw: GWorksheet) -> list:
        """Return the subset of ['url', 'status'] that ``gw`` does not have."""
        return [required_col for required_col in ['url', 'status'] if not gw.col_exists(required_col)]

    def started(self, item: Metadata) -> None:
        """Mark the item's row as "Archive in progress".

        Unlike ``failed``/``aborted`` this does not swallow errors: a missing
        ``gsheet`` context or a failed cell write propagates to the caller.
        """
        logger.warning(f"STARTED {item}")
        gw, row = self._retrieve_gsheet(item)
        gw.set_cell(row, "status", "Archive in progress")

    def failed(self, item: Metadata, reason: str) -> None:
        """Best-effort: write "Archive failed <reason>" into the status cell."""
        logger.error(f"FAILED {item}")
        self._safe_status_update(item, f"Archive failed {reason}")

    def aborted(self, item: Metadata) -> None:
        """Best-effort: clear the status cell (an empty status is what the feeder picks up)."""
        logger.warning(f"ABORTED {item}")
        self._safe_status_update(item, "")

    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
        """check if the given item has been archived already"""
        return False

    def done(self, item: Metadata, cached: bool = False) -> None:
        """archival result ready - should be saved to DB

        Collects the status plus every available result field into a list of
        ``(row, column, value)`` updates and sends them with one
        ``batch_set_cell`` call. Apart from the status, a column is only
        written when it exists in the sheet and its cell is currently empty.
        """
        logger.success(f"DONE {item.get_url()}")
        gw, row = self._retrieve_gsheet(item)
        # self._safe_status_update(item, 'done')

        cell_updates = []
        row_values = gw.get_row(row)

        def batch_if_valid(col, val, final_value=None):
            # `val` gates the update; `final_value` (when given) is what gets
            # written, e.g. val=True with a timestamp as the stored value
            final_value = final_value or val
            try:
                if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
                    cell_updates.append((row, col, final_value))
            except Exception as e:
                logger.error(f"Unable to batch {col}={final_value} due to {e}")

        status_message = item.status
        if cached:
            status_message = f"[cached] {status_message}"
        # the status is always written, bypassing the "cell is empty" rule
        cell_updates.append((row, "status", status_message))

        media: Media = item.get_final_media()
        if hasattr(media, "urls"):
            batch_if_valid("archive", "\n".join(media.urls))
        batch_if_valid("date", True, get_current_timestamp())
        batch_if_valid("title", item.get_title())
        batch_if_valid("text", item.get("content", ""))
        batch_if_valid("timestamp", item.get_timestamp())
        if media:
            batch_if_valid("hash", media.get("hash", "not-calculated"))

        # merge all pdq hashes into a single string, if present
        pdq_hashes = [pdq for m in item.get_all_media() if (pdq := m.get("pdq_hash"))]
        if len(pdq_hashes):
            batch_if_valid("pdq_hash", ",".join(pdq_hashes))

        if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
            batch_if_valid("screenshot", "\n".join(screenshot.urls))

        if thumbnail := item.get_first_image("thumbnail"):
            # a thumbnail with an empty `urls` list would otherwise raise
            # IndexError and abort the whole sheet update
            if hasattr(thumbnail, "urls") and thumbnail.urls:
                batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")')

        if browsertrix := item.get_media_by_id("browsertrix"):
            batch_if_valid("wacz", "\n".join(browsertrix.urls))
            batch_if_valid(
                "replaywebpage",
                "\n".join(
                    [
                        f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}"
                        for wacz in browsertrix.urls
                    ]
                ),
            )

        gw.batch_set_cell(cell_updates)

    def _safe_status_update(self, item: Metadata, new_status: str) -> None:
        """Write ``new_status`` to the status cell, logging (not raising) on any error."""
        try:
            gw, row = self._retrieve_gsheet(item)
            gw.set_cell(row, "status", new_status)
        except Exception as e:
            logger.debug(f"Unable to update sheet: {e}")

    def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
        """Return the worksheet wrapper and row number stored on ``item``.

        Raises:
            ValueError: if the item carries no ``gsheet`` context. Previously
                this path fell through to ``return gw, row`` with both names
                unbound and surfaced as an opaque ``UnboundLocalError``.
        """
        gsheet = item.get_context("gsheet")
        if not gsheet:
            message = f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder."
            logger.error(message)
            raise ValueError(message)

        gw: GWorksheet = gsheet.get("worksheet")
        row: int = gsheet.get("row")
        return gw, row
|
||||
@@ -7,7 +7,9 @@
|
||||
"bin": [""]
|
||||
},
|
||||
"configs": {
|
||||
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
|
||||
"detect_thumbnails": {"default": True,
|
||||
"help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'",
|
||||
"type": "bool"},
|
||||
},
|
||||
"description": """ """,
|
||||
}
|
||||
|
||||
@@ -10,25 +10,30 @@
|
||||
"requires_setup": True,
|
||||
"configs": {
|
||||
"username": {"required": True,
|
||||
"help": "a valid Instagram username"},
|
||||
"help": "A valid Instagram username."},
|
||||
"password": {
|
||||
"required": True,
|
||||
"help": "the corresponding Instagram account password",
|
||||
"help": "The corresponding Instagram account password.",
|
||||
},
|
||||
"download_folder": {
|
||||
"default": "instaloader",
|
||||
"help": "name of a folder to temporarily download content to",
|
||||
"help": "Name of a folder to temporarily download content to.",
|
||||
},
|
||||
"session_file": {
|
||||
"default": "secrets/instaloader.session",
|
||||
"help": "path to the instagram session which saves session credentials",
|
||||
"help": "Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one.",
|
||||
},
|
||||
# TODO: fine-grain
|
||||
# "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
|
||||
},
|
||||
"description": """
|
||||
Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts
|
||||
and user profiles, downloading as much information as possible, including images, videos, text, stories,
|
||||
Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram.
|
||||
|
||||
> ⚠️ **Warning**
|
||||
> This module is not actively maintained due to known issues with blocking.
|
||||
> Prioritise usage of the [Instagram Tbot Extractor](./instagram_tbot_extractor.md) and [Instagram API Extractor](./instagram_api_extractor.md)
|
||||
|
||||
This class handles both individual posts and user profiles, downloading as much information as possible, including images, videos, text, stories,
|
||||
highlights, and tagged posts.
|
||||
Authentication is required via username/password or a session file.
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
highlights, and tagged posts. Authentication is required via username/password or a session file.
|
||||
|
||||
"""
|
||||
import re, os, shutil, traceback
|
||||
import re, os, shutil
|
||||
import instaloader
|
||||
from loguru import logger
|
||||
|
||||
@@ -15,10 +15,9 @@ class InstagramExtractor(Extractor):
|
||||
"""
|
||||
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
|
||||
"""
|
||||
|
||||
# NB: post regex should be tested before profile
|
||||
|
||||
valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/")
|
||||
|
||||
# https://regex101.com/r/MGPquX/1
|
||||
post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url))
|
||||
# https://regex101.com/r/6Wbsxa/1
|
||||
@@ -28,19 +27,22 @@ class InstagramExtractor(Extractor):
|
||||
def setup(self) -> None:
|
||||
|
||||
self.insta = instaloader.Instaloader(
|
||||
download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}"
|
||||
download_geotags=True,
|
||||
download_comments=True,
|
||||
compress_json=False,
|
||||
dirname_pattern=self.download_folder,
|
||||
filename_pattern="{date_utc}_UTC_{target}__{typename}"
|
||||
)
|
||||
try:
|
||||
self.insta.load_session_from_file(self.username, self.session_file)
|
||||
except Exception as e:
|
||||
logger.error(f"Unable to login from session file: {e}\n{traceback.format_exc()}")
|
||||
try:
|
||||
self.insta.login(self.username, config.instagram_self.password)
|
||||
# TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758
|
||||
logger.debug(f"Session file failed", exc_info=True)
|
||||
logger.info("No valid session file found - Attempting login with use and password.")
|
||||
self.insta.login(self.username, self.password)
|
||||
self.insta.save_session_to_file(self.session_file)
|
||||
except Exception as e2:
|
||||
logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}")
|
||||
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
|
||||
@@ -104,7 +104,7 @@ class InstagramTbotExtractor(Extractor):
|
||||
message = ""
|
||||
time.sleep(3)
|
||||
# media is added before text by the bot so it can be used as a stop-logic mechanism
|
||||
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
|
||||
while attempts < max(self.timeout - 3, 3) and (not message or not len(seen_media)):
|
||||
attempts += 1
|
||||
time.sleep(1)
|
||||
for post in self.client.iter_messages(chat, min_id=since_id):
|
||||
|
||||
@@ -17,7 +17,9 @@
|
||||
"choices": ["random", "static"],
|
||||
},
|
||||
"save_to": {"default": "./local_archive", "help": "folder where to save archived content"},
|
||||
"save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
|
||||
"save_absolute": {"default": False,
|
||||
"type": "bool",
|
||||
"help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
|
||||
},
|
||||
"description": """
|
||||
LocalStorage: A storage module for saving archived content locally on the filesystem.
|
||||
|
||||
@@ -6,13 +6,25 @@
|
||||
"python": ["loguru", "selenium"],
|
||||
},
|
||||
"configs": {
|
||||
"width": {"default": 1280, "help": "width of the screenshots"},
|
||||
"height": {"default": 720, "help": "height of the screenshots"},
|
||||
"timeout": {"default": 60, "help": "timeout for taking the screenshot"},
|
||||
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
|
||||
"width": {"default": 1280,
|
||||
"type": "int",
|
||||
"help": "width of the screenshots"},
|
||||
"height": {"default": 1024,
|
||||
"type": "int",
|
||||
"help": "height of the screenshots"},
|
||||
"timeout": {"default": 60,
|
||||
"type": "int",
|
||||
"help": "timeout for taking the screenshot"},
|
||||
"sleep_before_screenshot": {"default": 4,
|
||||
"type": "int",
|
||||
"help": "seconds to wait for the pages to load before taking screenshot"},
|
||||
"http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
|
||||
"save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
|
||||
"print_options": {"default": {}, "help": "options to pass to the pdf printer"}
|
||||
"save_to_pdf": {"default": False,
|
||||
"type": "bool",
|
||||
"help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
|
||||
"print_options": {"default": {},
|
||||
"help": "options to pass to the pdf printer, in JSON format. See https://www.selenium.dev/documentation/webdriver/interactions/print_page/ for more information",
|
||||
"type": "json_loader"},
|
||||
},
|
||||
"description": """
|
||||
Captures screenshots and optionally saves web pages as PDFs using a WebDriver.
|
||||
|
||||
@@ -7,7 +7,9 @@
|
||||
},
|
||||
'entry_point': 'ssl_enricher::SSLEnricher',
|
||||
"configs": {
|
||||
"skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"},
|
||||
"skip_when_nothing_archived": {"default": True,
|
||||
"type": 'bool',
|
||||
"help": "if true, will skip enriching when no media is archived"},
|
||||
},
|
||||
"description": """
|
||||
Retrieves SSL certificate information for a domain and stores it as a file.
|
||||
|
||||
@@ -14,7 +14,9 @@
|
||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||
"bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"},
|
||||
"session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
||||
"join_channels": {"default": True, "help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
|
||||
"join_channels": {"default": True,
|
||||
"type": "bool",
|
||||
"help": "disables the initial setup with channel_invites config, useful if you have a lot and get stuck"},
|
||||
"channel_invites": {
|
||||
"default": {},
|
||||
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
|
||||
|
||||
@@ -17,11 +17,19 @@
|
||||
"configs": {
|
||||
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
|
||||
"docker_commands": {"default": None, "help":"if a custom docker invocation is needed"},
|
||||
"timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
|
||||
"extract_media": {"default": False, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
|
||||
"extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
|
||||
"timeout": {"default": 120,
|
||||
"type": "int",
|
||||
"help": "timeout for WACZ generation in seconds", "type": "int"},
|
||||
"extract_media": {"default": False,
|
||||
"type": 'bool',
|
||||
"help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
|
||||
},
|
||||
"extract_screenshot": {"default": True,
|
||||
"type": 'bool',
|
||||
"help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."
|
||||
},
|
||||
"socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
|
||||
"socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
|
||||
"socks_proxy_port": {"default": None, "type":"int", "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
|
||||
"proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
|
||||
},
|
||||
"description": """
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
"configs": {
|
||||
"timeout": {
|
||||
"default": 15,
|
||||
"type": "int",
|
||||
"help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.",
|
||||
},
|
||||
"if_not_archived_within": {
|
||||
|
||||
@@ -10,8 +10,12 @@
|
||||
"help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
|
||||
"api_key": {"required": True,
|
||||
"help": "WhisperApi api key for authentication"},
|
||||
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
|
||||
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
|
||||
"include_srt": {"default": False,
|
||||
"type": "bool",
|
||||
"help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
|
||||
"timeout": {"default": 90,
|
||||
"type": "int",
|
||||
"help": "How many seconds to wait at most for a successful job completion."},
|
||||
"action": {"default": "translate",
|
||||
"help": "which Whisper operation to execute",
|
||||
"choices": ["transcribe", "translate", "language_detection"]},
|
||||
|
||||
@@ -1,18 +1,23 @@
|
||||
""" This Webdriver class acts as a context manager for the selenium webdriver. """
|
||||
from __future__ import annotations
|
||||
from selenium import webdriver
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from selenium.webdriver.common.proxy import Proxy, ProxyType
|
||||
from selenium.webdriver.common.print_page_options import PrintOptions
|
||||
|
||||
from loguru import logger
|
||||
from selenium.webdriver.common.by import By
|
||||
import os
|
||||
import time
|
||||
|
||||
#import domain_for_url
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
from http.cookiejar import MozillaCookieJar
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common import exceptions as selenium_exceptions
|
||||
from selenium.webdriver.common.print_page_options import PrintOptions
|
||||
from selenium.webdriver.common.by import By
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class CookieSettingDriver(webdriver.Firefox):
|
||||
|
||||
facebook_accept_cookies: bool
|
||||
@@ -20,6 +25,10 @@ class CookieSettingDriver(webdriver.Firefox):
|
||||
cookiejar: MozillaCookieJar
|
||||
|
||||
def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
|
||||
if os.environ.get('RUNNING_IN_DOCKER'):
|
||||
# Selenium doesn't support linux-aarch64 driver, we need to set this manually
|
||||
kwargs['service'] = webdriver.FirefoxService(executable_path='/usr/local/bin/geckodriver')
|
||||
|
||||
super(CookieSettingDriver, self).__init__(*args, **kwargs)
|
||||
self.cookies = cookies
|
||||
self.cookiejar = cookiejar
|
||||
@@ -64,14 +73,29 @@ class CookieSettingDriver(webdriver.Firefox):
|
||||
time.sleep(2)
|
||||
except Exception as e:
|
||||
logger.warning(f'Failed on fb accept cookies.', e)
|
||||
|
||||
|
||||
# now get the actual URL
|
||||
super(CookieSettingDriver, self).get(url)
|
||||
if self.facebook_accept_cookies:
|
||||
# try and click the 'close' button on the 'login' window to close it
|
||||
close_button = self.find_element(By.XPATH, "//div[@role='dialog']//div[@aria-label='Close']")
|
||||
if close_button:
|
||||
close_button.click()
|
||||
try:
|
||||
xpath = "//div[@role='dialog']//div[@aria-label='Close']"
|
||||
WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click()
|
||||
except selenium_exceptions.NoSuchElementException:
|
||||
logger.warning("Unable to find the 'close' button on the facebook login window")
|
||||
pass
|
||||
|
||||
else:
|
||||
|
||||
# for all other sites, try and use some common button text to reject/accept cookies
|
||||
for text in ["Refuse non-essential cookies", "Decline optional cookies", "Reject additional cookies", "Reject all", "Accept all cookies"]:
|
||||
try:
|
||||
xpath = f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text.lower()}')]"
|
||||
WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click()
|
||||
break
|
||||
except selenium_exceptions.WebDriverException:
|
||||
pass
|
||||
|
||||
|
||||
class Webdriver:
|
||||
@@ -90,7 +114,6 @@ class Webdriver:
|
||||
setattr(self.print_options, k, v)
|
||||
|
||||
def __enter__(self) -> webdriver:
|
||||
|
||||
options = webdriver.FirefoxOptions()
|
||||
options.add_argument("--headless")
|
||||
options.add_argument(f'--proxy-server={self.http_proxy}')
|
||||
@@ -105,7 +128,7 @@ class Webdriver:
|
||||
self.driver.set_window_size(self.width, self.height)
|
||||
self.driver.set_page_load_timeout(self.timeout_seconds)
|
||||
self.driver.print_options = self.print_options
|
||||
except TimeoutException as e:
|
||||
except selenium_exceptions.TimeoutException as e:
|
||||
logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")
|
||||
|
||||
return self.driver
|
||||
|
||||
@@ -2,7 +2,7 @@ import pytest
|
||||
from datetime import datetime
|
||||
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.modules.atlos_db import AtlosDb
|
||||
from auto_archiver.modules.atlos_feeder_db_storage import AtlosFeederDbStorage as AtlosDb
|
||||
|
||||
|
||||
class FakeAPIResponse:
|
||||
@@ -12,19 +12,28 @@ class FakeAPIResponse:
|
||||
self._data = data
|
||||
self.raise_error = raise_error
|
||||
|
||||
def json(self) -> dict:
|
||||
return self._data
|
||||
|
||||
def raise_for_status(self) -> None:
|
||||
if self.raise_error:
|
||||
raise Exception("HTTP error")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def atlos_db(setup_module) -> AtlosDb:
|
||||
def atlos_db(setup_module, mocker) -> AtlosDb:
|
||||
"""Fixture for AtlosDb."""
|
||||
configs: dict = {
|
||||
"api_token": "abc123",
|
||||
"atlos_url": "https://platform.atlos.org",
|
||||
}
|
||||
return setup_module("atlos_db", configs)
|
||||
mocker.patch("requests.Session")
|
||||
atlos_feeder = setup_module("atlos_feeder_db_storage", configs)
|
||||
fake_session = mocker.MagicMock()
|
||||
# Configure the default response to have no results so that __iter__ terminates
|
||||
fake_session.get.return_value = FakeAPIResponse({"next": None, "results": []})
|
||||
atlos_feeder.session = fake_session
|
||||
return atlos_feeder
|
||||
|
||||
|
||||
def test_failed_no_atlos_id(atlos_db, metadata, mocker):
|
||||
@@ -38,25 +47,20 @@ def test_failed_with_atlos_id(atlos_db, metadata, mocker):
|
||||
"""Test failed() posts failure when atlos_id is present."""
|
||||
metadata.set("atlos_id", 42)
|
||||
fake_resp = FakeAPIResponse({}, raise_error=False)
|
||||
post_mock = mocker.patch("requests.post", return_value=fake_resp)
|
||||
post_mock = mocker.patch.object(atlos_db, "_post", return_value=fake_resp)
|
||||
atlos_db.failed(metadata, "failure reason")
|
||||
expected_url = (
|
||||
f"{atlos_db.atlos_url}/api/v2/source_material/metadata/42/auto_archiver"
|
||||
)
|
||||
expected_headers = {"Authorization": f"Bearer {atlos_db.api_token}"}
|
||||
expected_endpoint = f"/api/v2/source_material/metadata/42/auto_archiver"
|
||||
expected_json = {
|
||||
"metadata": {"processed": True, "status": "error", "error": "failure reason"}
|
||||
}
|
||||
post_mock.assert_called_once_with(
|
||||
expected_url, headers=expected_headers, json=expected_json
|
||||
)
|
||||
post_mock.assert_called_once_with(expected_endpoint, json=expected_json)
|
||||
|
||||
|
||||
def test_failed_http_error(atlos_db, metadata, mocker):
|
||||
"""Test failed() raises exception on HTTP error."""
|
||||
metadata.set("atlos_id", 42)
|
||||
fake_resp = FakeAPIResponse({}, raise_error=True)
|
||||
mocker.patch("requests.post", return_value=fake_resp)
|
||||
# Patch _post to raise an exception instead of returning a fake response.
|
||||
mocker.patch.object(atlos_db, "_post", side_effect=Exception("HTTP error"))
|
||||
with pytest.raises(Exception, match="HTTP error"):
|
||||
atlos_db.failed(metadata, "failure reason")
|
||||
|
||||
@@ -81,12 +85,9 @@ def test_done_with_atlos_id(atlos_db, metadata, mocker):
|
||||
now = datetime.now()
|
||||
metadata.set("timestamp", now)
|
||||
fake_resp = FakeAPIResponse({}, raise_error=False)
|
||||
post_mock = mocker.patch("requests.post", return_value=fake_resp)
|
||||
post_mock = mocker.patch.object(atlos_db, "_post", return_value=fake_resp)
|
||||
atlos_db.done(metadata)
|
||||
expected_url = (
|
||||
f"{atlos_db.atlos_url}/api/v2/source_material/metadata/99/auto_archiver"
|
||||
)
|
||||
expected_headers = {"Authorization": f"Bearer {atlos_db.api_token}"}
|
||||
expected_endpoint = f"/api/v2/source_material/metadata/99/auto_archiver"
|
||||
expected_results = metadata.metadata.copy()
|
||||
expected_results["timestamp"] = now.isoformat()
|
||||
expected_json = {
|
||||
@@ -96,15 +97,13 @@ def test_done_with_atlos_id(atlos_db, metadata, mocker):
|
||||
"results": expected_results,
|
||||
}
|
||||
}
|
||||
post_mock.assert_called_once_with(
|
||||
expected_url, headers=expected_headers, json=expected_json
|
||||
)
|
||||
post_mock.assert_called_once_with(expected_endpoint, json=expected_json)
|
||||
|
||||
|
||||
def test_done_http_error(atlos_db, metadata, mocker):
|
||||
"""Test done() raises exception on HTTP error."""
|
||||
"""Test done() raises an exception on HTTP error."""
|
||||
metadata.set("atlos_id", 123)
|
||||
fake_resp = FakeAPIResponse({}, raise_error=True)
|
||||
mocker.patch("requests.post", return_value=fake_resp)
|
||||
# Patch _post to raise an exception.
|
||||
mocker.patch.object(atlos_db, "_post", side_effect=Exception("HTTP error"))
|
||||
with pytest.raises(Exception, match="HTTP error"):
|
||||
atlos_db.done(metadata)
|
||||
|
||||
@@ -2,8 +2,7 @@ from datetime import datetime, timezone
|
||||
import pytest
|
||||
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.modules.gsheet_db import GsheetsDb
|
||||
from auto_archiver.modules.gsheet_feeder import GWorksheet
|
||||
from auto_archiver.modules.gsheet_feeder_db import GsheetsFeederDB, GWorksheet
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -32,8 +31,9 @@ def mock_metadata(mocker):
|
||||
@pytest.fixture
|
||||
def metadata():
|
||||
metadata = Metadata()
|
||||
metadata.add_media(Media(filename="screenshot.png", urls=["http://example.com/screenshot.png"]).set("id", "screenshot"))
|
||||
metadata.add_media(Media(filename="browsertrix", urls=["http://example.com/browsertrix.wacz"]).set("id", "browsertrix"))
|
||||
metadata.add_media(Media(filename="screenshot", urls=["http://example.com/screenshot.png"]))
|
||||
metadata.add_media(Media(filename="browsertrix", urls=["http://example.com/browsertrix.wacz"]))
|
||||
metadata.add_media(Media(filename="thumbnail", urls=["http://example.com/thumbnail.png"]))
|
||||
metadata.set_url("http://example.com")
|
||||
metadata.set_title("Example Title")
|
||||
metadata.set_content("Example Content")
|
||||
@@ -52,12 +52,19 @@ def mock_media(mocker):
|
||||
return mock_media
|
||||
|
||||
@pytest.fixture
|
||||
def gsheets_db(mock_gworksheet, setup_module, mocker) -> GsheetsDb:
|
||||
db = setup_module("gsheet_db", {
|
||||
"allow_worksheets": "set()",
|
||||
"block_worksheets": "set()",
|
||||
"use_sheet_names_in_stored_paths": "True",
|
||||
})
|
||||
def gsheets_db(mock_gworksheet, setup_module, mocker):
|
||||
mocker.patch("gspread.service_account")
|
||||
config: dict = {
|
||||
"sheet": "testsheet",
|
||||
"sheet_id": None,
|
||||
"header": 1,
|
||||
"service_account": "test/service_account.json",
|
||||
"columns": {'url': 'link', 'status': 'archive status', 'folder': 'destination folder', 'archive': 'archive location', 'date': 'archive date', 'thumbnail': 'thumbnail', 'timestamp': 'upload timestamp', 'title': 'upload title', 'text': 'text content', 'screenshot': 'screenshot', 'hash': 'hash', 'pdq_hash': 'perceptual hashes', 'wacz': 'wacz', 'replaywebpage': 'replaywebpage'},
|
||||
"allow_worksheets": set(),
|
||||
"block_worksheets": set(),
|
||||
"use_sheet_names_in_stored_paths": True,
|
||||
}
|
||||
db = setup_module("gsheet_feeder_db", config)
|
||||
db._retrieve_gsheet = mocker.MagicMock(return_value=(mock_gworksheet, 1))
|
||||
return db
|
||||
|
||||
@@ -79,10 +86,10 @@ def expected_calls(mock_media, fixed_timestamp):
|
||||
(1, 'text', 'Example Content'),
|
||||
(1, 'timestamp', '2025-01-01T00:00:00+00:00'),
|
||||
(1, 'hash', 'not-calculated'),
|
||||
(1, 'screenshot', 'http://example.com/screenshot.png'),
|
||||
(1, 'thumbnail', '=IMAGE("http://example.com/screenshot.png")'),
|
||||
(1, 'wacz', 'http://example.com/browsertrix.wacz'),
|
||||
(1, 'replaywebpage', 'https://replayweb.page/?source=http%3A//example.com/browsertrix.wacz#view=pages&url=http%3A//example.com')
|
||||
# (1, 'screenshot', 'http://example.com/screenshot.png'),
|
||||
# (1, 'thumbnail', '=IMAGE("http://example.com/thumbnail.png")'),
|
||||
# (1, 'wacz', 'http://example.com/browsertrix.wacz'),
|
||||
# (1, 'replaywebpage', 'https://replayweb.page/?source=http%3A%2F%2Fexample.com%2Fbrowsertrix.wacz#view=pages&url=')
|
||||
]
|
||||
|
||||
def test_retrieve_gsheet(gsheets_db, metadata, mock_gworksheet):
|
||||
@@ -107,13 +114,13 @@ def test_aborted(gsheets_db, mock_metadata, mock_gworksheet):
|
||||
|
||||
|
||||
def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls, mocker):
|
||||
mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
|
||||
mocker.patch("auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
|
||||
gsheets_db.done(metadata)
|
||||
mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls)
|
||||
|
||||
|
||||
def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker):
|
||||
mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
|
||||
mocker.patch("auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
|
||||
gsheets_db.done(metadata, cached=True)
|
||||
|
||||
# Verify the status message includes "[cached]"
|
||||
@@ -124,7 +131,7 @@ def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker):
|
||||
def test_done_missing_media(gsheets_db, metadata, mock_gworksheet, mocker):
|
||||
# clear media from metadata
|
||||
metadata.media = []
|
||||
mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
|
||||
mocker.patch("auto_archiver.modules.gsheet_feeder_db.gsheet_feeder_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
|
||||
gsheets_db.done(metadata)
|
||||
# Verify nothing media-related gets updated
|
||||
call_args = mock_gworksheet.batch_set_cell.call_args[0][0]
|
||||
|
||||
@@ -1,21 +1,36 @@
|
||||
import pytest
|
||||
|
||||
from auto_archiver.modules.instagram_extractor import InstagramExtractor
|
||||
from .test_extractor_base import TestExtractorBase
|
||||
|
||||
class TestInstagramExtractor(TestExtractorBase):
|
||||
|
||||
@pytest.fixture
|
||||
def instagram_extractor(setup_module, mocker):
|
||||
|
||||
extractor_module: str = 'instagram_extractor'
|
||||
config: dict = {}
|
||||
config: dict = {
|
||||
"username": "user_name",
|
||||
"password": "password123",
|
||||
"download_folder": "instaloader",
|
||||
"session_file": "secrets/instaloader.session",
|
||||
}
|
||||
fake_loader = mocker.MagicMock()
|
||||
fake_loader.load_session_from_file.return_value = None
|
||||
fake_loader.login.return_value = None
|
||||
fake_loader.save_session_to_file.return_value = None
|
||||
mocker.patch("instaloader.Instaloader", return_value=fake_loader,)
|
||||
return setup_module(extractor_module, config)
|
||||
|
||||
@pytest.mark.parametrize("url", [
|
||||
"https://www.instagram.com/p/",
|
||||
"https://www.instagram.com/p/1234567890/",
|
||||
"https://www.instagram.com/reel/1234567890/",
|
||||
"https://www.instagram.com/username/",
|
||||
"https://www.instagram.com/username/stories/",
|
||||
"https://www.instagram.com/username/highlights/",
|
||||
])
|
||||
def test_regex_matches(self, url):
|
||||
# post
|
||||
assert InstagramExtractor.valid_url.match(url)
|
||||
|
||||
@pytest.mark.parametrize("url", [
|
||||
"https://www.instagram.com/p/",
|
||||
"https://www.instagram.com/p/1234567890/",
|
||||
"https://www.instagram.com/reel/1234567890/",
|
||||
"https://www.instagram.com/username/",
|
||||
"https://www.instagram.com/username/stories/",
|
||||
"https://www.instagram.com/username/highlights/",
|
||||
])
|
||||
def test_regex_matches(url: str, instagram_extractor: InstagramExtractor) -> None:
|
||||
"""
|
||||
Ensure that the valid_url regex matches all provided Instagram URLs.
|
||||
"""
|
||||
assert instagram_extractor.valid_url.match(url)
|
||||
@@ -1,5 +1,5 @@
|
||||
import pytest
|
||||
from auto_archiver.modules.atlos_feeder import AtlosFeeder
|
||||
from auto_archiver.modules.atlos_feeder_db_storage import AtlosFeederDbStorage as AtlosFeeder
|
||||
|
||||
|
||||
class FakeAPIResponse:
|
||||
@@ -18,23 +18,26 @@ class FakeAPIResponse:
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def atlos_feeder(setup_module) -> AtlosFeeder:
|
||||
def atlos_feeder(setup_module, mocker) -> AtlosFeeder:
|
||||
"""Fixture for AtlosFeeder."""
|
||||
configs: dict = {
|
||||
"api_token": "abc123",
|
||||
"atlos_url": "https://platform.atlos.org",
|
||||
}
|
||||
return setup_module("atlos_feeder", configs)
|
||||
mocker.patch("requests.Session")
|
||||
atlos_feeder = setup_module("atlos_feeder_db_storage", configs)
|
||||
fake_session = mocker.MagicMock()
|
||||
# Configure the default response to have no results so that __iter__ terminates
|
||||
fake_session.get.return_value = FakeAPIResponse({"next": None, "results": []})
|
||||
atlos_feeder.session = fake_session
|
||||
return atlos_feeder
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_atlos_api(mocker):
|
||||
"""Fixture to mock requests to Atlos API."""
|
||||
def mock_atlos_api(atlos_feeder):
|
||||
"""Fixture to update the atlos_feeder.session.get side_effect."""
|
||||
def _mock_responses(responses):
|
||||
mocker.patch(
|
||||
"requests.get",
|
||||
side_effect=[FakeAPIResponse(data) for data in responses],
|
||||
)
|
||||
atlos_feeder.session.get.side_effect = [FakeAPIResponse(data) for data in responses]
|
||||
return _mock_responses
|
||||
|
||||
|
||||
@@ -100,9 +103,7 @@ def test_atlos_feeder_no_results(atlos_feeder, mock_atlos_api):
|
||||
|
||||
def test_atlos_feeder_http_error(atlos_feeder, mocker):
|
||||
"""Test raises an exception on HTTP error."""
|
||||
mocker.patch(
|
||||
"requests.get",
|
||||
return_value=FakeAPIResponse({"next": None, "results": []}, raise_error=True),
|
||||
)
|
||||
fake_response = FakeAPIResponse({"next": None, "results": []}, raise_error=True)
|
||||
atlos_feeder.session.get.side_effect = [fake_response]
|
||||
with pytest.raises(Exception, match="HTTP error"):
|
||||
list(atlos_feeder)
|
||||
|
||||
@@ -2,7 +2,7 @@ from typing import Type
|
||||
|
||||
import gspread
|
||||
import pytest
|
||||
from auto_archiver.modules.gsheet_feeder import GsheetsFeeder
|
||||
from auto_archiver.modules.gsheet_feeder_db import GsheetsFeederDB
|
||||
from auto_archiver.core import Metadata, Feeder
|
||||
|
||||
|
||||
@@ -11,13 +11,13 @@ def test_setup_without_sheet_and_sheet_id(setup_module, mocker):
|
||||
mocker.patch("gspread.service_account")
|
||||
with pytest.raises(ValueError):
|
||||
setup_module(
|
||||
"gsheet_feeder",
|
||||
"gsheet_feeder_db",
|
||||
{"service_account": "dummy.json", "sheet": None, "sheet_id": None},
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def gsheet_feeder(setup_module, mocker) -> GsheetsFeeder:
|
||||
def gsheet_feeder(setup_module, mocker) -> GsheetsFeederDB:
|
||||
config: dict = {
|
||||
"service_account": "dummy.json",
|
||||
"sheet": "test-auto-archiver",
|
||||
@@ -45,7 +45,7 @@ def gsheet_feeder(setup_module, mocker) -> GsheetsFeeder:
|
||||
}
|
||||
mocker.patch("gspread.service_account")
|
||||
feeder = setup_module(
|
||||
"gsheet_feeder",
|
||||
"gsheet_feeder_db",
|
||||
config
|
||||
)
|
||||
feeder.gsheets_client = mocker.MagicMock()
|
||||
@@ -90,7 +90,7 @@ class MockWorksheet:
|
||||
return matching.get(col_name, default)
|
||||
|
||||
|
||||
def test__process_rows(gsheet_feeder: GsheetsFeeder):
|
||||
def test__process_rows(gsheet_feeder: GsheetsFeederDB):
|
||||
testworksheet = MockWorksheet()
|
||||
metadata_items = list(gsheet_feeder._process_rows(testworksheet))
|
||||
assert len(metadata_items) == 3
|
||||
@@ -98,7 +98,7 @@ def test__process_rows(gsheet_feeder: GsheetsFeeder):
|
||||
assert metadata_items[0].get("url") == "http://example.com"
|
||||
|
||||
|
||||
def test__set_metadata(gsheet_feeder: GsheetsFeeder):
|
||||
def test__set_metadata(gsheet_feeder: GsheetsFeederDB):
|
||||
worksheet = MockWorksheet()
|
||||
metadata = Metadata()
|
||||
gsheet_feeder._set_context(metadata, worksheet, 1)
|
||||
@@ -106,12 +106,12 @@ def test__set_metadata(gsheet_feeder: GsheetsFeeder):
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Not recognising folder column")
|
||||
def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeeder, worksheet):
|
||||
def test__set_metadata_with_folder_pickled(gsheet_feeder: GsheetsFeederDB, worksheet):
|
||||
gsheet_feeder._set_context(worksheet, 7)
|
||||
assert Metadata.get_context("gsheet") == {"row": 1, "worksheet": worksheet}
|
||||
|
||||
|
||||
def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder):
|
||||
def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeederDB):
|
||||
testworksheet = MockWorksheet()
|
||||
metadata = Metadata()
|
||||
testworksheet.wks.title = "TestSheet"
|
||||
@@ -140,7 +140,7 @@ def test_open_sheet_with_name_or_id(
|
||||
|
||||
# Setup module with parameterized values
|
||||
feeder = setup_module(
|
||||
"gsheet_feeder",
|
||||
"gsheet_feeder_db",
|
||||
{"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id},
|
||||
)
|
||||
sheet_result = feeder.open_sheet()
|
||||
@@ -159,7 +159,7 @@ def test_open_sheet_with_sheet_id(setup_module, mocker):
|
||||
mock_service_account.return_value = mock_client
|
||||
mock_client.open_by_key.return_value = "MockSheet"
|
||||
feeder = setup_module(
|
||||
"gsheet_feeder",
|
||||
"gsheet_feeder_db",
|
||||
{"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"},
|
||||
)
|
||||
sheet = feeder.open_sheet()
|
||||
@@ -170,7 +170,7 @@ def test_open_sheet_with_sheet_id(setup_module, mocker):
|
||||
def test_should_process_sheet(setup_module, mocker):
|
||||
mocker.patch("gspread.service_account")
|
||||
gdb = setup_module(
|
||||
"gsheet_feeder",
|
||||
"gsheet_feeder_db",
|
||||
{
|
||||
"service_account": "dummy.json",
|
||||
"sheet": "TestSheet",
|
||||
@@ -187,10 +187,10 @@ def test_should_process_sheet(setup_module, mocker):
|
||||
|
||||
@pytest.mark.skip(reason="Requires a real connection")
|
||||
class TestGSheetsFeederReal:
|
||||
"""Testing GSheetsFeeder class"""
|
||||
"""Testing GsheetsFeeder class"""
|
||||
|
||||
module_name: str = "gsheet_feeder"
|
||||
feeder: GsheetsFeeder
|
||||
module_name: str = "gsheet_feeder_db"
|
||||
feeder: GsheetsFeederDB
|
||||
# You must follow the setup process explain in the docs for this to work
|
||||
config: dict = {
|
||||
"service_account": "secrets/service_account.json",
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Note this isn't a feeder, but contained as utility of the gsheet feeder module
|
||||
import pytest
|
||||
|
||||
from auto_archiver.modules.gsheet_feeder import GWorksheet
|
||||
from auto_archiver.modules.gsheet_feeder_db import GWorksheet
|
||||
|
||||
|
||||
class TestGWorksheet:
|
||||
|
||||
@@ -2,7 +2,7 @@ import os
|
||||
import hashlib
|
||||
import pytest
|
||||
from auto_archiver.core import Media, Metadata
|
||||
from auto_archiver.modules.atlos_storage import AtlosStorage
|
||||
from auto_archiver.modules.atlos_feeder_db_storage import AtlosFeederDbStorage as AtlosStorage
|
||||
|
||||
|
||||
class FakeAPIResponse:
|
||||
@@ -21,13 +21,19 @@ class FakeAPIResponse:
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def atlos_storage(setup_module) -> AtlosStorage:
|
||||
def atlos_storage(setup_module, mocker) -> AtlosStorage:
|
||||
"""Fixture for AtlosStorage."""
|
||||
configs: dict = {
|
||||
"api_token": "abc123",
|
||||
"atlos_url": "https://platform.atlos.org",
|
||||
}
|
||||
return setup_module("atlos_storage", configs)
|
||||
mocker.patch("requests.Session")
|
||||
atlos_feeder = setup_module("atlos_feeder_db_storage", configs)
|
||||
mock_session = mocker.MagicMock()
|
||||
# Configure the default response to have no results so that __iter__ terminates
|
||||
mock_session.get.return_value = FakeAPIResponse({"next": None, "results": []})
|
||||
atlos_feeder.session = mock_session
|
||||
return atlos_feeder
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -49,17 +55,6 @@ def test_get_cdn_url(atlos_storage: AtlosStorage) -> None:
|
||||
assert url == atlos_storage.atlos_url
|
||||
|
||||
|
||||
def test_hash(tmp_path, atlos_storage: AtlosStorage) -> None:
|
||||
"""Test _hash() computes the correct SHA-256 hash of a file."""
|
||||
content = b"hello world"
|
||||
file_path = tmp_path / "test.txt"
|
||||
file_path.write_bytes(content)
|
||||
media = Media(filename="dummy.mp4")
|
||||
media.filename = str(file_path)
|
||||
expected_hash = hashlib.sha256(content).hexdigest()
|
||||
assert atlos_storage._hash(media) == expected_hash
|
||||
|
||||
|
||||
def test_upload_no_atlos_id(tmp_path, atlos_storage: AtlosStorage, media: Media, mocker) -> None:
|
||||
"""Test upload() returns False when metadata lacks atlos_id."""
|
||||
metadata = Metadata() # atlos_id not set
|
||||
@@ -69,74 +64,49 @@ def test_upload_no_atlos_id(tmp_path, atlos_storage: AtlosStorage, media: Media,
|
||||
post_mock.assert_not_called()
|
||||
|
||||
|
||||
def test_upload_already_uploaded(atlos_storage: AtlosStorage,
|
||||
metadata: Metadata,
|
||||
media: Media,
|
||||
tmp_path,
|
||||
mocker) -> None:
|
||||
def test_upload_already_uploaded(atlos_storage: AtlosStorage, metadata: Metadata, media: Media, mocker) -> None:
|
||||
"""Test upload() returns True if media hash already exists."""
|
||||
content = b"media content"
|
||||
metadata.set("atlos_id", 101)
|
||||
media_hash = hashlib.sha256(content).hexdigest()
|
||||
fake_get = FakeAPIResponse({
|
||||
"result": {"artifacts": [{"file_hash_sha256": media_hash}]}
|
||||
})
|
||||
get_mock = mocker.patch("requests.get", return_value=fake_get)
|
||||
post_mock = mocker.patch("requests.post")
|
||||
fake_get_response = {"result": {"artifacts": [{"file_hash_sha256": media_hash}]}}
|
||||
get_mock = mocker.patch.object(atlos_storage, "_get", return_value=fake_get_response)
|
||||
post_mock = mocker.patch.object(atlos_storage, "_post")
|
||||
result = atlos_storage.upload(media, metadata)
|
||||
assert result is True
|
||||
get_mock.assert_called_once()
|
||||
post_mock.assert_not_called()
|
||||
|
||||
|
||||
def test_upload_not_uploaded(tmp_path, atlos_storage: AtlosStorage,
|
||||
metadata: Metadata,
|
||||
media: Media,
|
||||
mocker) -> None:
|
||||
def test_upload_not_uploaded(tmp_path, atlos_storage: AtlosStorage, metadata: Metadata, media: Media, mocker) -> None:
|
||||
"""Test upload() uploads media when not already present."""
|
||||
metadata.set("atlos_id", 202)
|
||||
fake_get = FakeAPIResponse({
|
||||
"result": {"artifacts": [{"file_hash_sha256": "different_hash"}]}
|
||||
})
|
||||
get_mock = mocker.patch("requests.get", return_value=fake_get)
|
||||
fake_post = FakeAPIResponse({}, raise_error=False)
|
||||
post_mock = mocker.patch("requests.post", return_value=fake_post)
|
||||
fake_get_response = {"result": {"artifacts": [{"file_hash_sha256": "different_hash"}]}}
|
||||
get_mock = mocker.patch.object(atlos_storage, "_get", return_value=fake_get_response)
|
||||
fake_post_response = {"result": "uploaded"}
|
||||
post_mock = mocker.patch.object(atlos_storage, "_post", return_value=fake_post_response)
|
||||
result = atlos_storage.upload(media, metadata)
|
||||
assert result is True
|
||||
|
||||
get_mock.assert_called_once()
|
||||
post_mock.assert_called_once()
|
||||
expected_url = f"{atlos_storage.atlos_url}/api/v2/source_material/upload/202"
|
||||
expected_endpoint = f"/api/v2/source_material/upload/202"
|
||||
call_args = post_mock.call_args[0]
|
||||
assert call_args[0] == expected_endpoint
|
||||
call_kwargs = post_mock.call_args[1]
|
||||
expected_headers = {"Authorization": f"Bearer {atlos_storage.api_token}"}
|
||||
expected_params = {"title": media.properties}
|
||||
call_kwargs = post_mock.call_args.kwargs
|
||||
assert call_kwargs["headers"] == expected_headers
|
||||
assert call_kwargs["params"] == expected_params
|
||||
# Verify the URL passed to requests.post.
|
||||
posted_url = call_kwargs.get("url") or post_mock.call_args.args[0]
|
||||
assert posted_url == expected_url
|
||||
# Verify files parameter contains the correct filename.
|
||||
file_tuple = call_kwargs["files"]["file"]
|
||||
assert file_tuple[0] == os.path.basename(media.filename)
|
||||
|
||||
|
||||
def test_upload_post_http_error(tmp_path,
|
||||
atlos_storage: AtlosStorage,
|
||||
metadata: Metadata,
|
||||
media: Media,
|
||||
mocker) -> None:
|
||||
def test_upload_post_http_error(tmp_path, atlos_storage: AtlosStorage, metadata: Metadata, media: Media, mocker) -> None:
|
||||
"""Test upload() propagates HTTP error during POST."""
|
||||
metadata.set("atlos_id", 303)
|
||||
fake_get = FakeAPIResponse({
|
||||
"result": {"artifacts": []}
|
||||
})
|
||||
mocker.patch("requests.get", return_value=fake_get)
|
||||
fake_post = FakeAPIResponse({}, raise_error=True)
|
||||
mocker.patch("requests.post", return_value=fake_post)
|
||||
fake_get_response = {"result": {"artifacts": []}}
|
||||
mocker.patch.object(atlos_storage, "_get", return_value=fake_get_response)
|
||||
mocker.patch.object(atlos_storage, "_post", side_effect=Exception("HTTP error"))
|
||||
with pytest.raises(Exception, match="HTTP error"):
|
||||
atlos_storage.upload(media, metadata)
|
||||
|
||||
|
||||
def test_uploadf_not_implemented(atlos_storage: AtlosStorage) -> None:
|
||||
"""Test uploadf() returns None (not implemented)."""
|
||||
result = atlos_storage.uploadf(None, "dummy")
|
||||
assert result is None
|
||||
|
||||
@@ -78,7 +78,7 @@ def test_help(orchestrator, basic_parser, capsys):
|
||||
assert "--logging.level" in logs
|
||||
|
||||
# individual module configs
|
||||
assert "--gsheet_feeder.sheet_id" in logs
|
||||
assert "--gsheet_feeder_db.sheet_id" in logs
|
||||
|
||||
|
||||
def test_add_custom_modules_path(orchestrator, test_args):
|
||||
@@ -154,22 +154,22 @@ def test_load_modules_from_commandline(orchestrator, test_args):
|
||||
assert orchestrator.formatters[0].name == "example_module"
|
||||
|
||||
def test_load_settings_for_module_from_commandline(orchestrator, test_args):
|
||||
args = test_args + ["--feeders", "gsheet_feeder", "--gsheet_feeder.sheet_id", "123", "--gsheet_feeder.service_account", "tests/data/test_service_account.json"]
|
||||
args = test_args + ["--feeders", "gsheet_feeder_db", "--gsheet_feeder_db.sheet_id", "123", "--gsheet_feeder_db.service_account", "tests/data/test_service_account.json"]
|
||||
|
||||
orchestrator.setup(args)
|
||||
|
||||
assert len(orchestrator.feeders) == 1
|
||||
assert orchestrator.feeders[0].name == "gsheet_feeder"
|
||||
assert orchestrator.config['gsheet_feeder']['sheet_id'] == "123"
|
||||
assert orchestrator.feeders[0].name == "gsheet_feeder_db"
|
||||
assert orchestrator.config['gsheet_feeder_db']['sheet_id'] == "123"
|
||||
|
||||
|
||||
def test_multiple_orchestrator(test_args):
|
||||
|
||||
o1_args = test_args + ["--feeders", "gsheet_feeder", "--gsheet_feeder.service_account", "tests/data/test_service_account.json"]
|
||||
o1_args = test_args + ["--feeders", "gsheet_feeder_db", "--gsheet_feeder_db.service_account", "tests/data/test_service_account.json"]
|
||||
o1 = ArchivingOrchestrator()
|
||||
|
||||
with pytest.raises(ValueError) as exit_error:
|
||||
# this should fail because the gsheet_feeder requires a sheet_id / sheet
|
||||
# this should fail because the gsheet_feeder_db requires a sheet_id / sheet
|
||||
o1.setup(o1_args)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user