mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
Compare commits
61 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b7c69c0f0d | ||
|
|
c98991cdfb | ||
|
|
45b982ec38 | ||
|
|
e11be449e8 | ||
|
|
134bf09257 | ||
|
|
417ca9ef51 | ||
|
|
5b79dcb80c | ||
|
|
52d7b4a016 | ||
|
|
31f6aae7b9 | ||
|
|
26373d4545 | ||
|
|
7a34915f8e | ||
|
|
b67a7b818a | ||
|
|
2e63cb8411 | ||
|
|
9cb73c073f | ||
|
|
9d078a648f | ||
|
|
e150370657 | ||
|
|
4116c90168 | ||
|
|
2c5b115fbe | ||
|
|
bda812f850 | ||
|
|
ac82764ffc | ||
|
|
0fae7d96fb | ||
|
|
2f7181ced6 | ||
|
|
9c25b33f1c | ||
|
|
ae3e607705 | ||
|
|
c1a60fde8a | ||
|
|
875e1de589 | ||
|
|
8f3d4e05c3 | ||
|
|
3bd6bed825 | ||
|
|
2659675f06 | ||
|
|
9d44f4b207 | ||
|
|
5b0bff612e | ||
|
|
ae7ceba0e5 | ||
|
|
97821a81bc | ||
|
|
9191b38cf2 | ||
|
|
567edfc35e | ||
|
|
8c22a9df72 | ||
|
|
d2d6db162b | ||
|
|
5cfbcc0137 | ||
|
|
5fdaa6c739 | ||
|
|
3d389ee05b | ||
|
|
0ecbed0df0 | ||
|
|
69bcfea2eb | ||
|
|
2e2e695444 | ||
|
|
493055a8d9 | ||
|
|
6f6eb2db7a | ||
|
|
906ed0f6e0 | ||
|
|
39818e648a | ||
|
|
2bbf534d67 | ||
|
|
6be7536fad | ||
|
|
0654e8c5c6 | ||
|
|
0e3c427371 | ||
|
|
7497bc08c0 | ||
|
|
49863768fe | ||
|
|
7b9483bbf9 | ||
|
|
cd81cae559 | ||
|
|
23894fad51 | ||
|
|
876988b587 | ||
|
|
f95293b84b | ||
|
|
2fbcbe4e8b | ||
|
|
d1e4574c6c | ||
|
|
d347b26d37 |
15
.github/workflows/docker-publish.yaml
vendored
15
.github/workflows/docker-publish.yaml
vendored
@@ -26,6 +26,14 @@ jobs:
|
|||||||
steps:
|
steps:
|
||||||
- name: Check out the repo
|
- name: Check out the repo
|
||||||
uses: actions/checkout@v3
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Set up QEMU
|
||||||
|
uses: docker/setup-qemu-action@v1
|
||||||
|
# https://github.com/docker/setup-buildx-action
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
id: buildx
|
||||||
|
uses: docker/setup-buildx-action@v1
|
||||||
|
|
||||||
- name: Log in to Docker Hub
|
- name: Log in to Docker Hub
|
||||||
uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
|
uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
|
||||||
@@ -40,9 +48,10 @@ jobs:
|
|||||||
images: bellingcat/auto-archiver
|
images: bellingcat/auto-archiver
|
||||||
|
|
||||||
- name: Build and push Docker image
|
- name: Build and push Docker image
|
||||||
uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
|
uses: docker/build-push-action@v2
|
||||||
with:
|
with:
|
||||||
context: .
|
context: .
|
||||||
push: true
|
platforms: linux/amd64,linux/arm64
|
||||||
|
push: ${{ github.event_name != 'pull_request' }}
|
||||||
tags: ${{ steps.meta.outputs.tags }}
|
tags: ${{ steps.meta.outputs.tags }}
|
||||||
labels: ${{ steps.meta.outputs.labels }}
|
labels: ${{ steps.meta.outputs.labels }}
|
||||||
|
|||||||
33
Dockerfile
33
Dockerfile
@@ -1,35 +1,36 @@
|
|||||||
# stage 1 - all dependencies
|
FROM webrecorder/browsertrix-crawler:latest
|
||||||
From python:3.10
|
|
||||||
|
ENV RUNNING_IN_DOCKER=1
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
# TODO: use custom ffmpeg builds instead of apt-get install
|
# TODO: use custom ffmpeg builds instead of apt-get install
|
||||||
RUN pip install --upgrade pip && \
|
RUN pip install --upgrade pip && \
|
||||||
pip install pipenv && \
|
pip install pipenv && \
|
||||||
|
add-apt-repository ppa:mozillateam/ppa && \
|
||||||
apt-get update && \
|
apt-get update && \
|
||||||
apt-get install -y gcc ffmpeg fonts-noto firefox-esr && \
|
apt-get install -y gcc ffmpeg fonts-noto && \
|
||||||
wget https://github.com/mozilla/geckodriver/releases/download/v0.32.0/geckodriver-v0.32.0-linux64.tar.gz && \
|
apt-get install -y --no-install-recommends firefox-esr && \
|
||||||
|
ln -s /usr/bin/firefox-esr /usr/bin/firefox && \
|
||||||
|
wget https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-linux64.tar.gz && \
|
||||||
tar -xvzf geckodriver* -C /usr/local/bin && \
|
tar -xvzf geckodriver* -C /usr/local/bin && \
|
||||||
chmod +x /usr/local/bin/geckodriver && \
|
chmod +x /usr/local/bin/geckodriver && \
|
||||||
rm geckodriver-v*
|
rm geckodriver-v*
|
||||||
|
|
||||||
|
|
||||||
# install docker for WACZ
|
|
||||||
# TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66
|
|
||||||
# RUN curl -fsSL https://get.docker.com | sh
|
|
||||||
|
|
||||||
# TODO: avoid copying unnecessary files, including .git
|
# TODO: avoid copying unnecessary files, including .git
|
||||||
COPY Pipfile Pipfile.lock ./
|
COPY Pipfile* ./
|
||||||
RUN pipenv install --python=3.10 --system --deploy
|
RUN pipenv install
|
||||||
# ENV IS_DOCKER=1
|
|
||||||
# doing this at the end helps during development, builds are quick
|
# doing this at the end helps during development, builds are quick
|
||||||
COPY ./src/ .
|
COPY ./src/ .
|
||||||
|
|
||||||
# TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
|
# TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
|
||||||
# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
|
# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
|
||||||
# USER archiver
|
# USER archiver
|
||||||
ENTRYPOINT ["python"]
|
|
||||||
# ENTRYPOINT ["docker-entrypoint.sh"]
|
|
||||||
|
|
||||||
# should be executed with 2 volumes (3 if local_storage)
|
|
||||||
# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --help
|
ENTRYPOINT ["pipenv", "run", "python3", "-m", "auto_archiver"]
|
||||||
|
|
||||||
|
# should be executed with 2 volumes (3 if local_storage is used)
|
||||||
|
# docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml
|
||||||
10
Pipfile
10
Pipfile
@@ -14,7 +14,6 @@ loguru = "*"
|
|||||||
ffmpeg-python = "*"
|
ffmpeg-python = "*"
|
||||||
selenium = "*"
|
selenium = "*"
|
||||||
snscrape = "*"
|
snscrape = "*"
|
||||||
yt-dlp = "*"
|
|
||||||
telethon = "*"
|
telethon = "*"
|
||||||
google-api-python-client = "*"
|
google-api-python-client = "*"
|
||||||
google-auth-httplib2 = "*"
|
google-auth-httplib2 = "*"
|
||||||
@@ -23,16 +22,21 @@ oauth2client = "*"
|
|||||||
python-slugify = "*"
|
python-slugify = "*"
|
||||||
pyyaml = "*"
|
pyyaml = "*"
|
||||||
dateparser = "*"
|
dateparser = "*"
|
||||||
vk-url-scraper = "*"
|
|
||||||
python-twitter-v2 = "*"
|
python-twitter-v2 = "*"
|
||||||
instaloader = "*"
|
instaloader = "*"
|
||||||
tqdm = "*"
|
tqdm = "*"
|
||||||
jinja2 = "*"
|
jinja2 = "*"
|
||||||
cryptography = "==38.0.4"
|
cryptography = "==38.0.4"
|
||||||
dataclasses-json = "*"
|
dataclasses-json = "*"
|
||||||
|
yt-dlp = ">=2023.2.17"
|
||||||
|
vk-url-scraper = "*"
|
||||||
|
uwsgi = "*"
|
||||||
|
requests = {extras = ["socks"], version = "*"}
|
||||||
|
# wacz = "==0.4.8"
|
||||||
|
pywb = ">=2.7.3"
|
||||||
|
|
||||||
[requires]
|
[requires]
|
||||||
python_version = "3.9"
|
python_version = "3.10"
|
||||||
|
|
||||||
[dev-packages]
|
[dev-packages]
|
||||||
autopep8 = "*"
|
autopep8 = "*"
|
||||||
|
|||||||
1607
Pipfile.lock
generated
1607
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
26
README.md
26
README.md
@@ -33,7 +33,7 @@ Docker works like a virtual machine running inside your computer, it isolates ev
|
|||||||
|
|
||||||
1. install [docker](https://docs.docker.com/get-docker/)
|
1. install [docker](https://docs.docker.com/get-docker/)
|
||||||
2. pull the auto-archiver docker [image](https://hub.docker.com/r/bellingcat/auto-archiver) with `docker pull bellingcat/auto-archiver`
|
2. pull the auto-archiver docker [image](https://hub.docker.com/r/bellingcat/auto-archiver) with `docker pull bellingcat/auto-archiver`
|
||||||
3. run the docker image locally in a container: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver -m auto_archiver --config secrets/orchestration.yaml` breaking this command down:
|
3. run the docker image locally in a container: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml` breaking this command down:
|
||||||
1. `docker run` tells docker to start a new container (an instance of the image)
|
1. `docker run` tells docker to start a new container (an instance of the image)
|
||||||
2. `--rm` makes sure this container is removed after execution (less garbage locally)
|
2. `--rm` makes sure this container is removed after execution (less garbage locally)
|
||||||
3. `-v $PWD/secrets:/app/secrets` - your secrets folder
|
3. `-v $PWD/secrets:/app/secrets` - your secrets folder
|
||||||
@@ -87,11 +87,9 @@ The archiver work is orchestrated by the following workflow (we call each a **st
|
|||||||
4. **Formatter** creates a report from all the archived content (HTML, PDF, ...)
|
4. **Formatter** creates a report from all the archived content (HTML, PDF, ...)
|
||||||
5. **Database** knows what's been archived and also stores the archive result (spreadsheet, CSV, or just the console)
|
5. **Database** knows what's been archived and also stores the archive result (spreadsheet, CSV, or just the console)
|
||||||
|
|
||||||
To check all available steps (which archivers, storages, databses, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
|
To set up an auto-archiver instance, create an `orchestration.yaml` which contains the workflow you would like. We advise you to put this file into a `secrets/` folder and not share it with others because it will contain passwords and other secrets.
|
||||||
|
|
||||||
The great thing is you configure all the workflow in your `orchestration.yaml` file which we advise you put into a `secrets/` folder and don't share it with others because it will contain passwords and other secrets.
|
The structure of orchestration file is split into 2 parts: `steps` (what **steps** to use) and `configurations` (how those steps should behave), here's a simplification:
|
||||||
|
|
||||||
The structure of orchestration file is split into 2 parts: `steps` (what **steps** to use) and `configs` (how those steps should behave), here's a simplification:
|
|
||||||
```yaml
|
```yaml
|
||||||
# orchestration.yaml content
|
# orchestration.yaml content
|
||||||
steps:
|
steps:
|
||||||
@@ -113,10 +111,12 @@ configurations:
|
|||||||
# ... configurations for the other steps here ...
|
# ... configurations for the other steps here ...
|
||||||
```
|
```
|
||||||
|
|
||||||
|
To see all available `steps` (archivers, storages, databases, ...), check the [example.orchestration.yaml](example.orchestration.yaml).
|
||||||
|
|
||||||
All the `configurations` in the `orchestration.yaml` file (you can name it differently but need to pass it in the `--config FILENAME` argument) can be seen in the console by using the `--help` flag. They can also be overwritten, for example if you are using the `cli_feeder` to archive from the command line and want to provide the URLs you should do:
|
All the `configurations` in the `orchestration.yaml` file (you can name it differently but need to pass it in the `--config FILENAME` argument) can be seen in the console by using the `--help` flag. They can also be overwritten, for example if you are using the `cli_feeder` to archive from the command line and want to provide the URLs you should do:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
auto-archiver --config orchestration.yaml --cli_feeder.urls="url1,url2,url3"
|
auto-archiver --config secrets/orchestration.yaml --cli_feeder.urls="url1,url2,url3"
|
||||||
```
|
```
|
||||||
|
|
||||||
Here's the complete workflow that the auto-archiver goes through:
|
Here's the complete workflow that the auto-archiver goes through:
|
||||||
@@ -153,11 +153,11 @@ These assume you've installed with pipenv, see docker section above for how to r
|
|||||||
# all the configurations come from ./orchestration.yaml
|
# all the configurations come from ./orchestration.yaml
|
||||||
auto-archiver
|
auto-archiver
|
||||||
# all the configurations come from ./secrets/orchestration.yaml
|
# all the configurations come from ./secrets/orchestration.yaml
|
||||||
auto-archiver --config orchestration.yaml
|
auto-archiver --config secrets/orchestration.yaml
|
||||||
# uses the configurations but for another google docs sheet
|
# uses the same configurations but for another google docs sheet
|
||||||
# with a header on row 2 and with some different column names
|
# with a header on row 2 and with some different column names
|
||||||
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
|
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
|
||||||
auto-archiver --config orchestration.yaml --gsheets_feeder.sheet="use it on another sheets doc" --gsheets_feeder.header=2 --gsheets_feeder.columns='{"url": "link"}'
|
auto-archiver --config orchestration.yaml --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
|
||||||
# all the configurations come from orchestration.yaml and specifies that s3 files should be private
|
# all the configurations come from orchestration.yaml and specifies that s3 files should be private
|
||||||
auto-archiver --s3_storage.private=1
|
auto-archiver --s3_storage.private=1
|
||||||
```
|
```
|
||||||
@@ -166,11 +166,11 @@ auto-archiver --s3_storage.private=1
|
|||||||
#### Google Drive
|
#### Google Drive
|
||||||
To use Google Drive storage you need the id of the shared folder in the `config.yaml` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com` and then you can use `--storage=gd`
|
To use Google Drive storage you need the id of the shared folder in the `config.yaml` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com` and then you can use `--storage=gd`
|
||||||
|
|
||||||
#### Telethon (Telegrams API Library)
|
#### Telethon + Instagram with telegram bot
|
||||||
The first time you run, you will be prompted to do a authentication with the phone number associated, alternatively you can put your `anon.session` in the root.
|
The first time you run, you will be prompted to authenticate with the associated phone number; alternatively, you can put your `anon.session` in the root.
|
||||||
|
|
||||||
|
|
||||||
## Running on Google Sheets Feeder (gsheets_feeder)
|
## Running on Google Sheets Feeder (gsheet_feeder)
|
||||||
The `--gseets_feeder.sheet` property is the name of the Google Sheet to check for URLs.
|
The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs.
|
||||||
This sheet must have been shared with the Google Service account used by `gspread`.
|
This sheet must have been shared with the Google Service account used by `gspread`.
|
||||||
This sheet must also have specific columns (case-insensitive) in the `header` row - see [Gsheet.configs](src/auto_archiver/utils/gsheet.py) for all their names.
|
This sheet must also have specific columns (case-insensitive) in the `header` row - see [Gsheet.configs](src/auto_archiver/utils/gsheet.py) for all their names.
|
||||||
@@ -183,7 +183,7 @@ When the auto archiver starts running, it updates the "Archive status" column.
|
|||||||

|

|
||||||
The links are downloaded and archived, and the spreadsheet is updated to the following:
|
The links are downloaded and archived, and the spreadsheet is updated to the following:
|
||||||

|

|
||||||
Note that the first row is skipped, as it is assumed to be a header row (`--gsheets_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
|
Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
|
||||||
|
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -193,7 +193,7 @@ Use `python -m src.auto_archiver --config secrets/orchestration.yaml` to run fro
|
|||||||
#### Docker development
|
#### Docker development
|
||||||
working with docker locally:
|
working with docker locally:
|
||||||
* `docker build . -t auto-archiver` to build a local image
|
* `docker build . -t auto-archiver` to build a local image
|
||||||
* `docker run --rm -v $PWD/secrets:/app/secrets aa --config secrets/config.yaml`
|
* `docker run --rm -v $PWD/secrets:/app/secrets aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml`
|
||||||
* to use local archive, also create a volume `-v` for it by adding `-v $PWD/local_archive:/app/local_archive`
|
* to use local archive, also create a volume `-v` for it by adding `-v $PWD/local_archive:/app/local_archive`
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,22 +1,21 @@
|
|||||||
steps:
|
steps:
|
||||||
# only 1 feeder allowed
|
# only 1 feeder allowed
|
||||||
# feeder: cli_feeder # default feeder
|
feeder: gsheet_feeder # defaults to cli_feeder
|
||||||
feeder: gsheet_feeder # default -> only expects URL from CLI
|
archivers: # order matters, uncomment to activate
|
||||||
archivers: # order matters
|
|
||||||
# - vk_archiver
|
# - vk_archiver
|
||||||
# - telethon_archiver
|
# - telethon_archiver
|
||||||
# - telegram_archiver
|
# - telegram_archiver
|
||||||
# - twitter_archiver
|
# - twitter_archiver
|
||||||
# - twitter_api_archiver
|
# - twitter_api_archiver
|
||||||
# - instagram_archiver
|
|
||||||
# - instagram_tbot_archiver
|
# - instagram_tbot_archiver
|
||||||
|
# - instagram_archiver
|
||||||
# - tiktok_archiver
|
# - tiktok_archiver
|
||||||
- youtubedl_archiver
|
- youtubedl_archiver
|
||||||
# - wayback_archiver_enricher
|
- wayback_archiver_enricher
|
||||||
enrichers:
|
enrichers:
|
||||||
- hash_enricher
|
- hash_enricher
|
||||||
- screenshot_enricher
|
# - screenshot_enricher
|
||||||
- thumbnail_enricher
|
# - thumbnail_enricher
|
||||||
# - wayback_archiver_enricher
|
# - wayback_archiver_enricher
|
||||||
# - wacz_enricher
|
# - wacz_enricher
|
||||||
|
|
||||||
@@ -26,16 +25,18 @@ steps:
|
|||||||
# - s3_storage
|
# - s3_storage
|
||||||
# - gdrive_storage
|
# - gdrive_storage
|
||||||
databases:
|
databases:
|
||||||
# - console_db
|
- console_db
|
||||||
# - csv_db
|
# - csv_db
|
||||||
- gsheet_db
|
# - gsheet_db
|
||||||
# - mongo_db
|
# - mongo_db
|
||||||
|
|
||||||
configurations:
|
configurations:
|
||||||
gsheet_feeder:
|
gsheet_feeder:
|
||||||
sheet: auto-archiver-test
|
sheet: "your sheet name"
|
||||||
header: 2 # defaults to 1 in GSheetsFeeder
|
header: 1
|
||||||
service_account: "secrets/service_account.json"
|
service_account: "secrets/service_account.json"
|
||||||
|
# allow_worksheets: "only parse this worksheet"
|
||||||
|
# block_worksheets: "blocked sheet 1,blocked sheet 2"
|
||||||
use_sheet_names_in_stored_paths: false
|
use_sheet_names_in_stored_paths: false
|
||||||
columns:
|
columns:
|
||||||
url: link
|
url: link
|
||||||
@@ -44,36 +45,77 @@ configurations:
|
|||||||
archive: archive location
|
archive: archive location
|
||||||
date: archive date
|
date: archive date
|
||||||
thumbnail: thumbnail
|
thumbnail: thumbnail
|
||||||
thumbnail_index: thumbnail index
|
|
||||||
timestamp: upload timestamp
|
timestamp: upload timestamp
|
||||||
title: upload title
|
title: upload title
|
||||||
text: textual content
|
text: textual content
|
||||||
duration: duration
|
|
||||||
screenshot: screenshot
|
screenshot: screenshot
|
||||||
hash: hash
|
hash: hash
|
||||||
wacz: wacz
|
wacz: wacz
|
||||||
replaywebpage: replaywebpage
|
replaywebpage: replaywebpage
|
||||||
|
instagram_tbot_archiver:
|
||||||
|
api_id: "TELEGRAM_BOT_API_ID"
|
||||||
|
api_hash: "TELEGRAM_BOT_API_HASH"
|
||||||
|
# session_file: "secrets/anon"
|
||||||
|
telethon_archiver:
|
||||||
|
api_id: "TELEGRAM_BOT_API_ID"
|
||||||
|
api_hash: "TELEGRAM_BOT_API_HASH"
|
||||||
|
# session_file: "secrets/anon"
|
||||||
|
join_channels: false
|
||||||
|
channel_invites: # if you want to archive from private channels
|
||||||
|
- invite: https://t.me/+123456789
|
||||||
|
id: 0000000001
|
||||||
|
- invite: https://t.me/+123456788
|
||||||
|
id: 0000000002
|
||||||
|
|
||||||
|
twitter_api_archiver:
|
||||||
|
# either bearer_token only
|
||||||
|
bearer_token: "TWITTER_BEARER_TOKEN"
|
||||||
|
# OR all of the below
|
||||||
|
# consumer_key: ""
|
||||||
|
# consumer_secret: ""
|
||||||
|
# access_token: ""
|
||||||
|
# access_secret: ""
|
||||||
|
instagram_archiver:
|
||||||
|
username: "INSTAGRAM_USERNAME"
|
||||||
|
password: "INSTAGRAM_PASSWORD"
|
||||||
|
# session_file: "secrets/instaloader.session"
|
||||||
|
|
||||||
|
vk_archiver:
|
||||||
|
username: "or phone number"
|
||||||
|
password: "vk pass"
|
||||||
|
session_file: "secrets/vk_config.v2.json"
|
||||||
|
|
||||||
screenshot_enricher:
|
screenshot_enricher:
|
||||||
width: 1280
|
width: 1280
|
||||||
height: 2300
|
height: 2300
|
||||||
wayback_archiver_enricher:
|
wayback_archiver_enricher:
|
||||||
timeout: 10
|
timeout: 10
|
||||||
key: ""
|
key: "wayback key"
|
||||||
secret: ""
|
secret: "wayback secret"
|
||||||
hash_enricher:
|
hash_enricher:
|
||||||
algorithm: "SHA3-512"
|
algorithm: "SHA3-512" # can also be SHA-256
|
||||||
# wacz:
|
wacz_enricher:
|
||||||
# profile: secrets/profile.tar.gz
|
profile: secrets/profile.tar.gz
|
||||||
local_storage:
|
local_storage:
|
||||||
save_to: "./local_archive"
|
save_to: "./local_archive"
|
||||||
save_absolute: true
|
save_absolute: true
|
||||||
filename_generator: static
|
filename_generator: static
|
||||||
path_generator: flat
|
path_generator: flat
|
||||||
|
s3_storage:
|
||||||
|
bucket: your-bucket-name
|
||||||
|
region: reg1
|
||||||
|
key: S3_KEY
|
||||||
|
secret: S3_SECRET
|
||||||
|
endpoint_url: "https://{region}.digitaloceanspaces.com"
|
||||||
|
cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
|
||||||
|
# if private:true S3 urls will not be readable online
|
||||||
|
private: false
|
||||||
|
# with 'random' you can generate a random UUID for the URL instead of a predictable path, useful to still have public but unlisted files; the alternative is 'default', or omitting this setting from the config
|
||||||
|
key_path: random
|
||||||
|
|
||||||
gdrive_storage:
|
gdrive_storage:
|
||||||
path_generator: url
|
path_generator: url
|
||||||
filename_generator: random
|
filename_generator: random
|
||||||
root_folder_id: TODO
|
root_folder_id: folder_id_from_url
|
||||||
oauth_token: secrets/gd-token.json
|
oauth_token: secrets/gd-token.json # needs to be generated with scripts/create_update_gdrive_oauth_token.py
|
||||||
service_account: "secrets/service_account.json"
|
service_account: "secrets/service_account.json"
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ from googleapiclient.errors import HttpError
|
|||||||
# You can run this code to get a new token and verify it belongs to the correct user
|
# You can run this code to get a new token and verify it belongs to the correct user
|
||||||
# This token will be refresh automatically by the auto-archiver
|
# This token will be refreshed automatically by the auto-archiver
|
||||||
# Code below from https://developers.google.com/drive/api/quickstart/python
|
# Code below from https://developers.google.com/drive/api/quickstart/python
|
||||||
|
# Example invocation: py scripts/create_update_gdrive_oauth_token.py -c secrets/credentials.json -t secrets/gd-token.json
|
||||||
|
|
||||||
SCOPES = ['https://www.googleapis.com/auth/drive']
|
SCOPES = ['https://www.googleapis.com/auth/drive']
|
||||||
|
|
||||||
|
|||||||
@@ -3,8 +3,8 @@ from abc import abstractmethod
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import os
|
import os
|
||||||
import mimetypes, requests
|
import mimetypes, requests
|
||||||
from ..core import Metadata
|
|
||||||
from ..core import Step
|
from ..core import Metadata, Step, ArchivingContext
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -51,7 +51,7 @@ class Archiver(Step):
|
|||||||
if len(to_filename) > 64:
|
if len(to_filename) > 64:
|
||||||
to_filename = to_filename[-64:]
|
to_filename = to_filename[-64:]
|
||||||
if item:
|
if item:
|
||||||
to_filename = os.path.join(item.get_tmp_dir(), to_filename)
|
to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,14 +2,14 @@
|
|||||||
from telethon.sync import TelegramClient
|
from telethon.sync import TelegramClient
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
import time, os
|
import time, os
|
||||||
|
from sqlite3 import OperationalError
|
||||||
from . import Archiver
|
from . import Archiver
|
||||||
from ..core import Metadata, Media
|
from ..core import Metadata, Media, ArchivingContext
|
||||||
|
|
||||||
|
|
||||||
class InstagramTbotArchiver(Archiver):
|
class InstagramTbotArchiver(Archiver):
|
||||||
"""
|
"""
|
||||||
calls a telegram bot to fetch instagram posts/stories...
|
calls a telegram bot to fetch instagram posts/stories... and gets available media from it
|
||||||
https://github.com/adw0rd/instagrapi
|
https://github.com/adw0rd/instagrapi
|
||||||
https://t.me/instagram_load_bot
|
https://t.me/instagram_load_bot
|
||||||
"""
|
"""
|
||||||
@@ -20,15 +20,18 @@ class InstagramTbotArchiver(Archiver):
|
|||||||
self.assert_valid_string("api_id")
|
self.assert_valid_string("api_id")
|
||||||
self.assert_valid_string("api_hash")
|
self.assert_valid_string("api_hash")
|
||||||
self.timeout = int(self.timeout)
|
self.timeout = int(self.timeout)
|
||||||
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
|
try:
|
||||||
|
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
|
||||||
|
except OperationalError as e:
|
||||||
|
logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_archiver. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def configs() -> dict:
|
def configs() -> dict:
|
||||||
return {
|
return {
|
||||||
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
||||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||||
"session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
"session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
||||||
"timeout": {"default": 15, "help": "timeout to fetch the instagram content in seconds."},
|
"timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
|
||||||
}
|
}
|
||||||
|
|
||||||
def setup(self) -> None:
|
def setup(self) -> None:
|
||||||
@@ -41,26 +44,33 @@ class InstagramTbotArchiver(Archiver):
|
|||||||
if not "instagram.com" in url: return False
|
if not "instagram.com" in url: return False
|
||||||
|
|
||||||
result = Metadata()
|
result = Metadata()
|
||||||
tmp_dir = item.get_tmp_dir()
|
tmp_dir = ArchivingContext.get_tmp_dir()
|
||||||
with self.client.start():
|
with self.client.start():
|
||||||
chat = self.client.get_entity("instagram_load_bot")
|
chat = self.client.get_entity("instagram_load_bot")
|
||||||
since_id = self.client.send_message(entity=chat, message=url).id
|
since_id = self.client.send_message(entity=chat, message=url).id
|
||||||
|
|
||||||
attempts = 0
|
attempts = 0
|
||||||
media = None
|
seen_media = []
|
||||||
message = ""
|
message = ""
|
||||||
time.sleep(4)
|
time.sleep(3)
|
||||||
while attempts < self.timeout and (not message or not media):
|
# media is added before text by the bot so it can be used as a stop-logic mechanism
|
||||||
|
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
|
||||||
attempts += 1
|
attempts += 1
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
for post in self.client.iter_messages(chat, min_id=since_id):
|
for post in self.client.iter_messages(chat, min_id=since_id):
|
||||||
since_id = max(since_id, post.id)
|
since_id = max(since_id, post.id)
|
||||||
if post.media and not media:
|
if post.media and post.id not in seen_media:
|
||||||
filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
|
filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
|
||||||
media = self.client.download_media(post.media, filename_dest)
|
media = self.client.download_media(post.media, filename_dest)
|
||||||
if media: result.add_media(Media(media))
|
if media:
|
||||||
|
result.add_media(Media(media))
|
||||||
|
seen_media.append(post.id)
|
||||||
if post.message: message += post.message
|
if post.message: message += post.message
|
||||||
|
|
||||||
|
if "You must enter a URL to a post" in message:
|
||||||
|
logger.debug(f"invalid link {url=} for {self.name}: {message}")
|
||||||
|
return False
|
||||||
|
|
||||||
if message:
|
if message:
|
||||||
result.set_content(message).set_title(message[:128])
|
result.set_content(message).set_title(message[:128])
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ from tqdm import tqdm
|
|||||||
import re, time, json, os
|
import re, time, json, os
|
||||||
|
|
||||||
from . import Archiver
|
from . import Archiver
|
||||||
from ..core import Metadata, Media
|
from ..core import Metadata, Media, ArchivingContext
|
||||||
|
|
||||||
|
|
||||||
class TelethonArchiver(Archiver):
|
class TelethonArchiver(Archiver):
|
||||||
@@ -128,7 +128,7 @@ class TelethonArchiver(Archiver):
|
|||||||
media_posts = self._get_media_posts_in_group(chat, post)
|
media_posts = self._get_media_posts_in_group(chat, post)
|
||||||
logger.debug(f'got {len(media_posts)=} for {url=}')
|
logger.debug(f'got {len(media_posts)=} for {url=}')
|
||||||
|
|
||||||
tmp_dir = item.get_tmp_dir()
|
tmp_dir = ArchivingContext.get_tmp_dir()
|
||||||
|
|
||||||
group_id = post.grouped_id if post.grouped_id is not None else post.id
|
group_id = post.grouped_id if post.grouped_id is not None else post.id
|
||||||
title = post.message
|
title = post.message
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ import tiktok_downloader
|
|||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from . import Archiver
|
from . import Archiver
|
||||||
from ..core import Metadata, Media
|
from ..core import Metadata, Media, ArchivingContext
|
||||||
|
|
||||||
|
|
||||||
class TiktokArchiver(Archiver):
|
class TiktokArchiver(Archiver):
|
||||||
@@ -41,7 +41,7 @@ class TiktokArchiver(Archiver):
|
|||||||
logger.warning(f'Other Tiktok error {error}')
|
logger.warning(f'Other Tiktok error {error}')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
filename = os.path.join(item.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4')
|
filename = os.path.join(ArchivingContext.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4')
|
||||||
tiktok_media = tiktok_downloader.snaptik(url).get_media()
|
tiktok_media = tiktok_downloader.snaptik(url).get_media()
|
||||||
|
|
||||||
if len(tiktok_media) <= 0:
|
if len(tiktok_media) <= 0:
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper
|
|||||||
|
|
||||||
from ..utils.misc import dump_payload
|
from ..utils.misc import dump_payload
|
||||||
from . import Archiver
|
from . import Archiver
|
||||||
from ..core import Metadata, Media
|
from ..core import Metadata, Media, ArchivingContext
|
||||||
|
|
||||||
|
|
||||||
class VkArchiver(Archiver):
|
class VkArchiver(Archiver):
|
||||||
@@ -50,7 +50,7 @@ class VkArchiver(Archiver):
|
|||||||
|
|
||||||
result.set_content(dump_payload(vk_scrapes))
|
result.set_content(dump_payload(vk_scrapes))
|
||||||
|
|
||||||
filenames = self.vks.download_media(vk_scrapes, item.get_tmp_dir())
|
filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
|
||||||
for filename in filenames:
|
for filename in filenames:
|
||||||
result.add_media(Media(filename))
|
result.add_media(Media(filename))
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import datetime, os, yt_dlp
|
|||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from . import Archiver
|
from . import Archiver
|
||||||
from ..core import Metadata, Media
|
from ..core import Metadata, Media, ArchivingContext
|
||||||
|
|
||||||
|
|
||||||
class YoutubeDLArchiver(Archiver):
|
class YoutubeDLArchiver(Archiver):
|
||||||
@@ -25,7 +25,7 @@ class YoutubeDLArchiver(Archiver):
|
|||||||
logger.debug('Using Facebook cookie')
|
logger.debug('Using Facebook cookie')
|
||||||
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
|
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
|
||||||
|
|
||||||
ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(item.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False})
|
ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False})
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# don'd download since it can be a live stream
|
# don'd download since it can be a live stream
|
||||||
|
|||||||
@@ -1,34 +0,0 @@
|
|||||||
|
|
||||||
#TODO: refactor GDriveStorage before merging to main
|
|
||||||
# is it possible to have something like this with the new pipeline?
|
|
||||||
|
|
||||||
|
|
||||||
# # import tempfile
|
|
||||||
# import auto_archive
|
|
||||||
# from loguru import logger
|
|
||||||
# from configs import Config
|
|
||||||
# from storages import Storage
|
|
||||||
|
|
||||||
|
|
||||||
# def main():
|
|
||||||
# c = Config()
|
|
||||||
# c.parse()
|
|
||||||
# logger.info(f'Opening document {c.sheet} to look for sheet names to archive')
|
|
||||||
|
|
||||||
# gc = c.gsheets_client
|
|
||||||
# sh = gc.open(c.sheet)
|
|
||||||
|
|
||||||
# wks = sh.get_worksheet(0)
|
|
||||||
# values = wks.get_all_values()
|
|
||||||
|
|
||||||
# with tempfile.TemporaryDirectory(dir="./") as tmpdir:
|
|
||||||
# Storage.TMP_FOLDER = tmpdir
|
|
||||||
# for i in range(11, len(values)):
|
|
||||||
# c.sheet = values[i][0]
|
|
||||||
# logger.info(f"Processing {c.sheet}")
|
|
||||||
# auto_archive.process_sheet(c)
|
|
||||||
# c.destroy_webdriver()
|
|
||||||
|
|
||||||
|
|
||||||
# if __name__ == "__main__":
|
|
||||||
# main()
|
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
from .media import Media
|
|
||||||
from .metadata import Metadata
|
from .metadata import Metadata
|
||||||
|
from .media import Media
|
||||||
from .step import Step
|
from .step import Step
|
||||||
|
from .context import ArchivingContext
|
||||||
|
|
||||||
# cannot import ArchivingOrchestrator/Config to avoid circular dep
|
# cannot import ArchivingOrchestrator/Config to avoid circular dep
|
||||||
# from .orchestrator import ArchivingOrchestrator
|
# from .orchestrator import ArchivingOrchestrator
|
||||||
|
|||||||
52
src/auto_archiver/core/context.py
Normal file
52
src/auto_archiver/core/context.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
|
||||||
|
class ArchivingContext:
|
||||||
|
"""
|
||||||
|
Singleton context class.
|
||||||
|
ArchivingContext._get_instance() to retrieve it if needed
|
||||||
|
otherwise just
|
||||||
|
ArchivingContext.set(key, value)
|
||||||
|
and
|
||||||
|
ArchivingContext.get(key, default)
|
||||||
|
|
||||||
|
When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True)
|
||||||
|
reset(full_reset=True) will recreate everything including the keep_on_reset status
|
||||||
|
"""
|
||||||
|
_instance = None
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.configs = {}
|
||||||
|
self.keep_on_reset = set()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_instance():
|
||||||
|
if ArchivingContext._instance is None:
|
||||||
|
ArchivingContext._instance = ArchivingContext()
|
||||||
|
return ArchivingContext._instance
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def set(key, value, keep_on_reset: bool = False):
|
||||||
|
ac = ArchivingContext.get_instance()
|
||||||
|
ac.configs[key] = value
|
||||||
|
if keep_on_reset: ac.keep_on_reset.add(key)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get(key: str, default=None):
|
||||||
|
return ArchivingContext.get_instance().configs.get(key, default)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def reset(full_reset: bool = False):
|
||||||
|
ac = ArchivingContext.get_instance()
|
||||||
|
if full_reset: ac.keep_on_reset = set()
|
||||||
|
ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
|
||||||
|
|
||||||
|
# ---- custom getters/setters for widely used context values
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def set_tmp_dir(tmp_dir: str):
|
||||||
|
ArchivingContext.get_instance().configs["tmp_dir"] = tmp_dir
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_tmp_dir() -> str:
|
||||||
|
return ArchivingContext.get_instance().configs.get("tmp_dir")
|
||||||
@@ -3,18 +3,46 @@ from __future__ import annotations
|
|||||||
from ast import List
|
from ast import List
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from dataclasses_json import dataclass_json
|
from dataclasses_json import dataclass_json, config
|
||||||
import mimetypes
|
import mimetypes
|
||||||
|
|
||||||
# annotation order matters
|
from .context import ArchivingContext
|
||||||
@dataclass_json
|
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass_json # annotation order matters
|
||||||
@dataclass
|
@dataclass
|
||||||
class Media:
|
class Media:
|
||||||
filename: str
|
filename: str
|
||||||
key: str = None
|
key: str = None
|
||||||
urls: List[str] = field(default_factory=list)
|
urls: List[str] = field(default_factory=list)
|
||||||
_mimetype: str = None # eg: image/jpeg
|
|
||||||
properties: dict = field(default_factory=dict)
|
properties: dict = field(default_factory=dict)
|
||||||
|
_mimetype: str = None # eg: image/jpeg
|
||||||
|
_stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True)) # always exclude
|
||||||
|
|
||||||
|
def store(self: Media, override_storages: List = None, url: str = "url-not-available"):
|
||||||
|
# stores the media into the provided/available storages [Storage]
|
||||||
|
# repeats the process for its properties, in case they have inner media themselves
|
||||||
|
# for now it only goes down 1 level but it's easy to make it recursive if needed
|
||||||
|
storages = override_storages or ArchivingContext.get("storages")
|
||||||
|
if not len(storages):
|
||||||
|
logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
|
||||||
|
return
|
||||||
|
|
||||||
|
for s in storages:
|
||||||
|
s.store(self, url)
|
||||||
|
# Media can be inside media properties, examples include transformations on original media
|
||||||
|
for prop in self.properties.values():
|
||||||
|
if isinstance(prop, Media):
|
||||||
|
s.store(prop, url)
|
||||||
|
if isinstance(prop, list):
|
||||||
|
for prop_media in prop:
|
||||||
|
if isinstance(prop_media, Media):
|
||||||
|
s.store(prop_media, url)
|
||||||
|
|
||||||
|
def is_stored(self) -> bool:
|
||||||
|
return len(self.urls) > 0 and len(self.urls) == len(ArchivingContext.get("storages"))
|
||||||
|
|
||||||
def set(self, key: str, value: Any) -> Media:
|
def set(self, key: str, value: Any) -> Media:
|
||||||
self.properties[key] = value
|
self.properties[key] = value
|
||||||
@@ -40,3 +68,6 @@ class Media:
|
|||||||
|
|
||||||
def is_video(self) -> bool:
|
def is_video(self) -> bool:
|
||||||
return self.mimetype.startswith("video")
|
return self.mimetype.startswith("video")
|
||||||
|
|
||||||
|
def is_audio(self) -> bool:
|
||||||
|
return self.mimetype.startswith("audio")
|
||||||
|
|||||||
@@ -3,24 +3,25 @@ from __future__ import annotations
|
|||||||
from ast import List, Set
|
from ast import List, Set
|
||||||
from typing import Any, Union, Dict
|
from typing import Any, Union, Dict
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from dataclasses_json import dataclass_json
|
from dataclasses_json import dataclass_json, config
|
||||||
import datetime
|
import datetime
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from dateutil.parser import parse as parse_dt
|
from dateutil.parser import parse as parse_dt
|
||||||
from .media import Media
|
from .media import Media
|
||||||
|
from .context import ArchivingContext
|
||||||
|
|
||||||
|
|
||||||
# annotation order matters
|
@dataclass_json # annotation order matters
|
||||||
@dataclass_json
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Metadata:
|
class Metadata:
|
||||||
status: str = "no archiver"
|
status: str = "no archiver"
|
||||||
_processed_at: datetime = field(default_factory=datetime.datetime.utcnow)
|
|
||||||
metadata: Dict[str, Any] = field(default_factory=dict)
|
metadata: Dict[str, Any] = field(default_factory=dict)
|
||||||
tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True}) # keys that are not to be saved in DBs
|
|
||||||
media: List[Media] = field(default_factory=list)
|
media: List[Media] = field(default_factory=list)
|
||||||
rearchivable: bool = True # defaults to true, archivers can overwrite
|
rearchivable: bool = True # defaults to true, archivers can overwrite
|
||||||
|
|
||||||
|
def __post_init__(self):
|
||||||
|
self.set("_processed_at", datetime.datetime.utcnow())
|
||||||
|
|
||||||
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
|
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
|
||||||
"""
|
"""
|
||||||
merges two Metadata instances, will overwrite according to overwrite_left flag
|
merges two Metadata instances, will overwrite according to overwrite_left flag
|
||||||
@@ -30,7 +31,6 @@ class Metadata:
|
|||||||
if right.status and len(right.status):
|
if right.status and len(right.status):
|
||||||
self.status = right.status
|
self.status = right.status
|
||||||
self.rearchivable |= right.rearchivable
|
self.rearchivable |= right.rearchivable
|
||||||
self.tmp_keys |= right.tmp_keys
|
|
||||||
for k, v in right.metadata.items():
|
for k, v in right.metadata.items():
|
||||||
assert k not in self.metadata or type(v) == type(self.get(k))
|
assert k not in self.metadata or type(v) == type(self.get(k))
|
||||||
if type(v) not in [dict, list, set] or k not in self.metadata:
|
if type(v) not in [dict, list, set] or k not in self.metadata:
|
||||||
@@ -43,10 +43,14 @@ class Metadata:
|
|||||||
return right.merge(self)
|
return right.merge(self)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def set(self, key: str, val: Any, is_tmp=False) -> Metadata:
|
def store(self: Metadata, override_storages: List = None):
|
||||||
# if not self.metadata: self.metadata = {}
|
# calls .store for all contained media. storages [Storage]
|
||||||
|
storages = override_storages or ArchivingContext.get("storages")
|
||||||
|
for media in self.media:
|
||||||
|
media.store(override_storages=storages, url=self.get_url())
|
||||||
|
|
||||||
|
def set(self, key: str, val: Any) -> Metadata:
|
||||||
self.metadata[key] = val
|
self.metadata[key] = val
|
||||||
if is_tmp: self.tmp_keys.add(key)
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def get(self, key: str, default: Any = None, create_if_missing=False) -> Union[Metadata, str]:
|
def get(self, key: str, default: Any = None, create_if_missing=False) -> Union[Metadata, str]:
|
||||||
@@ -64,7 +68,7 @@ class Metadata:
|
|||||||
return "success" in self.status
|
return "success" in self.status
|
||||||
|
|
||||||
def is_empty(self) -> bool:
|
def is_empty(self) -> bool:
|
||||||
return not self.is_success() and len(self.media) == 0 and len(self.get_clean_metadata()) <= 2 # url, processed_at
|
return not self.is_success() and len(self.media) == 0 and len(self.metadata) <= 2 # url, processed_at
|
||||||
|
|
||||||
@property # getter .netloc
|
@property # getter .netloc
|
||||||
def netloc(self) -> str:
|
def netloc(self) -> str:
|
||||||
@@ -85,7 +89,8 @@ class Metadata:
|
|||||||
|
|
||||||
def set_content(self, content: str) -> Metadata:
|
def set_content(self, content: str) -> Metadata:
|
||||||
# a dump with all the relevant content
|
# a dump with all the relevant content
|
||||||
return self.set("content", content)
|
append_content = (self.get("content", "") + content + "\n").strip()
|
||||||
|
return self.set("content", append_content)
|
||||||
|
|
||||||
def set_title(self, title: str) -> Metadata:
|
def set_title(self, title: str) -> Metadata:
|
||||||
return self.set("title", title)
|
return self.set("title", title)
|
||||||
@@ -93,12 +98,6 @@ class Metadata:
|
|||||||
def get_title(self) -> str:
|
def get_title(self) -> str:
|
||||||
return self.get("title")
|
return self.get("title")
|
||||||
|
|
||||||
def set_tmp_dir(self, tmp_dir: str) -> Metadata:
|
|
||||||
return self.set("tmp_dir", tmp_dir, True)
|
|
||||||
|
|
||||||
def get_tmp_dir(self) -> str:
|
|
||||||
return self.get("tmp_dir")
|
|
||||||
|
|
||||||
def set_timestamp(self, timestamp: datetime.datetime) -> Metadata:
|
def set_timestamp(self, timestamp: datetime.datetime) -> Metadata:
|
||||||
if type(timestamp) == str:
|
if type(timestamp) == str:
|
||||||
timestamp = parse_dt(timestamp)
|
timestamp = parse_dt(timestamp)
|
||||||
@@ -139,8 +138,5 @@ class Metadata:
|
|||||||
_default = self.media[0] if len(self.media) else None
|
_default = self.media[0] if len(self.media) else None
|
||||||
return self.get_media_by_id("_final_media", _default)
|
return self.get_media_by_id("_final_media", _default)
|
||||||
|
|
||||||
def get_clean_metadata(self) -> Metadata:
|
def __str__(self) -> str:
|
||||||
return dict(
|
return self.__repr__()
|
||||||
{k: v for k, v in self.metadata.items() if k not in self.tmp_keys},
|
|
||||||
**{"processed_at": self._processed_at}
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -2,6 +2,8 @@ from __future__ import annotations
|
|||||||
from ast import List
|
from ast import List
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
|
from .context import ArchivingContext
|
||||||
|
|
||||||
from ..archivers import Archiver
|
from ..archivers import Archiver
|
||||||
from ..feeders import Feeder
|
from ..feeders import Feeder
|
||||||
from ..formatters import Formatter
|
from ..formatters import Formatter
|
||||||
@@ -23,6 +25,7 @@ class ArchivingOrchestrator:
|
|||||||
self.archivers: List[Archiver] = config.archivers
|
self.archivers: List[Archiver] = config.archivers
|
||||||
self.databases: List[Database] = config.databases
|
self.databases: List[Database] = config.databases
|
||||||
self.storages: List[Storage] = config.storages
|
self.storages: List[Storage] = config.storages
|
||||||
|
ArchivingContext.set("storages", self.storages, keep_on_reset=True)
|
||||||
|
|
||||||
for a in self.archivers: a.setup()
|
for a in self.archivers: a.setup()
|
||||||
|
|
||||||
@@ -32,8 +35,9 @@ class ArchivingOrchestrator:
|
|||||||
|
|
||||||
def feed_item(self, item: Metadata) -> Metadata:
|
def feed_item(self, item: Metadata) -> Metadata:
|
||||||
try:
|
try:
|
||||||
|
ArchivingContext.reset()
|
||||||
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
|
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
|
||||||
item.set_tmp_dir(tmp_dir)
|
ArchivingContext.set_tmp_dir(tmp_dir)
|
||||||
return self.archive(item)
|
return self.archive(item)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
# catches keyboard interruptions to do a clean exit
|
# catches keyboard interruptions to do a clean exit
|
||||||
@@ -89,7 +93,7 @@ class ArchivingOrchestrator:
|
|||||||
# Q: should this be refactored so it's just a.download(result)?
|
# Q: should this be refactored so it's just a.download(result)?
|
||||||
result.merge(a.download(result))
|
result.merge(a.download(result))
|
||||||
if result.is_success(): break
|
if result.is_success(): break
|
||||||
except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}")
|
except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}: {traceback.format_exc()}")
|
||||||
|
|
||||||
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
|
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
|
||||||
# should it call the HTMLgenerator as if it's not an enrichment?
|
# should it call the HTMLgenerator as if it's not an enrichment?
|
||||||
@@ -101,26 +105,16 @@ class ArchivingOrchestrator:
|
|||||||
# eg: screenshot, wacz, webarchive, thumbnails
|
# eg: screenshot, wacz, webarchive, thumbnails
|
||||||
for e in self.enrichers:
|
for e in self.enrichers:
|
||||||
try: e.enrich(result)
|
try: e.enrich(result)
|
||||||
except Exception as exc: logger.error(f"Unexpected error with enricher {e.name}: {exc}")
|
except Exception as exc: logger.error(f"Unexpected error with enricher {e.name}: {exc}: {traceback.format_exc()}")
|
||||||
|
|
||||||
# 5 - store media
|
# 5 - store media
|
||||||
# looks for Media in result.media and also result.media[x].properties (as list or dict values)
|
# looks for Media in result.media and also result.media[x].properties (as list or dict values)
|
||||||
for s in self.storages:
|
result.store()
|
||||||
for m in result.media:
|
|
||||||
s.store(m, result) # modifies media
|
|
||||||
# Media can be inside media properties, examples include transformations on original media
|
|
||||||
for prop in m.properties.values():
|
|
||||||
if isinstance(prop, Media):
|
|
||||||
s.store(prop, result)
|
|
||||||
if isinstance(prop, list) and len(prop) > 0 and isinstance(prop[0], Media):
|
|
||||||
for prop_media in prop:
|
|
||||||
s.store(prop_media, result)
|
|
||||||
|
|
||||||
# 6 - format and store formatted if needed
|
# 6 - format and store formatted if needed
|
||||||
# enrichers typically need access to already stored URLs etc
|
# enrichers typically need access to already stored URLs etc
|
||||||
if (final_media := self.formatter.format(result)):
|
if (final_media := self.formatter.format(result)):
|
||||||
for s in self.storages:
|
final_media.store(url=url)
|
||||||
s.store(final_media, result)
|
|
||||||
result.set_final_media(final_media)
|
result.set_final_media(final_media)
|
||||||
|
|
||||||
if result.is_empty():
|
if result.is_empty():
|
||||||
|
|||||||
@@ -5,8 +5,7 @@ from urllib.parse import quote
|
|||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from . import Database
|
from . import Database
|
||||||
from ..core import Metadata
|
from ..core import Metadata, Media, ArchivingContext
|
||||||
from ..core import Media
|
|
||||||
from ..utils import GWorksheet
|
from ..utils import GWorksheet
|
||||||
|
|
||||||
|
|
||||||
@@ -63,8 +62,9 @@ class GsheetsDb(Database):
|
|||||||
batch_if_valid('archive', "\n".join(media.urls))
|
batch_if_valid('archive', "\n".join(media.urls))
|
||||||
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
|
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
|
||||||
batch_if_valid('title', item.get_title())
|
batch_if_valid('title', item.get_title())
|
||||||
batch_if_valid('text', item.get("content", "")[:500])
|
batch_if_valid('text', item.get("content", ""))
|
||||||
batch_if_valid('timestamp', item.get_timestamp())
|
batch_if_valid('timestamp', item.get_timestamp())
|
||||||
|
batch_if_valid('hash', media.get("hash", "not-calculated"))
|
||||||
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
|
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
|
||||||
batch_if_valid('screenshot', "\n".join(screenshot.urls))
|
batch_if_valid('screenshot', "\n".join(screenshot.urls))
|
||||||
|
|
||||||
@@ -86,7 +86,7 @@ class GsheetsDb(Database):
|
|||||||
logger.debug(f"Unable to update sheet: {e}")
|
logger.debug(f"Unable to update sheet: {e}")
|
||||||
|
|
||||||
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
||||||
# TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now
|
# TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
|
||||||
gw: GWorksheet = item.get("gsheet").get("worksheet")
|
gw: GWorksheet = ArchivingContext.get("gsheet").get("worksheet")
|
||||||
row: int = item.get("gsheet").get("row")
|
row: int = ArchivingContext.get("gsheet").get("row")
|
||||||
return gw, row
|
return gw, row
|
||||||
|
|||||||
@@ -3,4 +3,5 @@ from .screenshot_enricher import ScreenshotEnricher
|
|||||||
from .wayback_enricher import WaybackArchiverEnricher
|
from .wayback_enricher import WaybackArchiverEnricher
|
||||||
from .hash_enricher import HashEnricher
|
from .hash_enricher import HashEnricher
|
||||||
from .thumbnail_enricher import ThumbnailEnricher
|
from .thumbnail_enricher import ThumbnailEnricher
|
||||||
from .wacz_enricher import WaczEnricher
|
from .wacz_enricher import WaczEnricher
|
||||||
|
from .whisper_enricher import WhisperEnricher
|
||||||
@@ -2,7 +2,7 @@ import hashlib
|
|||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from . import Enricher
|
from . import Enricher
|
||||||
from ..core import Metadata
|
from ..core import Metadata, ArchivingContext
|
||||||
|
|
||||||
|
|
||||||
class HashEnricher(Enricher):
|
class HashEnricher(Enricher):
|
||||||
@@ -16,11 +16,14 @@ class HashEnricher(Enricher):
|
|||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
algo_choices = self.configs()["algorithm"]["choices"]
|
algo_choices = self.configs()["algorithm"]["choices"]
|
||||||
assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
|
assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
|
||||||
|
self.chunksize = int(self.chunksize)
|
||||||
|
ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def configs() -> dict:
|
def configs() -> dict:
|
||||||
return {
|
return {
|
||||||
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}
|
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
|
||||||
|
"chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
|
||||||
}
|
}
|
||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> None:
|
def enrich(self, to_enrich: Metadata) -> None:
|
||||||
@@ -28,12 +31,19 @@ class HashEnricher(Enricher):
|
|||||||
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
|
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
|
||||||
|
|
||||||
for i, m in enumerate(to_enrich.media):
|
for i, m in enumerate(to_enrich.media):
|
||||||
with open(m.filename, "rb") as f:
|
if len(hd := self.calculate_hash(m.filename)):
|
||||||
bytes = f.read() # read entire file as bytes
|
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
|
||||||
hash = None
|
|
||||||
if self.algorithm == "SHA-256":
|
def calculate_hash(self, filename):
|
||||||
hash = hashlib.sha256(bytes)
|
hash = None
|
||||||
elif self.algorithm == "SHA3-512":
|
if self.algorithm == "SHA-256":
|
||||||
hash = hashlib.sha3_512(bytes)
|
hash = hashlib.sha256()
|
||||||
else: continue
|
elif self.algorithm == "SHA3-512":
|
||||||
to_enrich.media[i].set("hash", f"{self.algorithm}:{hash.hexdigest()}")
|
hash = hashlib.sha3_512()
|
||||||
|
else: return ""
|
||||||
|
with open(filename, "rb") as f:
|
||||||
|
while True:
|
||||||
|
buf = f.read(self.chunksize)
|
||||||
|
if not buf: break
|
||||||
|
hash.update(buf)
|
||||||
|
return hash.hexdigest()
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ from selenium.common.exceptions import TimeoutException
|
|||||||
|
|
||||||
from . import Enricher
|
from . import Enricher
|
||||||
from ..utils import Webdriver, UrlUtil
|
from ..utils import Webdriver, UrlUtil
|
||||||
from ..core import Media, Metadata
|
from ..core import Media, Metadata, ArchivingContext
|
||||||
|
|
||||||
class ScreenshotEnricher(Enricher):
|
class ScreenshotEnricher(Enricher):
|
||||||
name = "screenshot_enricher"
|
name = "screenshot_enricher"
|
||||||
@@ -14,7 +14,8 @@ class ScreenshotEnricher(Enricher):
|
|||||||
return {
|
return {
|
||||||
"width": {"default": 1280, "help": "width of the screenshots"},
|
"width": {"default": 1280, "help": "width of the screenshots"},
|
||||||
"height": {"default": 720, "help": "height of the screenshots"},
|
"height": {"default": 720, "help": "height of the screenshots"},
|
||||||
"timeout": {"default": 60, "help": "timeout for taking the screenshot"}
|
"timeout": {"default": 60, "help": "timeout for taking the screenshot"},
|
||||||
|
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}
|
||||||
}
|
}
|
||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> None:
|
def enrich(self, to_enrich: Metadata) -> None:
|
||||||
@@ -27,12 +28,11 @@ class ScreenshotEnricher(Enricher):
|
|||||||
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:
|
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:
|
||||||
try:
|
try:
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
time.sleep(2)
|
time.sleep(int(self.sleep_before_screenshot))
|
||||||
screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
|
screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
|
||||||
driver.save_screenshot(screenshot_file)
|
driver.save_screenshot(screenshot_file)
|
||||||
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
|
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
|
||||||
except TimeoutException:
|
except TimeoutException:
|
||||||
logger.info("TimeoutException loading page for screenshot")
|
logger.info("TimeoutException loading page for screenshot")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
|
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
|
||||||
# return None
|
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import ffmpeg, os, uuid
|
|||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from . import Enricher
|
from . import Enricher
|
||||||
from ..core import Media, Metadata
|
from ..core import Media, Metadata, ArchivingContext
|
||||||
|
|
||||||
|
|
||||||
class ThumbnailEnricher(Enricher):
|
class ThumbnailEnricher(Enricher):
|
||||||
@@ -23,7 +23,7 @@ class ThumbnailEnricher(Enricher):
|
|||||||
logger.debug(f"generating thumbnails")
|
logger.debug(f"generating thumbnails")
|
||||||
for i, m in enumerate(to_enrich.media[::]):
|
for i, m in enumerate(to_enrich.media[::]):
|
||||||
if m.is_video():
|
if m.is_video():
|
||||||
folder = os.path.join(to_enrich.get_tmp_dir(), str(uuid.uuid4()))
|
folder = os.path.join(ArchivingContext.get_tmp_dir(), str(uuid.uuid4()))
|
||||||
os.makedirs(folder, exist_ok=True)
|
os.makedirs(folder, exist_ok=True)
|
||||||
logger.debug(f"generating thumbnails for {m.filename}")
|
logger.debug(f"generating thumbnails for {m.filename}")
|
||||||
fps, duration = 0.5, m.get("duration")
|
fps, duration = 0.5, m.get("duration")
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
import os, shutil, subprocess, uuid
|
import os, shutil, subprocess, uuid
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from ..core import Media, Metadata
|
from ..core import Media, Metadata, ArchivingContext
|
||||||
from . import Enricher
|
from . import Enricher
|
||||||
|
from ..utils import UrlUtil
|
||||||
|
|
||||||
|
|
||||||
class WaczEnricher(Enricher):
|
class WaczEnricher(Enricher):
|
||||||
@@ -20,35 +21,63 @@ class WaczEnricher(Enricher):
|
|||||||
return {
|
return {
|
||||||
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
|
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
|
||||||
"timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"},
|
"timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"},
|
||||||
|
"ignore_auth_wall": {"default": True, "help": "skip URL if it is behind authentication wall, set to False if you have browsertrix profile configured for private content."},
|
||||||
}
|
}
|
||||||
|
|
||||||
def enrich(self, to_enrich: Metadata) -> bool:
|
def enrich(self, to_enrich: Metadata) -> bool:
|
||||||
# TODO: figure out support for browsertrix in docker
|
# TODO: figure out support for browsertrix in docker
|
||||||
|
|
||||||
url = to_enrich.get_url()
|
url = to_enrich.get_url()
|
||||||
logger.debug(f"generating WACZ for {url=}")
|
|
||||||
|
if UrlUtil.is_auth_wall(url):
|
||||||
|
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
|
||||||
|
return
|
||||||
|
|
||||||
collection = str(uuid.uuid4())[0:8]
|
collection = str(uuid.uuid4())[0:8]
|
||||||
browsertrix_home = os.path.abspath(to_enrich.get_tmp_dir())
|
browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
|
||||||
cmd = [
|
|
||||||
"docker", "run",
|
if os.getenv('RUNNING_IN_DOCKER'):
|
||||||
"--rm", # delete container once it has completed running
|
logger.debug(f"generating WACZ without Docker for {url=}")
|
||||||
"-v", f"{browsertrix_home}:/crawls/",
|
|
||||||
# "-it", # this leads to "the input device is not a TTY"
|
cmd = [
|
||||||
"webrecorder/browsertrix-crawler", "crawl",
|
"crawl",
|
||||||
"--url", url,
|
"--url", url,
|
||||||
"--scopeType", "page",
|
"--scopeType", "page",
|
||||||
"--generateWACZ",
|
"--generateWACZ",
|
||||||
"--text",
|
"--text",
|
||||||
"--collection", collection,
|
"--collection", collection,
|
||||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
"--id", collection,
|
||||||
"--behaviorTimeout", str(self.timeout),
|
"--saveState", "never",
|
||||||
"--timeout", str(self.timeout)
|
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||||
]
|
"--behaviorTimeout", str(self.timeout),
|
||||||
if self.profile:
|
"--timeout", str(self.timeout),
|
||||||
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
|
"--profile", str(self.profile)
|
||||||
shutil.copyfile(self.profile, profile_fn)
|
]
|
||||||
# TODO: test which is right
|
else:
|
||||||
cmd.extend(["--profile", profile_fn])
|
logger.debug(f"generating WACZ in Docker for {url=}")
|
||||||
# cmd.extend(["--profile", "/crawls/profile.tar.gz"])
|
|
||||||
|
cmd = [
|
||||||
|
"docker", "run",
|
||||||
|
"--rm", # delete container once it has completed running
|
||||||
|
"-v", f"{browsertrix_home}:/crawls/",
|
||||||
|
# "-it", # this leads to "the input device is not a TTY"
|
||||||
|
"webrecorder/browsertrix-crawler", "crawl",
|
||||||
|
"--url", url,
|
||||||
|
"--scopeType", "page",
|
||||||
|
"--generateWACZ",
|
||||||
|
"--text",
|
||||||
|
"--collection", collection,
|
||||||
|
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||||
|
"--behaviorTimeout", str(self.timeout),
|
||||||
|
"--timeout", str(self.timeout)
|
||||||
|
]
|
||||||
|
|
||||||
|
if self.profile:
|
||||||
|
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
|
||||||
|
shutil.copyfile(self.profile, profile_fn)
|
||||||
|
# TODO: test which is right
|
||||||
|
cmd.extend(["--profile", profile_fn])
|
||||||
|
# cmd.extend(["--profile", "/crawls/profile.tar.gz"])
|
||||||
|
|
||||||
try:
|
try:
|
||||||
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
|
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
|
||||||
@@ -57,7 +86,13 @@ class WaczEnricher(Enricher):
|
|||||||
logger.error(f"WACZ generation failed: {e}")
|
logger.error(f"WACZ generation failed: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
|
|
||||||
|
|
||||||
|
if os.getenv('RUNNING_IN_DOCKER'):
|
||||||
|
filename = os.path.join("collections", collection, f"{collection}.wacz")
|
||||||
|
else:
|
||||||
|
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
|
||||||
|
|
||||||
if not os.path.exists(filename):
|
if not os.path.exists(filename):
|
||||||
logger.warning(f"Unable to locate and upload WACZ {filename=}")
|
logger.warning(f"Unable to locate and upload WACZ {filename=}")
|
||||||
return False
|
return False
|
||||||
|
|||||||
130
src/auto_archiver/enrichers/whisper_enricher.py
Normal file
130
src/auto_archiver/enrichers/whisper_enricher.py
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
import traceback
|
||||||
|
import requests, time
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from . import Enricher
|
||||||
|
from ..core import Metadata, Media, ArchivingContext
|
||||||
|
from ..storages import S3Storage
|
||||||
|
|
||||||
|
|
||||||
|
class WhisperEnricher(Enricher):
    """
    Connects with a Whisper API service to get texts out of audio
    whisper API repository: TODO
    Only works if an S3 compatible storage is used

    Workflow: every audio/video media item is stored, submitted as a job to
    the API, polled until it finishes (or `timeout` seconds elapse), and the
    outcome is written to the media's "whisper_model" property.
    """
    name = "whisper_enricher"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
        super().__init__(config)
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        assert isinstance(self.api_key, str) and len(self.api_key) > 0, "please provide a value for the whisper_enricher api_key"
        self.timeout = int(self.timeout)

    @staticmethod
    def configs() -> dict:
        """Return the configuration options of this step with their defaults and help texts."""
        return {
            "api_endpoint": {"default": "https://whisper.spoettel.dev/api/v1", "help": "WhisperApi api endpoint"},
            "api_key": {"default": None, "help": "WhisperApi api key for authentication"},
            "include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
            "timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
            "action": {"default": "translation", "help": "which Whisper operation to execute", "choices": ["transcript", "translation", "language_detection"]},
        }

    def enrich(self, to_enrich: Metadata) -> None:
        """
        Submit one whisper job per audio/video media item, wait for the jobs
        and attach each outcome to the media's "whisper_model" property.
        Text artifacts are additionally appended to the item's content.
        """
        if not self._get_s3_storage():
            logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
            return

        url = to_enrich.get_url()
        logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")

        job_results = {}
        for m in to_enrich.media:
            if not (m.is_video() or m.is_audio()):
                continue
            m.store(url=url)
            try:
                job_id = self.submit_job(m)
                job_results[job_id] = False
                logger.debug(f"JOB SUBMITTED: {job_id=} for {m.key=}")
                m.set("whisper_model", {"job_id": job_id})
            except Exception as e:
                logger.error(f"Failed to submit whisper job for {m.filename=} with error {e}\n{traceback.format_exc()}")

        job_results = self.check_jobs(job_results)

        for m in to_enrich.media:
            if not (m.is_video() or m.is_audio()):
                continue
            # a failed submission leaves no job info on the media: skip it
            # instead of crashing on a missing "job_id"
            # NOTE(review): assumes Media.get returns None for an unset key - confirm
            job_info = m.get("whisper_model")
            if not isinstance(job_info, dict) or "job_id" not in job_info:
                continue
            job_id = job_info["job_id"]
            outcome = job_results.get(job_id, False)

            if isinstance(outcome, dict) and outcome:
                details = outcome
            elif isinstance(outcome, str) and outcome:
                # check_job reports API-side failures as an "Error: ..." string,
                # which cannot be **-unpacked like the artifacts dict
                details = {"result": outcome}
            else:
                details = {"result": "incomplete or failed job"}
            m.set("whisper_model", {"job_id": job_id, **details})

            # append the extracted text to the content of the post so it gets written to the DBs like gsheets text column
            if isinstance(outcome, dict):
                for k, v in outcome.items():
                    if "_text" in k and len(v):
                        to_enrich.set_content(f"\n[automatic video transcript]: {v}")

    def submit_job(self, media: Media):
        """
        Create a whisper job for an already stored media item and return the
        job id reported by the API. Raises AssertionError when the media has no
        S3 url or the API does not answer with 201.
        """
        s3 = self._get_s3_storage()
        s3_url = s3.get_cdn_url(media)
        assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls "
        payload = {
            "url": s3_url,
            "type": self.action,
            # "language": "string" # may be a config
        }
        response = requests.post(f'{self.api_endpoint}/jobs', json=payload, headers={'Authorization': f'Bearer {self.api_key}'})
        assert response.status_code == 201, f"calling the whisper api {self.api_endpoint} returned a non-success code: {response.status_code}"
        job = response.json()  # parse the body once
        logger.debug(job)
        return job['id']

    def check_jobs(self, job_results: dict):
        """
        Poll all pending jobs (value is False) every 3 seconds until each has
        an outcome or `self.timeout` seconds have passed; returns the same
        dict with the outcomes filled in.
        """
        start_time = time.time()
        all_completed = False
        while not all_completed and (time.time() - start_time) <= self.timeout:
            all_completed = True
            for job_id in job_results:
                # only the False sentinel means "still pending"
                if job_results[job_id] is not False:
                    continue
                all_completed = False  # at least one not ready
                try:
                    job_results[job_id] = self.check_job(job_id)
                except Exception as e:
                    logger.error(f"Failed to check {job_id=} with error {e}\n{traceback.format_exc()}")
            if not all_completed:
                time.sleep(3)
        return job_results

    def check_job(self, job_id):
        """
        Query a single job. Returns False while it is processing (or in an
        unknown state), an "Error: ..." string when the API reports an error,
        and a dict of artifact texts (plus subtitles if `include_srt`) on
        success, after which the job is deleted on the API side.
        """
        auth = {'Authorization': f'Bearer {self.api_key}'}
        r = requests.get(f'{self.api_endpoint}/jobs/{job_id}', headers=auth)
        assert r.status_code == 200, f"Job status did not respond with 200, instead with: {r.status_code}"
        j = r.json()
        logger.debug(f"Checked job {job_id=} with status='{j['status']}'")
        if j['status'] == "processing":
            return False
        elif j['status'] == "error":
            return f"Error: {j['meta']['error']}"
        elif j['status'] == "success":
            r_res = requests.get(f'{self.api_endpoint}/jobs/{job_id}/artifacts', headers=auth)
            assert r_res.status_code == 200, f"Job artifacts did not respond with 200, instead with: {r_res.status_code}"
            artifacts = r_res.json()  # parse the body once
            logger.success(artifacts)
            result = {}
            for art_id, artifact in enumerate(artifacts):
                subtitle = []
                full_text = []
                # an artifact without "data" is treated as empty and skipped below
                for i, d in enumerate(artifact.get("data") or []):
                    subtitle.append(f"{i+1}\n{d.get('start')} --> {d.get('end')}\n{d.get('text').strip()}")
                    full_text.append(d.get('text').strip())
                if not subtitle:
                    continue
                if self.include_srt:
                    result[f"artifact_{art_id}_subtitle"] = "\n".join(subtitle)
                result[f"artifact_{art_id}_text"] = "\n".join(full_text)
            # call /delete endpoint on timely success
            r_del = requests.delete(f'{self.api_endpoint}/jobs/{job_id}', headers=auth)
            logger.debug(f"DELETE whisper {job_id=} result: {r_del.status_code}")
            return result
        return False

    def _get_s3_storage(self) -> S3Storage:
        """Return the first configured storage whose class is exactly S3Storage, or None (with a warning) if there is none."""
        try:
            s3 = next((s for s in ArchivingContext.get("storages") if s.__class__ == S3Storage), None)
        except Exception:
            # best-effort lookup: a missing/invalid "storages" entry counts as "no S3 storage"
            s3 = None
        if s3 is None:
            logger.warning("No S3Storage instance found in storages")
        return s3
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from . import Feeder
|
from . import Feeder
|
||||||
from ..core import Metadata
|
from ..core import Metadata, ArchivingContext
|
||||||
|
|
||||||
|
|
||||||
class CLIFeeder(Feeder):
|
class CLIFeeder(Feeder):
|
||||||
@@ -26,5 +26,7 @@ class CLIFeeder(Feeder):
|
|||||||
def __iter__(self) -> Metadata:
|
def __iter__(self) -> Metadata:
|
||||||
for url in self.urls:
|
for url in self.urls:
|
||||||
logger.debug(f"Processing {url}")
|
logger.debug(f"Processing {url}")
|
||||||
yield Metadata().set_url(url).set("folder", "cli", True)
|
yield Metadata().set_url(url)
|
||||||
|
ArchivingContext.set("folder", "cli")
|
||||||
|
|
||||||
logger.success(f"Processed {len(self.urls)} URL(s)")
|
logger.success(f"Processed {len(self.urls)} URL(s)")
|
||||||
|
|||||||
@@ -5,9 +5,10 @@ from slugify import slugify
|
|||||||
|
|
||||||
# from . import Enricher
|
# from . import Enricher
|
||||||
from . import Feeder
|
from . import Feeder
|
||||||
from ..core import Metadata
|
from ..core import Metadata, ArchivingContext
|
||||||
from ..utils import Gsheets, GWorksheet
|
from ..utils import Gsheets, GWorksheet
|
||||||
|
|
||||||
|
|
||||||
class GsheetsFeeder(Gsheets, Feeder):
|
class GsheetsFeeder(Gsheets, Feeder):
|
||||||
name = "gsheet_feeder"
|
name = "gsheet_feeder"
|
||||||
|
|
||||||
@@ -31,7 +32,7 @@ class GsheetsFeeder(Gsheets, Feeder):
|
|||||||
"help": "(CSV) explicitly block some worksheets from being processed",
|
"help": "(CSV) explicitly block some worksheets from being processed",
|
||||||
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||||
},
|
},
|
||||||
"use_sheet_names_in_stored_paths":{
|
"use_sheet_names_in_stored_paths": {
|
||||||
"default": True,
|
"default": True,
|
||||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||||
}
|
}
|
||||||
@@ -61,11 +62,17 @@ class GsheetsFeeder(Gsheets, Feeder):
|
|||||||
if status not in ['', None]: continue
|
if status not in ['', None]: continue
|
||||||
|
|
||||||
# All checks done - archival process starts here
|
# All checks done - archival process starts here
|
||||||
m = Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)
|
m = Metadata().set_url(url)
|
||||||
if self.use_sheet_names_in_stored_paths:
|
ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
|
||||||
m.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
|
folder = slugify(gw.get_cell(row, 'folder').strip())
|
||||||
|
if len(folder):
|
||||||
|
if self.use_sheet_names_in_stored_paths:
|
||||||
|
ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True)
|
||||||
|
else:
|
||||||
|
ArchivingContext.set("folder", folder, True)
|
||||||
|
|
||||||
yield m
|
yield m
|
||||||
|
|
||||||
logger.success(f'Finished worksheet {wks.title}')
|
logger.success(f'Finished worksheet {wks.title}')
|
||||||
|
|
||||||
def should_process_sheet(self, sheet_name: str) -> bool:
|
def should_process_sheet(self, sheet_name: str) -> bool:
|
||||||
|
|||||||
@@ -6,8 +6,9 @@ from urllib.parse import quote
|
|||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from ..version import __version__
|
from ..version import __version__
|
||||||
from ..core import Metadata, Media
|
from ..core import Metadata, Media, ArchivingContext
|
||||||
from . import Formatter
|
from . import Formatter
|
||||||
|
from ..enrichers import HashEnricher
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -40,17 +41,22 @@ class HtmlFormatter(Formatter):
|
|||||||
url=url,
|
url=url,
|
||||||
title=item.get_title(),
|
title=item.get_title(),
|
||||||
media=item.media,
|
media=item.media,
|
||||||
metadata=item.get_clean_metadata(),
|
metadata=item.metadata,
|
||||||
version=__version__
|
version=__version__
|
||||||
)
|
)
|
||||||
html_path = os.path.join(item.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
|
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
|
||||||
with open(html_path, mode="w", encoding="utf-8") as outf:
|
with open(html_path, mode="w", encoding="utf-8") as outf:
|
||||||
outf.write(content)
|
outf.write(content)
|
||||||
return Media(filename=html_path)
|
final_media = Media(filename=html_path)
|
||||||
|
|
||||||
|
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
|
||||||
|
if len(hd := he.calculate_hash(final_media.filename)):
|
||||||
|
final_media.set("hash", f"{he.algorithm}:{hd}")
|
||||||
|
|
||||||
|
return final_media
|
||||||
|
|
||||||
|
|
||||||
# JINJA helper filters
|
# JINJA helper filters
|
||||||
|
|
||||||
class JinjaHelpers:
|
class JinjaHelpers:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def is_list(v) -> bool:
|
def is_list(v) -> bool:
|
||||||
|
|||||||
@@ -29,7 +29,7 @@
|
|||||||
margin: auto;
|
margin: auto;
|
||||||
border: 1px solid;
|
border: 1px solid;
|
||||||
border-collapse: collapse;
|
border-collapse: collapse;
|
||||||
vertical-align:top;
|
vertical-align: top;
|
||||||
}
|
}
|
||||||
|
|
||||||
table.metadata td:first-child {
|
table.metadata td:first-child {
|
||||||
@@ -42,7 +42,7 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
.copy:hover {
|
.copy:hover {
|
||||||
font-weight: 600;
|
background: aliceblue;
|
||||||
cursor: copy;
|
cursor: copy;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -185,7 +185,11 @@
|
|||||||
el.addEventListener("copy", (e) => {
|
el.addEventListener("copy", (e) => {
|
||||||
e.preventDefault();
|
e.preventDefault();
|
||||||
if (e.clipboardData) {
|
if (e.clipboardData) {
|
||||||
e.clipboardData.setData("text/plain", el.textContent);
|
if (el.hasAttribute("copy-value")) {
|
||||||
|
e.clipboardData.setData("text/plain", el.getAttribute("copy-value"));
|
||||||
|
} else {
|
||||||
|
e.clipboardData.setData("text/plain", el.textContent);
|
||||||
|
}
|
||||||
console.log(e.clipboardData.getData("text"))
|
console.log(e.clipboardData.getData("text"))
|
||||||
showNotification("copied!")
|
showNotification("copied!")
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -46,14 +46,16 @@ No preview available for {{ m.key }}.
|
|||||||
{% endif %}
|
{% endif %}
|
||||||
{% if links %}
|
{% if links %}
|
||||||
<a href="{{ url }}">open</a> or
|
<a href="{{ url }}">open</a> or
|
||||||
<a href="{{ url }}" download="">download</a>
|
<a href="{{ url }}" download="">download</a> or
|
||||||
|
{{ copy_urlize(url, "copy") }}
|
||||||
|
|
||||||
<br>
|
<br>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
|
||||||
{%- endmacro -%}
|
{%- endmacro -%}
|
||||||
|
|
||||||
{% macro copy_urlize(val) -%}
|
{% macro copy_urlize(val, href_text) -%}
|
||||||
|
|
||||||
{% if val is mapping %}
|
{% if val is mapping %}
|
||||||
<ul>
|
<ul>
|
||||||
@@ -65,7 +67,11 @@ No preview available for {{ m.key }}.
|
|||||||
</ul>
|
</ul>
|
||||||
|
|
||||||
{% else %}
|
{% else %}
|
||||||
|
{% if href_text | length == 0 %}
|
||||||
<span class="copy">{{ val | string | urlize }}</span>
|
<span class="copy">{{ val | string | urlize }}</span>
|
||||||
|
{% else %}
|
||||||
|
<span class="copy" copy-value="{{val}}">{{ href_text | string | urlize }}</span>
|
||||||
|
{% endif %}
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
{%- endmacro -%}
|
{%- endmacro -%}
|
||||||
@@ -1,10 +1,10 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import hashlib
|
from typing import IO
|
||||||
from typing import IO, Any
|
|
||||||
|
|
||||||
from ..core import Media, Metadata, Step
|
from ..core import Media, Step, ArchivingContext
|
||||||
|
from ..enrichers import HashEnricher
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
import os, uuid
|
import os, uuid
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
@@ -41,8 +41,11 @@ class Storage(Step):
|
|||||||
# only for typing...
|
# only for typing...
|
||||||
return Step.init(name, config, Storage)
|
return Step.init(name, config, Storage)
|
||||||
|
|
||||||
def store(self, media: Media, item: Metadata) -> None:
|
def store(self, media: Media, url: str) -> None:
|
||||||
self.set_key(media, item)
|
if media.is_stored():
|
||||||
|
logger.debug(f"{media.key} already stored, skipping")
|
||||||
|
return
|
||||||
|
self.set_key(media, url)
|
||||||
self.upload(media)
|
self.upload(media)
|
||||||
media.add_url(self.get_cdn_url(media))
|
media.add_url(self.get_cdn_url(media))
|
||||||
|
|
||||||
@@ -57,25 +60,25 @@ class Storage(Step):
|
|||||||
with open(media.filename, 'rb') as f:
|
with open(media.filename, 'rb') as f:
|
||||||
return self.uploadf(f, media, **kwargs)
|
return self.uploadf(f, media, **kwargs)
|
||||||
|
|
||||||
def set_key(self, media: Media, item: Metadata) -> None:
|
def set_key(self, media: Media, url) -> None:
|
||||||
"""takes the media and optionally item info and generates a key"""
|
"""takes the media and optionally item info and generates a key"""
|
||||||
if media.key is not None and len(media.key) > 0: return
|
if media.key is not None and len(media.key) > 0: return
|
||||||
folder = item.get("folder", "")
|
folder = ArchivingContext.get("folder", "")
|
||||||
filename, ext = os.path.splitext(media.filename)
|
filename, ext = os.path.splitext(media.filename)
|
||||||
|
|
||||||
# path_generator logic
|
# path_generator logic
|
||||||
if self.path_generator == "flat":
|
if self.path_generator == "flat":
|
||||||
path = ""
|
path = ""
|
||||||
filename = slugify(filename) # in case it comes with os.sep
|
filename = slugify(filename) # in case it comes with os.sep
|
||||||
elif self.path_generator == "url": path = slugify(item.get_url())
|
elif self.path_generator == "url": path = slugify(url)
|
||||||
elif self.path_generator == "random":
|
elif self.path_generator == "random":
|
||||||
path = item.get("random_path", str(uuid.uuid4())[:16], True)
|
path = ArchivingContext.get("random_path", str(uuid.uuid4())[:16], True)
|
||||||
|
|
||||||
# filename_generator logic
|
# filename_generator logic
|
||||||
if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
|
if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
|
||||||
elif self.filename_generator == "static":
|
elif self.filename_generator == "static":
|
||||||
with open(media.filename, "rb") as f:
|
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
|
||||||
bytes = f.read() # read entire file as bytes
|
hd = he.calculate_hash(media.filename)
|
||||||
filename = hashlib.sha256(bytes).hexdigest()[:24]
|
filename = hd[:24]
|
||||||
|
|
||||||
media.key = os.path.join(folder, path, f"{filename}{ext}")
|
media.key = os.path.join(folder, path, f"{filename}{ext}")
|
||||||
|
|||||||
@@ -30,11 +30,9 @@ class Gsheets(Step):
|
|||||||
'archive': 'archive location',
|
'archive': 'archive location',
|
||||||
'date': 'archive date',
|
'date': 'archive date',
|
||||||
'thumbnail': 'thumbnail',
|
'thumbnail': 'thumbnail',
|
||||||
'thumbnail_index': 'thumbnail index',
|
|
||||||
'timestamp': 'upload timestamp',
|
'timestamp': 'upload timestamp',
|
||||||
'title': 'upload title',
|
'title': 'upload title',
|
||||||
'text': 'text content',
|
'text': 'text content',
|
||||||
'duration': 'duration',
|
|
||||||
'screenshot': 'screenshot',
|
'screenshot': 'screenshot',
|
||||||
'hash': 'hash',
|
'hash': 'hash',
|
||||||
'wacz': 'wacz',
|
'wacz': 'wacz',
|
||||||
|
|||||||
@@ -15,10 +15,8 @@ class GWorksheet:
|
|||||||
'archive': 'archive location',
|
'archive': 'archive location',
|
||||||
'date': 'archive date',
|
'date': 'archive date',
|
||||||
'thumbnail': 'thumbnail',
|
'thumbnail': 'thumbnail',
|
||||||
'thumbnail_index': 'thumbnail index',
|
|
||||||
'timestamp': 'upload timestamp',
|
'timestamp': 'upload timestamp',
|
||||||
'title': 'upload title',
|
'title': 'upload title',
|
||||||
'duration': 'duration',
|
|
||||||
'screenshot': 'screenshot',
|
'screenshot': 'screenshot',
|
||||||
'hash': 'hash',
|
'hash': 'hash',
|
||||||
'wacz': 'wacz',
|
'wacz': 'wacz',
|
||||||
@@ -40,11 +38,11 @@ class GWorksheet:
|
|||||||
|
|
||||||
def _col_index(self, col: str):
|
def _col_index(self, col: str):
|
||||||
self._check_col_exists(col)
|
self._check_col_exists(col)
|
||||||
return self.headers.index(self.columns[col])
|
return self.headers.index(self.columns[col].lower())
|
||||||
|
|
||||||
def col_exists(self, col: str):
|
def col_exists(self, col: str):
|
||||||
self._check_col_exists(col)
|
self._check_col_exists(col)
|
||||||
return self.columns[col] in self.headers
|
return self.columns[col].lower() in self.headers
|
||||||
|
|
||||||
def count_rows(self):
|
def count_rows(self):
|
||||||
return len(self.values)
|
return len(self.values)
|
||||||
@@ -98,7 +96,7 @@ class GWorksheet:
|
|||||||
cell_updates = [
|
cell_updates = [
|
||||||
{
|
{
|
||||||
'range': self.to_a1(row, col),
|
'range': self.to_a1(row, col),
|
||||||
'values': [[val]]
|
'values': [[str(val)[0:49999]]]
|
||||||
}
|
}
|
||||||
for row, col, val in cell_updates
|
for row, col, val in cell_updates
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -1,9 +1,9 @@
|
|||||||
|
|
||||||
_MAJOR = "0"
|
_MAJOR = "0"
|
||||||
_MINOR = "4"
|
_MINOR = "5"
|
||||||
# On main and in a nightly release the patch should be one ahead of the last
|
# On main and in a nightly release the patch should be one ahead of the last
|
||||||
# released build.
|
# released build.
|
||||||
_PATCH = "1"
|
_PATCH = "12"
|
||||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||||
_SUFFIX = ""
|
_SUFFIX = ""
|
||||||
|
|||||||
Reference in New Issue
Block a user