Compare commits
111 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b3b727b005 | ||
|
|
ee37b20e6c | ||
|
|
a184bf7b97 | ||
|
|
e535f44a88 | ||
|
|
0f28bf0e35 | ||
|
|
18a8636552 | ||
|
|
81be65c828 | ||
|
|
0a91863212 | ||
|
|
3ad8349e3f | ||
|
|
2768225cd1 | ||
|
|
3e44b9b577 | ||
|
|
1a5797d0f8 | ||
|
|
768b8fce9f | ||
|
|
613b1f1e50 | ||
|
|
919c37bfb6 | ||
|
|
a655b3c987 | ||
|
|
d645b840ee | ||
|
|
3da9c9cf8f | ||
|
|
987bbcaad0 | ||
|
|
68e9d2a2ce | ||
|
|
76be271c18 | ||
|
|
074f132ad9 | ||
|
|
c47da0a46f | ||
|
|
eb82936a04 | ||
|
|
cc03ad7c49 | ||
|
|
6d2aa3dd7a | ||
|
|
f2e580de4e | ||
|
|
3f48d75d8f | ||
|
|
80ea912d0e | ||
|
|
b7c69c0f0d | ||
|
|
c98991cdfb | ||
|
|
45b982ec38 | ||
|
|
e11be449e8 | ||
|
|
134bf09257 | ||
|
|
417ca9ef51 | ||
|
|
5b79dcb80c | ||
|
|
52d7b4a016 | ||
|
|
31f6aae7b9 | ||
|
|
26373d4545 | ||
|
|
7a34915f8e | ||
|
|
b67a7b818a | ||
|
|
2e63cb8411 | ||
|
|
9cb73c073f | ||
|
|
9d078a648f | ||
|
|
e150370657 | ||
|
|
4116c90168 | ||
|
|
2c5b115fbe | ||
|
|
bda812f850 | ||
|
|
ac82764ffc | ||
|
|
0fae7d96fb | ||
|
|
2f7181ced6 | ||
|
|
9c25b33f1c | ||
|
|
ae3e607705 | ||
|
|
c1a60fde8a | ||
|
|
875e1de589 | ||
|
|
8f3d4e05c3 | ||
|
|
3bd6bed825 | ||
|
|
2659675f06 | ||
|
|
9d44f4b207 | ||
|
|
5b0bff612e | ||
|
|
ae7ceba0e5 | ||
|
|
97821a81bc | ||
|
|
9191b38cf2 | ||
|
|
567edfc35e | ||
|
|
8c22a9df72 | ||
|
|
d2d6db162b | ||
|
|
5cfbcc0137 | ||
|
|
5fdaa6c739 | ||
|
|
3d389ee05b | ||
|
|
0ecbed0df0 | ||
|
|
69bcfea2eb | ||
|
|
2e2e695444 | ||
|
|
493055a8d9 | ||
|
|
6f6eb2db7a | ||
|
|
906ed0f6e0 | ||
|
|
39818e648a | ||
|
|
2bbf534d67 | ||
|
|
6be7536fad | ||
|
|
0654e8c5c6 | ||
|
|
0e3c427371 | ||
|
|
7497bc08c0 | ||
|
|
49863768fe | ||
|
|
7b9483bbf9 | ||
|
|
cd81cae559 | ||
|
|
23894fad51 | ||
|
|
876988b587 | ||
|
|
f95293b84b | ||
|
|
2fbcbe4e8b | ||
|
|
d1e4574c6c | ||
|
|
d347b26d37 | ||
|
|
1970fa3c82 | ||
|
|
aa5430451e | ||
|
|
f35875a94c | ||
|
|
5505255ea3 | ||
|
|
da17b3f68a | ||
|
|
d6dbdec6ac | ||
|
|
224ebe7ee8 | ||
|
|
54a1bc2172 | ||
|
|
77948207d1 | ||
|
|
60552ae0ea | ||
|
|
f255271ecb | ||
|
|
db45e0980e | ||
|
|
2a7ece5dcc | ||
|
|
d14adf0242 | ||
|
|
75459d2880 | ||
|
|
94406bda7a | ||
|
|
6244f35cff | ||
|
|
adb3a7332f | ||
|
|
0d903fa196 | ||
|
|
e5f3e56968 | ||
|
|
57e7023f64 |
57
.github/workflows/docker-publish.yaml
vendored
Normal file
@@ -0,0 +1,57 @@
|
||||
name: Docker
|
||||
|
||||
# This workflow uses actions that are not certified by GitHub.
|
||||
# They are provided by a third-party and are governed by
|
||||
# separate terms of service, privacy policy, and support
|
||||
# documentation.
|
||||
|
||||
on:
|
||||
release:
|
||||
types: [published]
|
||||
push:
|
||||
# branches: [ "main" ]
|
||||
tags: [ "v*.*.*" ]
|
||||
|
||||
env:
|
||||
# Use docker.io for Docker Hub if empty
|
||||
REGISTRY: ghcr.io
|
||||
# github.repository as <account>/<repo>
|
||||
IMAGE_NAME: ${{ github.repository }}
|
||||
|
||||
|
||||
jobs:
|
||||
push_to_registry:
|
||||
name: Push Docker image to Docker Hub
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v1
|
||||
# https://github.com/docker/setup-buildx-action
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
id: buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_PASSWORD }}
|
||||
|
||||
- name: Extract metadata (tags, labels) for Docker
|
||||
id: meta
|
||||
uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38
|
||||
with:
|
||||
images: bellingcat/auto-archiver
|
||||
|
||||
- name: Build and push Docker image
|
||||
uses: docker/build-push-action@v2
|
||||
with:
|
||||
context: .
|
||||
platforms: linux/amd64,linux/arm64
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
13
.github/workflows/python-publish.yaml
vendored
@@ -6,24 +6,21 @@
|
||||
# separate terms of service, privacy policy, and support
|
||||
# documentation.
|
||||
|
||||
name: Upload Python Package
|
||||
name: Pypi
|
||||
|
||||
on:
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
push:
|
||||
branches:
|
||||
- dockerize
|
||||
tags:
|
||||
- 'v*.*.*'
|
||||
# branches: [ "main" ]
|
||||
tags: [ "v*.*.*" ]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
deploy:
|
||||
|
||||
name: Publish python package
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
@@ -50,5 +47,7 @@ jobs:
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
user: __token__
|
||||
verbose: true
|
||||
skip_existing: true
|
||||
password: ${{ secrets.PYPI_API_TOKEN }}
|
||||
packages_dir: dist/
|
||||
33
Dockerfile
@@ -1,35 +1,36 @@
|
||||
# stage 1 - all dependencies
|
||||
From python:3.10
|
||||
FROM webrecorder/browsertrix-crawler:latest
|
||||
|
||||
ENV RUNNING_IN_DOCKER=1
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# TODO: use custom ffmpeg builds instead of apt-get install
|
||||
RUN pip install --upgrade pip && \
|
||||
pip install pipenv && \
|
||||
add-apt-repository ppa:mozillateam/ppa && \
|
||||
apt-get update && \
|
||||
apt-get install -y gcc ffmpeg fonts-noto firefox-esr && \
|
||||
wget https://github.com/mozilla/geckodriver/releases/download/v0.32.0/geckodriver-v0.32.0-linux64.tar.gz && \
|
||||
apt-get install -y gcc ffmpeg fonts-noto && \
|
||||
apt-get install -y --no-install-recommends firefox-esr && \
|
||||
ln -s /usr/bin/firefox-esr /usr/bin/firefox && \
|
||||
wget https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-linux64.tar.gz && \
|
||||
tar -xvzf geckodriver* -C /usr/local/bin && \
|
||||
chmod +x /usr/local/bin/geckodriver && \
|
||||
rm geckodriver-v*
|
||||
rm geckodriver-v*
|
||||
|
||||
|
||||
# install docker for WACZ
|
||||
# TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66
|
||||
# RUN curl -fsSL https://get.docker.com | sh
|
||||
|
||||
# TODO: avoid copying unnecessary files, including .git
|
||||
COPY Pipfile Pipfile.lock ./
|
||||
RUN pipenv install --python=3.10 --system --deploy
|
||||
ENV IS_DOCKER=1
|
||||
COPY Pipfile* ./
|
||||
RUN pipenv install
|
||||
|
||||
# doing this at the end helps during development, builds are quick
|
||||
COPY ./src/ .
|
||||
|
||||
# TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
|
||||
# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
|
||||
# USER archiver
|
||||
ENTRYPOINT ["python"]
|
||||
# ENTRYPOINT ["docker-entrypoint.sh"]
|
||||
|
||||
# should be executed with 2 volumes (3 if local_storage)
|
||||
# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --help
|
||||
|
||||
ENTRYPOINT ["pipenv", "run", "python3", "-m", "auto_archiver"]
|
||||
|
||||
# should be executed with 2 volumes (3 if local_storage is used)
|
||||
# docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml
|
||||
9
Pipfile
@@ -14,7 +14,6 @@ loguru = "*"
|
||||
ffmpeg-python = "*"
|
||||
selenium = "*"
|
||||
snscrape = "*"
|
||||
yt-dlp = "*"
|
||||
telethon = "*"
|
||||
google-api-python-client = "*"
|
||||
google-auth-httplib2 = "*"
|
||||
@@ -23,16 +22,20 @@ oauth2client = "*"
|
||||
python-slugify = "*"
|
||||
pyyaml = "*"
|
||||
dateparser = "*"
|
||||
vk-url-scraper = "*"
|
||||
python-twitter-v2 = "*"
|
||||
instaloader = "*"
|
||||
tqdm = "*"
|
||||
jinja2 = "*"
|
||||
cryptography = "==38.0.4"
|
||||
dataclasses-json = "*"
|
||||
yt-dlp = ">=2023.2.17"
|
||||
vk-url-scraper = "*"
|
||||
uwsgi = "*"
|
||||
requests = {extras = ["socks"], version = "*"}
|
||||
# wacz = "==0.4.8"
|
||||
|
||||
[requires]
|
||||
python_version = "3.9"
|
||||
python_version = "3.10"
|
||||
|
||||
[dev-packages]
|
||||
autopep8 = "*"
|
||||
|
||||
1223
Pipfile.lock
generated
242
README.md
@@ -1,11 +1,243 @@
|
||||
# Auto Archiver
|
||||
<h1 align="center">Auto Archiver</h1>
|
||||
|
||||
[](https://badge.fury.io/py/auto-archiver)
|
||||
[](https://hub.docker.com/r/bellingcat/auto-archiver)
|
||||
<!--  -->
|
||||
<!-- [](https://pypi.python.org/pypi/auto-archiver/) -->
|
||||
<!-- [](https://vk-url-scraper.readthedocs.io/en/latest/?badge=latest) -->
|
||||
|
||||
|
||||
Read the [article about Auto Archiver on bellingcat.com](https://www.bellingcat.com/resources/2022/09/22/preserve-vital-online-content-with-bellingcats-auto-archiver-tool/).
|
||||
|
||||
|
||||
Python tool to automatically archive social media posts, videos, and images from a Google Sheets, the console, and more. Uses different archivers depending on the platform, and can save content to local storage, S3 bucket (Digital Ocean Spaces, AWS, ...), and Google Drive. If using Google Sheets as the source for links, it will be updated with information about the archived content. It can be run manually or on an automated basis.
|
||||
|
||||
There are 3 ways to use the auto-archiver
|
||||
1. (simplest) via docker `docker ... TODO`
|
||||
2. (pypi) `pip install auto-archiver`
|
||||
3. (legacy) clone and manually install from repo (see legacy [tutorial video](https://youtu.be/VfAhcuV2tLQ))
|
||||
There are 3 ways to use the auto-archiver:
|
||||
1. (easiest installation) via docker
|
||||
2. (local python install) `pip install auto-archiver`
|
||||
3. (legacy/development) clone and manually install from repo (see legacy [tutorial video](https://youtu.be/VfAhcuV2tLQ))
|
||||
|
||||
But **you always need a configuration/orchestration file**, which is where you'll configure where/what/how to archive. Make sure you read [orchestration](#orchestration).
|
||||
|
||||
|
||||
## How to install and run the auto-archiver
|
||||
|
||||
### Option 1 - docker
|
||||
|
||||
[](https://hub.docker.com/r/bellingcat/auto-archiver)
|
||||
|
||||
Docker works like a virtual machine running inside your computer, it isolates everything and makes installation simple. Since it is an isolated environment when you need to pass it your orchestration file or get downloaded media out of docker you will need to connect folders on your machine with folders inside docker with the `-v` volume flag.
|
||||
|
||||
|
||||
1. install [docker](https://docs.docker.com/get-docker/)
|
||||
2. pull the auto-archiver docker [image](https://hub.docker.com/r/bellingcat/auto-archiver) with `docker pull bellingcat/auto-archiver`
|
||||
3. run the docker image locally in a container: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml` breaking this command down:
|
||||
1. `docker run` tells docker to start a new container (an instance of the image)
|
||||
2. `--rm` makes sure this container is removed after execution (less garbage locally)
|
||||
3. `-v $PWD/secrets:/app/secrets` - your secrets folder
|
||||
1. `-v` is a volume flag which means a folder that you have on your computer will be connected to a folder inside the docker container
|
||||
2. `$PWD/secrets` points to a `secrets/` folder in your current working directory (where your console points to), we use this folder as a best practice to hold all the secrets/tokens/passwords/... you use
|
||||
3. `/app/secrets` points to the path the docker container where this image can be found
|
||||
4. `-v $PWD/local_archive:/app/local_archive` - (optional) if you use local_storage
|
||||
1. `-v` same as above, this is a volume instruction
|
||||
2. `$PWD/local_archive` is a folder `local_archive/` in case you want to archive locally and have the files accessible outside docker
|
||||
3. `/app/local_archive` is a folder inside docker that you can reference in your orchestration.yml file
|
||||
|
||||
### Option 2 - python package
|
||||
|
||||
<details><summary><code>Python package instructions</code></summary>
|
||||
|
||||
1. make sure you have python 3.8 or higher installed
|
||||
2. install the package `pip/pipenv/conda install auto-archiver`
|
||||
3. test it's installed with `auto-archiver --help`
|
||||
4. run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml` if your orchestration file is inside a `secrets/`, which we advise
|
||||
|
||||
You will also need [ffmpeg](https://www.ffmpeg.org/), [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases), and optionally [fonts-noto](https://fonts.google.com/noto). Similar to the local installation.
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
### Option 3 - local installation
|
||||
This can also be used for development.
|
||||
|
||||
<details><summary><code>Legacy instructions, only use if docker/package is not an option</code></summary>
|
||||
|
||||
|
||||
Install the following locally:
|
||||
1. [ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work.
|
||||
2. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`.
|
||||
3. (optional) [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
|
||||
|
||||
Clone and run:
|
||||
1. `git clone https://github.com/bellingcat/auto-archiver`
|
||||
2. `pipenv install`
|
||||
3. `pipenv run python -m src.auto_archiver --config secrets/orchestration.yaml`
|
||||
|
||||
|
||||
</details><br/>
|
||||
|
||||
# Orchestration
|
||||
The archiver work is orchestrated by the following workflow (we call each a **step**):
|
||||
1. **Feeder** gets the links (from a spreadsheet, from the console, ...)
|
||||
2. **Archiver** tries to archive the link (twitter, youtube, ...)
|
||||
3. **Enricher** adds more info to the content (hashes, thumbnails, ...)
|
||||
4. **Formatter** creates a report from all the archived content (HTML, PDF, ...)
|
||||
5. **Database** knows what's been archived and also stores the archive result (spreadsheet, CSV, or just the console)
|
||||
|
||||
To setup an auto-archiver instance create an `orchestration.yaml` which contains the workflow you would like. We advise you put this file into a `secrets/` folder and do not share it with others because it will contain passwords and other secrets.
|
||||
|
||||
The structure of orchestration file is split into 2 parts: `steps` (what **steps** to use) and `configurations` (how those steps should behave), here's a simplification:
|
||||
```yaml
|
||||
# orchestration.yaml content
|
||||
steps:
|
||||
feeder: gsheet_feeder
|
||||
archivers: # order matters
|
||||
- youtubedl_archiver
|
||||
enrichers:
|
||||
- thumbnail_enricher
|
||||
formatter: html_formatter
|
||||
storages:
|
||||
- local_storage
|
||||
databases:
|
||||
- gsheet_db
|
||||
|
||||
configurations:
|
||||
gsheet_feeder:
|
||||
sheet: "your google sheet name"
|
||||
header: 2 # row with header for your sheet
|
||||
# ... configurations for the other steps here ...
|
||||
```
|
||||
|
||||
To see all available `steps` (which archivers, storages, databses, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
|
||||
|
||||
All the `configurations` in the `orchestration.yaml` file (you can name it differently but need to pass it in the `--config FILENAME` argument) can be seen in the console by using the `--help` flag. They can also be overwritten, for example if you are using the `cli_feeder` to archive from the command line and want to provide the URLs you should do:
|
||||
|
||||
```bash
|
||||
auto-archiver --config secrets/orchestration.yaml --cli_feeder.urls="url1,url2,url3"
|
||||
```
|
||||
|
||||
Here's the complete workflow that the auto-archiver goes through:
|
||||
```mermaid
|
||||
graph TD
|
||||
s((start)) --> F(fa:fa-table Feeder)
|
||||
F -->|get and clean URL| D1{fa:fa-database Database}
|
||||
D1 -->|is already archived| e((end))
|
||||
D1 -->|not yet archived| a(fa:fa-download Archivers)
|
||||
a -->|got media| E(fa:fa-chart-line Enrichers)
|
||||
E --> S[fa:fa-box-archive Storages]
|
||||
E --> Fo(fa:fa-code Formatter)
|
||||
Fo --> S
|
||||
Fo -->|update database| D2(fa:fa-database Database)
|
||||
D2 --> e
|
||||
```
|
||||
|
||||
## Orchestration checklist
|
||||
Use this to make sure you help making sure you did all the required steps:
|
||||
* [ ] you have a `/secrets` folder with all your configuration files including
|
||||
* [ ] a orchestration file eg: `orchestration.yaml` pointing to the correct location of other files
|
||||
* [ ] (optional if you use GoogleSheets) you have a `service_account.json` (see [how-to](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account))
|
||||
* [ ] (optional for telegram) a `anon.session` which appears after the 1st run where you login to telegram
|
||||
* if you use private channels you need to add `channel_invites` and set `join_channels=true` at least once
|
||||
* [ ] (optional for VK) a `vk_config.v2.json`
|
||||
* [ ] (optional for using GoogleDrive storage) `gd-token.json` (see [help script](scripts/create_update_gdrive_oauth_token.py))
|
||||
* [ ] (optional for instagram) `instaloader.session` file which appears after the 1st run and login in instagram
|
||||
* [ ] (optional for browsertrix) `profile.tar.gz` file
|
||||
|
||||
#### Example invocations
|
||||
The recommended way to run the auto-archiver is through Docker. The invocations below will run the auto-archiver Docker image using a configuration file that you have specified
|
||||
|
||||
```bash
|
||||
# all the configurations come from ./secrets/orchestration.yaml
|
||||
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml
|
||||
# uses the same configurations but for another google docs sheet
|
||||
# with a header on row 2 and with some different column names
|
||||
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
|
||||
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
|
||||
# all the configurations come from orchestration.yaml and specifies that s3 files should be private
|
||||
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml --s3_storage.private=1
|
||||
```
|
||||
|
||||
The auto-archiver can also be run locally, if pre-requisites are correctly configured. Equivalent invocations are below.
|
||||
|
||||
```bash
|
||||
# all the configurations come from ./secrets/orchestration.yaml
|
||||
auto-archiver --config secrets/orchestration.yaml
|
||||
# uses the same configurations but for another google docs sheet
|
||||
# with a header on row 2 and with some different column names
|
||||
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
|
||||
auto-archiver --config secrets/orchestration.yaml --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
|
||||
# all the configurations come from orchestration.yaml and specifies that s3 files should be private
|
||||
auto-archiver --config secrets/orchestration.yaml --s3_storage.private=1
|
||||
```
|
||||
|
||||
### Extra notes on configuration
|
||||
#### Google Drive
|
||||
To use Google Drive storage you need the id of the shared folder in the `config.yaml` file which must be shared with the service account eg `autoarchiverservice@auto-archiver-111111.iam.gserviceaccount.com` and then you can use `--storage=gd`
|
||||
|
||||
#### Telethon + Instagram with telegram bot
|
||||
The first time you run, you will be prompted to do a authentication with the phone number associated, alternatively you can put your `anon.session` in the root.
|
||||
|
||||
|
||||
## Running on Google Sheets Feeder (gsheet_feeder)
|
||||
The `--gseets_feeder.sheet` property is the name of the Google Sheet to check for URLs.
|
||||
This sheet must have been shared with the Google Service account used by `gspread`.
|
||||
This sheet must also have specific columns (case-insensitive) in the `header` as specified in [Gsheet.configs](src/auto_archiver/utils/gsheet.py). The default names of these columns and their purpose is:
|
||||
|
||||
Inputs:
|
||||
|
||||
* **Link** *(required)*: the URL of the post to archive
|
||||
* **Destination folder**: custom folder for archived file (regardless of storage)
|
||||
|
||||
Outputs:
|
||||
* **Archive status** *(required)*: Status of archive operation
|
||||
* **Archive location**: URL of archived post
|
||||
* **Archive date**: Date archived
|
||||
* **Thumbnail**: Embeds a thumbnail for the post in the spreadsheet
|
||||
* **Timestamp**: Timestamp of original post
|
||||
* **Title**: Post title
|
||||
* **Text**: Post text
|
||||
* **Screenshot**: Link to screenshot of post
|
||||
* **Hash**: Hash of archived HTML file (which contains hashes of post media)
|
||||
* **WACZ**: Link to a WACZ web archive of post
|
||||
* **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive
|
||||
|
||||
For example, this is a spreadsheet configured with all of the columns for the auto archiver and a few URLs to archive. (Note that the column names are not case sensitive.)
|
||||
|
||||

|
||||
|
||||
Now the auto archiver can be invoked, with this command in this example: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver:dockerize --config secrets/orchestration-global.yaml --gsheet_feeder.sheet "Auto archive test 2023-2"`. Note that the sheet name has been overridden/specified in the command line invocation.
|
||||
|
||||
When the auto archiver starts running, it updates the "Archive status" column.
|
||||
|
||||

|
||||
|
||||
The links are downloaded and archived, and the spreadsheet is updated to the following:
|
||||
|
||||

|
||||
|
||||
Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.
|
||||
|
||||
The "archive location" link contains the path of the archived file, in local storage, S3, or in Google Drive.
|
||||
|
||||

|
||||
|
||||
---
|
||||
## Development
|
||||
Use `python -m src.auto_archiver --config secrets/orchestration.yaml` to run from the local development environment.
|
||||
|
||||
#### Docker development
|
||||
working with docker locally:
|
||||
* `docker build . -t auto-archiver` to build a local image
|
||||
* `docker run --rm -v $PWD/secrets:/app/secrets auto-archiver pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml`
|
||||
* to use local archive, also create a volume `-v` for it by adding `-v $PWD/local_archive:/app/local_archive`
|
||||
|
||||
|
||||
release to docker hub
|
||||
* `docker image tag auto-archiver bellingcat/auto-archiver:latest`
|
||||
* `docker push bellingcat/auto-archiver`
|
||||
|
||||
#### RELEASE
|
||||
* update version in [version.py](src/auto_archiver/version.py)
|
||||
* run `bash ./scripts/release.sh` and confirm
|
||||
* package is automatically updated in pypi
|
||||
* docker image is automatically pushed to dockerhup
|
||||
|
||||
|
Before Width: | Height: | Size: 183 KiB |
|
Before Width: | Height: | Size: 486 KiB After Width: | Height: | Size: 1.5 MiB |
BIN
docs/demo-archive.png
Normal file
|
After Width: | Height: | Size: 819 KiB |
|
Before Width: | Height: | Size: 223 KiB After Width: | Height: | Size: 664 KiB |
|
Before Width: | Height: | Size: 241 KiB After Width: | Height: | Size: 698 KiB |
@@ -1,143 +0,0 @@
|
||||
---
|
||||
secrets:
|
||||
# needed if you use storage=s3
|
||||
s3:
|
||||
# contains S3 info on region, bucket, key and secret
|
||||
region: reg1
|
||||
bucket: my-bucket
|
||||
key: "s3 API key"
|
||||
secret: "s3 API secret"
|
||||
# use region format like such
|
||||
endpoint_url: "https://{region}.digitaloceanspaces.com"
|
||||
# endpoint_url: "https://s3.{region}.amazonaws.com"
|
||||
#use bucket, region, and key (key is the archived file path generated when executing) format like such as:
|
||||
cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
|
||||
# if private:true S3 urls will not be readable online
|
||||
private: false
|
||||
# with 'random' you can generate a random UUID for the URL instead of a predictable path, useful to still have public but unlisted files, alternative is 'default' or not omitted from config
|
||||
key_path: random
|
||||
|
||||
# needed if you use storage=gd
|
||||
google_drive:
|
||||
# To authenticate with google you have two options (1. service account OR 2. OAuth token)
|
||||
|
||||
# 1. service account - storage space will count towards the developer account
|
||||
# filename can be the same or different file from google_sheets.service_account, defaults to "service_account.json"
|
||||
# service_account: "service_account.json"
|
||||
|
||||
# 2. OAuth token - storage space will count towards the owner of the GDrive folder
|
||||
# (only 1. or 2. - if both specified then this 2. takes precedence)
|
||||
# needs write access on the server so refresh flow works
|
||||
# To get the token, run the file `create_update_test_oauth_token.py`
|
||||
# you can edit that file if you want a different token filename, default is "gd-token.json"
|
||||
oauth_token_filename: "gd-token.json"
|
||||
|
||||
root_folder_id: copy XXXX from https://drive.google.com/drive/folders/XXXX
|
||||
|
||||
# needed if you use storage=local
|
||||
local:
|
||||
# local path to save files in
|
||||
save_to: "./local_archive"
|
||||
|
||||
wayback:
|
||||
# to get credentials visit https://archive.org/account/s3.php
|
||||
key: your API key
|
||||
secret: your API secret
|
||||
|
||||
telegram:
|
||||
# to get credentials see: https://telegra.ph/How-to-get-Telegram-APP-ID--API-HASH-05-27
|
||||
api_id: your API key, see
|
||||
api_hash: your API hash
|
||||
# optional, but allows access to more content such as large videos, talk to @botfather
|
||||
bot_token: your bot-token
|
||||
# optional, defaults to ./anon, records the telegram login session for future usage
|
||||
session_file: "secrets/anon"
|
||||
|
||||
# twitter configuration - API V2 only
|
||||
# if you don't provide credentials the less-effective unofficial TwitterArchiver will be used instead
|
||||
twitter:
|
||||
# either bearer_token only
|
||||
bearer_token: ""
|
||||
# OR all of the below
|
||||
consumer_key: ""
|
||||
consumer_secret: ""
|
||||
access_token: ""
|
||||
access_secret: ""
|
||||
|
||||
# vkontakte (vk.com) credentials
|
||||
vk:
|
||||
username: "phone number or email"
|
||||
password: "password"
|
||||
# optional, defaults to ./vk_config.v2.json, records VK login session for future usage
|
||||
session_file: "secrets/vk_config.v2.json"
|
||||
|
||||
# instagram credentials
|
||||
instagram:
|
||||
username: "username"
|
||||
password: "password"
|
||||
session_file: "instaloader.session" # <- default value
|
||||
|
||||
google_sheets:
|
||||
# local filename: defaults to service_account.json, see https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account
|
||||
service_account: "service_account.json"
|
||||
|
||||
facebook:
|
||||
# optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'
|
||||
cookie: ""
|
||||
execution:
|
||||
# can be overwritten with CMD --sheet=
|
||||
sheet: your-sheet-name
|
||||
|
||||
# block or allow worksheets by name, instead of defaulting to checking all worksheets in a Spreadsheet
|
||||
# worksheet_allow and worksheet_block can be single values or lists
|
||||
# if worksheet_allow is specified, worksheet_block is ignored
|
||||
# worksheet_allow:
|
||||
# - Sheet1
|
||||
# - "Sheet 2"
|
||||
# worksheet_block: BlockedSheet
|
||||
|
||||
# which row of your tabs contains the header, can be overwritten with CMD --header=
|
||||
header: 1
|
||||
# which storage to use, can be overwritten with CMD --storage=
|
||||
storage: s3
|
||||
# defaults to false, when true will try to avoid duplicate URL archives
|
||||
check_if_exists: true
|
||||
|
||||
# choose a hash algorithm (either SHA-256 or SHA3-512, defaults to SHA-256)
|
||||
# hash_algorithm: SHA-256
|
||||
|
||||
# optional configurations for the selenium browser that takes screenshots, these are the defaults
|
||||
selenium:
|
||||
# values under 10s might mean screenshots fail to grab screenshot
|
||||
timeout_seconds: 120
|
||||
window_width: 1400
|
||||
window_height: 2000
|
||||
|
||||
# optional browsertrix configuration (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
|
||||
# browsertrix will capture a WACZ archive of the page which can then be seen as the original on replaywebpage
|
||||
browsertrix:
|
||||
enabled: true # defaults to false
|
||||
profile: "./browsertrix/crawls/profile.tar.gz"
|
||||
timeout_seconds: 120 # defaults to 90s
|
||||
# puts execution logs into /logs folder, defaults to false
|
||||
save_logs: true
|
||||
# custom column names, only needed if different from default, can be overwritten with CMD --col-NAME="VALUE"
|
||||
# url and status are the only columns required to be present in the google sheet
|
||||
column_names:
|
||||
url: link
|
||||
status: archive status
|
||||
archive: archive location
|
||||
# use this column to override default location data
|
||||
folder: folder
|
||||
date: archive date
|
||||
thumbnail: thumbnail
|
||||
thumbnail_index: thumbnail index
|
||||
timestamp: upload timestamp
|
||||
title: upload title
|
||||
duration: duration
|
||||
screenshot: screenshot
|
||||
hash: hash
|
||||
wacz: wacz
|
||||
# if you want the replaypage to work, make sure to allow CORS on your bucket, see https://replayweb.page/docs/embedding#cors-restrictions
|
||||
replaywebpage: replaywebpage
|
||||
|
||||
121
example.orchestration.yaml
Normal file
@@ -0,0 +1,121 @@
|
||||
steps:
|
||||
# only 1 feeder allowed
|
||||
feeder: gsheet_feeder # defaults to cli_feeder
|
||||
archivers: # order matters, uncomment to activate
|
||||
# - vk_archiver
|
||||
# - telethon_archiver
|
||||
# - telegram_archiver
|
||||
# - twitter_archiver
|
||||
# - twitter_api_archiver
|
||||
# - instagram_tbot_archiver
|
||||
# - instagram_archiver
|
||||
# - tiktok_archiver
|
||||
- youtubedl_archiver
|
||||
- wayback_archiver_enricher
|
||||
enrichers:
|
||||
- hash_enricher
|
||||
# - screenshot_enricher
|
||||
# - thumbnail_enricher
|
||||
# - wayback_archiver_enricher
|
||||
# - wacz_enricher
|
||||
|
||||
formatter: html_formatter # defaults to mute_formatter
|
||||
storages:
|
||||
- local_storage
|
||||
# - s3_storage
|
||||
# - gdrive_storage
|
||||
databases:
|
||||
- console_db
|
||||
# - csv_db
|
||||
# - gsheet_db
|
||||
# - mongo_db
|
||||
|
||||
configurations:
|
||||
gsheet_feeder:
|
||||
sheet: "your sheet name"
|
||||
header: 1
|
||||
service_account: "secrets/service_account.json"
|
||||
# allow_worksheets: "only parse this worksheet"
|
||||
# block_worksheets: "blocked sheet 1,blocked sheet 2"
|
||||
use_sheet_names_in_stored_paths: false
|
||||
columns:
|
||||
url: link
|
||||
status: archive status
|
||||
folder: destination folder
|
||||
archive: archive location
|
||||
date: archive date
|
||||
thumbnail: thumbnail
|
||||
timestamp: upload timestamp
|
||||
title: upload title
|
||||
text: textual content
|
||||
screenshot: screenshot
|
||||
hash: hash
|
||||
wacz: wacz
|
||||
replaywebpage: replaywebpage
|
||||
instagram_tbot_archiver:
|
||||
api_id: "TELEGRAM_BOT_API_ID"
|
||||
api_hash: "TELEGRAM_BOT_API_HASH"
|
||||
# session_file: "secrets/anon"
|
||||
telethon_archiver:
|
||||
api_id: "TELEGRAM_BOT_API_ID"
|
||||
api_hash: "TELEGRAM_BOT_API_HASH"
|
||||
# session_file: "secrets/anon"
|
||||
join_channels: false
|
||||
channel_invites: # if you want to archive from private channels
|
||||
- invite: https://t.me/+123456789
|
||||
id: 0000000001
|
||||
- invite: https://t.me/+123456788
|
||||
id: 0000000002
|
||||
|
||||
twitter_api_archiver:
|
||||
# either bearer_token only
|
||||
bearer_token: "TWITTER_BEARER_TOKEN"
|
||||
# OR all of the below
|
||||
# consumer_key: ""
|
||||
# consumer_secret: ""
|
||||
# access_token: ""
|
||||
# access_secret: ""
|
||||
instagram_archiver:
|
||||
username: "INSTAGRAM_USERNAME"
|
||||
password: "INSTAGRAM_PASSWORD"
|
||||
# session_file: "secrets/instaloader.session"
|
||||
|
||||
vk_archiver:
|
||||
username: "or phone number"
|
||||
password: "vk pass"
|
||||
session_file: "secrets/vk_config.v2.json"
|
||||
|
||||
screenshot_enricher:
|
||||
width: 1280
|
||||
height: 2300
|
||||
wayback_archiver_enricher:
|
||||
timeout: 10
|
||||
key: "wayback key"
|
||||
secret: "wayback secret"
|
||||
hash_enricher:
|
||||
algorithm: "SHA3-512" # can also be SHA-256
|
||||
wacz_enricher:
|
||||
profile: secrets/profile.tar.gz
|
||||
local_storage:
|
||||
save_to: "./local_archive"
|
||||
save_absolute: true
|
||||
filename_generator: static
|
||||
path_generator: flat
|
||||
s3_storage:
|
||||
bucket: your-bucket-name
|
||||
region: reg1
|
||||
key: S3_KEY
|
||||
secret: S3_SECRET
|
||||
endpoint_url: "https://{region}.digitaloceanspaces.com"
|
||||
cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
|
||||
# if private:true S3 urls will not be readable online
|
||||
private: false
|
||||
# with 'random' you can generate a random UUID for the URL instead of a predictable path, useful to still have public but unlisted files, alternative is 'default' or not omitted from config
|
||||
key_path: random
|
||||
|
||||
gdrive_storage:
|
||||
path_generator: url
|
||||
filename_generator: random
|
||||
root_folder_id: folder_id_from_url
|
||||
oauth_token: secrets/gd-token.json # needs to be generated with scripts/create_update_gdrive_oauth_token.py
|
||||
service_account: "secrets/service_account.json"
|
||||
@@ -1,82 +0,0 @@
|
||||
steps:
|
||||
# only 1 feeder allowed
|
||||
# a feeder could be in an "infinite loop" for example: gsheets_infinite feeder which holds-> this could be an easy logic addiction by modifying for each to while not feeder.done() if it becomes necessary
|
||||
feeder: gsheet_feeder # default -> only expects URL from CLI
|
||||
archivers: # order matters
|
||||
- telethon
|
||||
# - tiktok
|
||||
# - twitter
|
||||
# - instagram
|
||||
# - webarchive # this way it runs as a failsafe only
|
||||
# enrichers:
|
||||
# - screenshot
|
||||
# - wacz
|
||||
# - webarchive # this way it runs for every case, webarchive extends archiver and enrichment
|
||||
# - thumbnails
|
||||
formatters:
|
||||
- HTMLFormater
|
||||
- PdfFormater
|
||||
storages:
|
||||
- local_storage
|
||||
- s3
|
||||
databases:
|
||||
- gsheets_db
|
||||
- mongo_db
|
||||
|
||||
|
||||
|
||||
configurations:
|
||||
global:
|
||||
- save_logs: False
|
||||
gsheet_feeder:
|
||||
sheet: my-auto-archiver
|
||||
header: 2 # defaults to 1 in GSheetsFeeder
|
||||
service_account: "secrets/service_account.json"
|
||||
# allow_worksheets: "allowed"
|
||||
# block_worksheets: "blocked1,blocked2"
|
||||
columns:
|
||||
'url': 'link'
|
||||
'status': 'archive status'
|
||||
'folder': 'destination folder'
|
||||
'archive': 'archive location'
|
||||
'date': 'archive date'
|
||||
'thumbnail': 'thumbnail'
|
||||
'thumbnail_index': 'thumbnail index'
|
||||
'timestamp': 'upload timestamp'
|
||||
'title': 'upload title'
|
||||
'duration': 'duration'
|
||||
'screenshot': 'screenshot'
|
||||
'hash': 'hash'
|
||||
'wacz': 'wacz'
|
||||
'replaywebpage': 'replaywebpage'
|
||||
telethon:
|
||||
api_id: "1234567"
|
||||
api_hash: "examplehash"
|
||||
session_file: "secrets/anon"
|
||||
channel_invites:
|
||||
- invite: https://t.me/+XXXXXXXXXXXXXX
|
||||
id: 1000000000
|
||||
- invite: https://t.me/joinchat/XXXXXXXXXXXXXX
|
||||
id: 1000000001
|
||||
|
||||
tiktok:
|
||||
api_keys:
|
||||
- username: 1
|
||||
password: 2
|
||||
- username: 3
|
||||
password: 4
|
||||
username: "abc"
|
||||
password: "123"
|
||||
token: "here"
|
||||
screenshot:
|
||||
width: 1280
|
||||
height: 4600
|
||||
wacz:
|
||||
profile: secrets/profile.tar.gz
|
||||
webarchive:
|
||||
api_key: "12345"
|
||||
s3:
|
||||
- bucket: 123
|
||||
- region: "nyc3"
|
||||
- cdn: "{region}{bucket}"
|
||||
|
||||
@@ -10,6 +10,7 @@ from googleapiclient.errors import HttpError
|
||||
# You can run this code to get a new token and verify it belongs to the correct user
|
||||
# This token will be refresh automatically by the auto-archiver
|
||||
# Code below from https://developers.google.com/drive/api/quickstart/python
|
||||
# Example invocation: py scripts/create_update_gdrive_oauth_token.py -c secrets/credentials.json -t secrets/gd-token.json
|
||||
|
||||
SCOPES = ['https://www.googleapis.com/auth/drive']
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
set -e
|
||||
|
||||
TAG=$(python -c 'from src.auto_archiver.version import VERSION; print("v" + VERSION)')
|
||||
TAG=$(python -c 'from src.auto_archiver.version import __version__; print("v" + __version__)')
|
||||
|
||||
read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt
|
||||
|
||||
|
||||
14
setup.cfg
@@ -1,10 +1,11 @@
|
||||
[metadata]
|
||||
name = auto_archiver
|
||||
version = 0.2.19
|
||||
version = attr: auto_archiver.version.__version__
|
||||
author = Bellingcat
|
||||
author_email = tech@bellingcat.com
|
||||
description = Easily archive online media content
|
||||
long_description = file: README.md
|
||||
long_description_content_type = text/markdown
|
||||
keywords = archive, oosi, osint, scraping
|
||||
license = MIT
|
||||
classifiers =
|
||||
@@ -12,21 +13,24 @@ classifiers =
|
||||
Intended Audience :: Science/Research
|
||||
License :: OSI Approved :: MIT License
|
||||
Programming Language :: Python :: 3
|
||||
project_urls =
|
||||
Source Code = https://github.com/bellingcat/auto-archiver
|
||||
Bug Tracker = https://github.com/bellingcat/auto-archiver/issues
|
||||
Bellingcat = https://www.bellingcat.com
|
||||
platforms = any
|
||||
|
||||
[options]
|
||||
setup_requires =
|
||||
setuptools-pipfile
|
||||
zip_safe = False
|
||||
include_package_data = True
|
||||
package_dir=
|
||||
=src
|
||||
packages=find:
|
||||
find_packages=true
|
||||
python_requires = >=3.8
|
||||
|
||||
# [options.package_data]
|
||||
# * = *.txt, *.rst
|
||||
# hello = *.msg
|
||||
[options.package_data]
|
||||
* = *.html
|
||||
|
||||
[options.entry_points]
|
||||
console_scripts =
|
||||
|
||||
@@ -4,4 +4,4 @@ from . import archivers, databases, enrichers, feeders, formatters, storages, ut
|
||||
from .core.orchestrator import ArchivingOrchestrator
|
||||
from .core.config import Config
|
||||
# making accessible directly
|
||||
from .core.metadata import Metadata
|
||||
from .core.metadata import Metadata
|
||||
|
||||
@@ -5,7 +5,7 @@ def main():
|
||||
config = Config()
|
||||
config.parse()
|
||||
orchestrator = ArchivingOrchestrator(config)
|
||||
orchestrator.feed()
|
||||
for r in orchestrator.feed(): pass
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -3,6 +3,7 @@ from .telethon_archiver import TelethonArchiver
|
||||
from .twitter_archiver import TwitterArchiver
|
||||
from .twitter_api_archiver import TwitterApiArchiver
|
||||
from .instagram_archiver import InstagramArchiver
|
||||
from .instagram_tbot_archiver import InstagramTbotArchiver
|
||||
from .tiktok_archiver import TiktokArchiver
|
||||
from .telegram_archiver import TelegramArchiver
|
||||
from .vk_archiver import VkArchiver
|
||||
|
||||
@@ -3,8 +3,8 @@ from abc import abstractmethod
|
||||
from dataclasses import dataclass
|
||||
import os
|
||||
import mimetypes, requests
|
||||
from ..core import Metadata
|
||||
from ..core import Step
|
||||
|
||||
from ..core import Metadata, Step, ArchivingContext
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -51,7 +51,7 @@ class Archiver(Step):
|
||||
if len(to_filename) > 64:
|
||||
to_filename = to_filename[-64:]
|
||||
if item:
|
||||
to_filename = os.path.join(item.get_tmp_dir(), to_filename)
|
||||
to_filename = os.path.join(ArchivingContext.get_tmp_dir(), to_filename)
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
||||
}
|
||||
|
||||
77
src/auto_archiver/archivers/instagram_tbot_archiver.py
Normal file
@@ -0,0 +1,77 @@
|
||||
|
||||
from telethon.sync import TelegramClient
|
||||
from loguru import logger
|
||||
import time, os
|
||||
from sqlite3 import OperationalError
|
||||
from . import Archiver
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
|
||||
|
||||
class InstagramTbotArchiver(Archiver):
|
||||
"""
|
||||
calls a telegram bot to fetch instagram posts/stories... and gets available media from it
|
||||
https://github.com/adw0rd/instagrapi
|
||||
https://t.me/instagram_load_bot
|
||||
"""
|
||||
name = "instagram_tbot_archiver"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
self.assert_valid_string("api_id")
|
||||
self.assert_valid_string("api_hash")
|
||||
self.timeout = int(self.timeout)
|
||||
try:
|
||||
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
|
||||
except OperationalError as e:
|
||||
logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_archiver. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}")
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||
"session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
||||
"timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
|
||||
}
|
||||
|
||||
def setup(self) -> None:
|
||||
logger.info(f"SETUP {self.name} checking login...")
|
||||
with self.client.start():
|
||||
logger.success(f"SETUP {self.name} login works.")
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
if not "instagram.com" in url: return False
|
||||
|
||||
result = Metadata()
|
||||
tmp_dir = ArchivingContext.get_tmp_dir()
|
||||
with self.client.start():
|
||||
chat = self.client.get_entity("instagram_load_bot")
|
||||
since_id = self.client.send_message(entity=chat, message=url).id
|
||||
|
||||
attempts = 0
|
||||
seen_media = []
|
||||
message = ""
|
||||
time.sleep(3)
|
||||
# media is added before text by the bot so it can be used as a stop-logic mechanism
|
||||
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
|
||||
attempts += 1
|
||||
time.sleep(1)
|
||||
for post in self.client.iter_messages(chat, min_id=since_id):
|
||||
since_id = max(since_id, post.id)
|
||||
if post.media and post.id not in seen_media:
|
||||
filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
|
||||
media = self.client.download_media(post.media, filename_dest)
|
||||
if media:
|
||||
result.add_media(Media(media))
|
||||
seen_media.append(post.id)
|
||||
if post.message: message += post.message
|
||||
|
||||
if "You must enter a URL to a post" in message:
|
||||
logger.debug(f"invalid link {url=} for {self.name}: {message}")
|
||||
return False
|
||||
|
||||
if message:
|
||||
result.set_content(message).set_title(message[:128])
|
||||
|
||||
return result.success("insta-via-bot")
|
||||
@@ -8,7 +8,7 @@ from tqdm import tqdm
|
||||
import re, time, json, os
|
||||
|
||||
from . import Archiver
|
||||
from ..core import Metadata, Media
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
|
||||
|
||||
class TelethonArchiver(Archiver):
|
||||
@@ -114,7 +114,7 @@ class TelethonArchiver(Archiver):
|
||||
with self.client.start():
|
||||
# with self.client.start(bot_token=self.bot_token):
|
||||
try:
|
||||
post = self.client.get_messages(chat, ids=post_id)
|
||||
post = self.client.get_messages(chat, ids=post_id)
|
||||
except ValueError as e:
|
||||
logger.error(f"Could not fetch telegram {url} possibly it's private: {e}")
|
||||
return False
|
||||
@@ -128,7 +128,7 @@ class TelethonArchiver(Archiver):
|
||||
media_posts = self._get_media_posts_in_group(chat, post)
|
||||
logger.debug(f'got {len(media_posts)=} for {url=}')
|
||||
|
||||
tmp_dir = item.get_tmp_dir()
|
||||
tmp_dir = ArchivingContext.get_tmp_dir()
|
||||
|
||||
group_id = post.grouped_id if post.grouped_id is not None else post.id
|
||||
title = post.message
|
||||
|
||||
@@ -3,7 +3,7 @@ import tiktok_downloader
|
||||
from loguru import logger
|
||||
|
||||
from . import Archiver
|
||||
from ..core import Metadata, Media
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
|
||||
|
||||
class TiktokArchiver(Archiver):
|
||||
@@ -41,7 +41,7 @@ class TiktokArchiver(Archiver):
|
||||
logger.warning(f'Other Tiktok error {error}')
|
||||
|
||||
try:
|
||||
filename = os.path.join(item.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4')
|
||||
filename = os.path.join(ArchivingContext.get_tmp_dir(), f'{str(uuid.uuid4())[0:8]}.mp4')
|
||||
tiktok_media = tiktok_downloader.snaptik(url).get_media()
|
||||
|
||||
if len(tiktok_media) <= 0:
|
||||
|
||||
@@ -37,7 +37,7 @@ class TwitterArchiver(Archiver):
|
||||
return self.link_clean_pattern.sub("\\1", url)
|
||||
|
||||
def is_rearchivable(self, url: str) -> bool:
|
||||
# Twitter posts are static
|
||||
# Twitter posts are static (for now)
|
||||
return False
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
@@ -86,7 +86,7 @@ class TwitterArchiver(Archiver):
|
||||
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
|
||||
result.add_media(media)
|
||||
|
||||
return result.success("twitter")
|
||||
return result.success("twitter-snscrape")
|
||||
|
||||
def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
|
||||
"""
|
||||
|
||||
@@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper
|
||||
|
||||
from ..utils.misc import dump_payload
|
||||
from . import Archiver
|
||||
from ..core import Metadata, Media
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
|
||||
|
||||
class VkArchiver(Archiver):
|
||||
@@ -50,7 +50,7 @@ class VkArchiver(Archiver):
|
||||
|
||||
result.set_content(dump_payload(vk_scrapes))
|
||||
|
||||
filenames = self.vks.download_media(vk_scrapes, item.get_tmp_dir())
|
||||
filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
|
||||
for filename in filenames:
|
||||
result.add_media(Media(filename))
|
||||
|
||||
|
||||
@@ -2,11 +2,11 @@ import datetime, os, yt_dlp
|
||||
from loguru import logger
|
||||
|
||||
from . import Archiver
|
||||
from ..core import Metadata, Media
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
|
||||
|
||||
class YoutubeDLArchiver(Archiver):
|
||||
name = "youtubedl_enricher"
|
||||
name = "youtubedl_archiver"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
@@ -25,7 +25,7 @@ class YoutubeDLArchiver(Archiver):
|
||||
logger.debug('Using Facebook cookie')
|
||||
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
|
||||
|
||||
ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(item.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False})
|
||||
ydl = yt_dlp.YoutubeDL({'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False})
|
||||
|
||||
try:
|
||||
# don'd download since it can be a live stream
|
||||
|
||||
@@ -1,34 +0,0 @@
|
||||
|
||||
#TODO: refactor GDriveStorage before merging to main
|
||||
# is it possible to have something like this with the new pipeline?
|
||||
|
||||
|
||||
# # import tempfile
|
||||
# import auto_archive
|
||||
# from loguru import logger
|
||||
# from configs import Config
|
||||
# from storages import Storage
|
||||
|
||||
|
||||
# def main():
|
||||
# c = Config()
|
||||
# c.parse()
|
||||
# logger.info(f'Opening document {c.sheet} to look for sheet names to archive')
|
||||
|
||||
# gc = c.gsheets_client
|
||||
# sh = gc.open(c.sheet)
|
||||
|
||||
# wks = sh.get_worksheet(0)
|
||||
# values = wks.get_all_values()
|
||||
|
||||
# with tempfile.TemporaryDirectory(dir="./") as tmpdir:
|
||||
# Storage.TMP_FOLDER = tmpdir
|
||||
# for i in range(11, len(values)):
|
||||
# c.sheet = values[i][0]
|
||||
# logger.info(f"Processing {c.sheet}")
|
||||
# auto_archive.process_sheet(c)
|
||||
# c.destroy_webdriver()
|
||||
|
||||
|
||||
# if __name__ == "__main__":
|
||||
# main()
|
||||
@@ -1,6 +1,7 @@
|
||||
from .media import Media
|
||||
from .metadata import Metadata
|
||||
from .media import Media
|
||||
from .step import Step
|
||||
from .context import ArchivingContext
|
||||
|
||||
# cannot import ArchivingOrchestrator/Config to avoid circular dep
|
||||
# from .orchestrator import ArchivingOrchestrator
|
||||
|
||||
@@ -13,6 +13,7 @@ from ..formatters import Formatter
|
||||
from ..storages import Storage
|
||||
from ..enrichers import Enricher
|
||||
from . import Step
|
||||
from ..utils import update_nested_dict
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -38,10 +39,11 @@ class Config:
|
||||
self.cli_ops = {}
|
||||
self.config = {}
|
||||
|
||||
def parse(self, use_cli=True, yaml_config_filename: str = None):
|
||||
def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}):
|
||||
"""
|
||||
if yaml_config_filename is provided, the --config argument is ignored,
|
||||
useful for library usage when the config values are preloaded
|
||||
overwrite_configs is a dict that overwrites the yaml file contents
|
||||
"""
|
||||
# 1. parse CLI values
|
||||
if use_cli:
|
||||
@@ -51,7 +53,7 @@ class Config:
|
||||
epilog="Check the code at https://github.com/bellingcat/auto-archiver"
|
||||
)
|
||||
|
||||
parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')
|
||||
parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml')
|
||||
|
||||
for configurable in self.configurable_parents:
|
||||
child: Step
|
||||
@@ -80,6 +82,7 @@ class Config:
|
||||
|
||||
# 2. read YAML config file (or use provided value)
|
||||
self.yaml_config = self.read_yaml(yaml_config_filename)
|
||||
update_nested_dict(self.yaml_config, overwrite_configs)
|
||||
|
||||
# 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default
|
||||
self.config = defaultdict(dict)
|
||||
|
||||
52
src/auto_archiver/core/context.py
Normal file
@@ -0,0 +1,52 @@
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class ArchivingContext:
|
||||
"""
|
||||
Singleton context class.
|
||||
ArchivingContext._get_instance() to retrieve it if needed
|
||||
otherwise just
|
||||
ArchivingContext.set(key, value)
|
||||
and
|
||||
ArchivingContext.get(key, default)
|
||||
|
||||
When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True)
|
||||
reset(full_reset=True) will recreate everything including the keep_on_reset status
|
||||
"""
|
||||
_instance = None
|
||||
|
||||
def __init__(self):
|
||||
self.configs = {}
|
||||
self.keep_on_reset = set()
|
||||
|
||||
@staticmethod
|
||||
def get_instance():
|
||||
if ArchivingContext._instance is None:
|
||||
ArchivingContext._instance = ArchivingContext()
|
||||
return ArchivingContext._instance
|
||||
|
||||
@staticmethod
|
||||
def set(key, value, keep_on_reset: bool = False):
|
||||
ac = ArchivingContext.get_instance()
|
||||
ac.configs[key] = value
|
||||
if keep_on_reset: ac.keep_on_reset.add(key)
|
||||
|
||||
@staticmethod
|
||||
def get(key: str, default=None):
|
||||
return ArchivingContext.get_instance().configs.get(key, default)
|
||||
|
||||
@staticmethod
|
||||
def reset(full_reset: bool = False):
|
||||
ac = ArchivingContext.get_instance()
|
||||
if full_reset: ac.keep_on_reset = set()
|
||||
ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
|
||||
|
||||
# ---- custom getters/setters for widely used context values
|
||||
|
||||
@staticmethod
|
||||
def set_tmp_dir(tmp_dir: str):
|
||||
ArchivingContext.get_instance().configs["tmp_dir"] = tmp_dir
|
||||
|
||||
@staticmethod
|
||||
def get_tmp_dir() -> str:
|
||||
return ArchivingContext.get_instance().configs.get("tmp_dir")
|
||||
@@ -1,20 +1,47 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from ast import List
|
||||
from typing import Any
|
||||
from typing import Any, List
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses_json import dataclass_json
|
||||
from dataclasses_json import dataclass_json, config
|
||||
import mimetypes
|
||||
|
||||
# annotation order matters
|
||||
@dataclass_json
|
||||
from .context import ArchivingContext
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
@dataclass_json # annotation order matters
|
||||
@dataclass
|
||||
class Media:
|
||||
filename: str
|
||||
key: str = None
|
||||
urls: List[str] = field(default_factory=list)
|
||||
_mimetype: str = None # eg: image/jpeg
|
||||
properties: dict = field(default_factory=dict)
|
||||
_mimetype: str = None # eg: image/jpeg
|
||||
_stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True)) # always exclude
|
||||
|
||||
def store(self: Media, override_storages: List = None, url: str = "url-not-available"):
|
||||
# stores the media into the provided/available storages [Storage]
|
||||
# repeats the process for its properties, in case they have inner media themselves
|
||||
# for now it only goes down 1 level but it's easy to make it recursive if needed
|
||||
storages = override_storages or ArchivingContext.get("storages")
|
||||
if not len(storages):
|
||||
logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
|
||||
return
|
||||
|
||||
for s in storages:
|
||||
s.store(self, url)
|
||||
# Media can be inside media properties, examples include transformations on original media
|
||||
for prop in self.properties.values():
|
||||
if isinstance(prop, Media):
|
||||
s.store(prop, url)
|
||||
if isinstance(prop, list):
|
||||
for prop_media in prop:
|
||||
if isinstance(prop_media, Media):
|
||||
s.store(prop_media, url)
|
||||
|
||||
def is_stored(self) -> bool:
|
||||
return len(self.urls) > 0 and len(self.urls) == len(ArchivingContext.get("storages"))
|
||||
|
||||
def set(self, key: str, value: Any) -> Media:
|
||||
self.properties[key] = value
|
||||
@@ -40,3 +67,6 @@ class Media:
|
||||
|
||||
def is_video(self) -> bool:
|
||||
return self.mimetype.startswith("video")
|
||||
|
||||
def is_audio(self) -> bool:
|
||||
return self.mimetype.startswith("audio")
|
||||
|
||||
@@ -1,26 +1,26 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from ast import List, Set
|
||||
from typing import Any, Union, Dict
|
||||
from typing import Any, List, Union, Dict
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses_json import dataclass_json
|
||||
from dataclasses_json import dataclass_json, config
|
||||
import datetime
|
||||
from urllib.parse import urlparse
|
||||
from dateutil.parser import parse as parse_dt
|
||||
from .media import Media
|
||||
from .context import ArchivingContext
|
||||
|
||||
|
||||
# annotation order matters
|
||||
@dataclass_json
|
||||
@dataclass_json # annotation order matters
|
||||
@dataclass
|
||||
class Metadata:
|
||||
status: str = "no archiver"
|
||||
_processed_at: datetime = field(default_factory=datetime.datetime.utcnow)
|
||||
metadata: Dict[str, Any] = field(default_factory=dict)
|
||||
tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True}) # keys that are not to be saved in DBs
|
||||
media: List[Media] = field(default_factory=list)
|
||||
rearchivable: bool = True # defaults to true, archivers can overwrite
|
||||
|
||||
def __post_init__(self):
|
||||
self.set("_processed_at", datetime.datetime.utcnow())
|
||||
|
||||
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
|
||||
"""
|
||||
merges two Metadata instances, will overwrite according to overwrite_left flag
|
||||
@@ -30,7 +30,6 @@ class Metadata:
|
||||
if right.status and len(right.status):
|
||||
self.status = right.status
|
||||
self.rearchivable |= right.rearchivable
|
||||
self.tmp_keys |= right.tmp_keys
|
||||
for k, v in right.metadata.items():
|
||||
assert k not in self.metadata or type(v) == type(self.get(k))
|
||||
if type(v) not in [dict, list, set] or k not in self.metadata:
|
||||
@@ -43,10 +42,14 @@ class Metadata:
|
||||
return right.merge(self)
|
||||
return self
|
||||
|
||||
def set(self, key: str, val: Any, is_tmp=False) -> Metadata:
|
||||
# if not self.metadata: self.metadata = {}
|
||||
def store(self: Metadata, override_storages: List = None):
|
||||
# calls .store for all contained media. storages [Storage]
|
||||
storages = override_storages or ArchivingContext.get("storages")
|
||||
for media in self.media:
|
||||
media.store(override_storages=storages, url=self.get_url())
|
||||
|
||||
def set(self, key: str, val: Any) -> Metadata:
|
||||
self.metadata[key] = val
|
||||
if is_tmp: self.tmp_keys.add(key)
|
||||
return self
|
||||
|
||||
def get(self, key: str, default: Any = None, create_if_missing=False) -> Union[Metadata, str]:
|
||||
@@ -63,6 +66,9 @@ class Metadata:
|
||||
def is_success(self) -> bool:
|
||||
return "success" in self.status
|
||||
|
||||
def is_empty(self) -> bool:
|
||||
return not self.is_success() and len(self.media) == 0 and len(self.metadata) <= 2 # url, processed_at
|
||||
|
||||
@property # getter .netloc
|
||||
def netloc(self) -> str:
|
||||
return urlparse(self.get_url()).netloc
|
||||
@@ -82,7 +88,8 @@ class Metadata:
|
||||
|
||||
def set_content(self, content: str) -> Metadata:
|
||||
# a dump with all the relevant content
|
||||
return self.set("content", content)
|
||||
append_content = (self.get("content", "") + content + "\n").strip()
|
||||
return self.set("content", append_content)
|
||||
|
||||
def set_title(self, title: str) -> Metadata:
|
||||
return self.set("title", title)
|
||||
@@ -90,12 +97,6 @@ class Metadata:
|
||||
def get_title(self) -> str:
|
||||
return self.get("title")
|
||||
|
||||
def set_tmp_dir(self, tmp_dir: str) -> Metadata:
|
||||
return self.set("tmp_dir", tmp_dir, True)
|
||||
|
||||
def get_tmp_dir(self) -> str:
|
||||
return self.get("tmp_dir")
|
||||
|
||||
def set_timestamp(self, timestamp: datetime.datetime) -> Metadata:
|
||||
if type(timestamp) == str:
|
||||
timestamp = parse_dt(timestamp)
|
||||
@@ -122,7 +123,7 @@ class Metadata:
|
||||
for m in self.media:
|
||||
if m.get("id") == id: return m
|
||||
return default
|
||||
|
||||
|
||||
def get_first_image(self, default=None) -> Media:
|
||||
for m in self.media:
|
||||
if "image" in m.mimetype: return m
|
||||
@@ -136,8 +137,5 @@ class Metadata:
|
||||
_default = self.media[0] if len(self.media) else None
|
||||
return self.get_media_by_id("_final_media", _default)
|
||||
|
||||
def get_clean_metadata(self) -> Metadata:
|
||||
return dict(
|
||||
{k: v for k, v in self.metadata.items() if k not in self.tmp_keys},
|
||||
**{"processed_at": self._processed_at}
|
||||
)
|
||||
def __str__(self) -> str:
|
||||
return self.__repr__()
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
from ast import List
|
||||
from typing import Union
|
||||
from typing import Generator, Union, List
|
||||
|
||||
from .context import ArchivingContext
|
||||
|
||||
from ..archivers import Archiver
|
||||
from ..feeders import Feeder
|
||||
@@ -8,7 +9,6 @@ from ..formatters import Formatter
|
||||
from ..storages import Storage
|
||||
from ..enrichers import Enricher
|
||||
from ..databases import Database
|
||||
from .media import Media
|
||||
from .metadata import Metadata
|
||||
|
||||
import tempfile, traceback
|
||||
@@ -23,18 +23,19 @@ class ArchivingOrchestrator:
|
||||
self.archivers: List[Archiver] = config.archivers
|
||||
self.databases: List[Database] = config.databases
|
||||
self.storages: List[Storage] = config.storages
|
||||
ArchivingContext.set("storages", self.storages, keep_on_reset=True)
|
||||
|
||||
for a in self.archivers: a.setup()
|
||||
|
||||
def feed(self) -> None:
|
||||
def feed(self) -> Generator[Metadata]:
|
||||
for item in self.feeder:
|
||||
self.feed_item(item)
|
||||
yield self.feed_item(item)
|
||||
|
||||
def feed_item(self, item: Metadata) -> Metadata:
|
||||
print("ARCHIVING", item)
|
||||
try:
|
||||
ArchivingContext.reset()
|
||||
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
|
||||
item.set_tmp_dir(tmp_dir)
|
||||
ArchivingContext.set_tmp_dir(tmp_dir)
|
||||
return self.archive(item)
|
||||
except KeyboardInterrupt:
|
||||
# catches keyboard interruptions to do a clean exit
|
||||
@@ -90,7 +91,7 @@ class ArchivingOrchestrator:
|
||||
# Q: should this be refactored so it's just a.download(result)?
|
||||
result.merge(a.download(result))
|
||||
if result.is_success(): break
|
||||
except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}")
|
||||
except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}: {traceback.format_exc()}")
|
||||
|
||||
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
|
||||
# should it call the HTMLgenerator as if it's not an enrichment?
|
||||
@@ -102,28 +103,21 @@ class ArchivingOrchestrator:
|
||||
# eg: screenshot, wacz, webarchive, thumbnails
|
||||
for e in self.enrichers:
|
||||
try: e.enrich(result)
|
||||
except Exception as exc: logger.error(f"Unexpected error with enricher {e.name}: {exc}")
|
||||
except Exception as exc: logger.error(f"Unexpected error with enricher {e.name}: {exc}: {traceback.format_exc()}")
|
||||
|
||||
# 5 - store media
|
||||
# looks for Media in result.media and also result.media[x].properties (as list or dict values)
|
||||
for s in self.storages:
|
||||
for m in result.media:
|
||||
s.store(m, result) # modifies media
|
||||
# Media can be inside media properties, examples include transformations on original media
|
||||
for prop in m.properties.values():
|
||||
if isinstance(prop, Media):
|
||||
s.store(prop, result)
|
||||
if isinstance(prop, list) and len(prop) > 0 and isinstance(prop[0], Media):
|
||||
for prop_media in prop:
|
||||
s.store(prop_media, result)
|
||||
result.store()
|
||||
|
||||
# 6 - format and store formatted if needed
|
||||
# enrichers typically need access to already stored URLs etc
|
||||
if (final_media := self.formatter.format(result)):
|
||||
for s in self.storages:
|
||||
s.store(final_media, result)
|
||||
final_media.store(url=url)
|
||||
result.set_final_media(final_media)
|
||||
|
||||
if result.is_empty():
|
||||
result.status = "nothing archived"
|
||||
|
||||
# signal completion to databases (DBs, Google Sheets, CSV, ...)
|
||||
for d in self.databases: d.done(result)
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ class Step(ABC):
|
||||
|
||||
def init(name: str, config: dict, child: Type[Step]) -> Step:
|
||||
"""
|
||||
looks into direct subclasses of child for name and returns such ab object
|
||||
looks into direct subclasses of child for name and returns such an object
|
||||
TODO: cannot find subclasses of child.subclasses
|
||||
"""
|
||||
for sub in child.__subclasses__():
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from .database import Database
|
||||
from .gsheet_db import GsheetsDb
|
||||
from .console_db import ConsoleDb
|
||||
from .csv_db import CSVDb
|
||||
from .csv_db import CSVDb
|
||||
from .api_db import AAApiDb
|
||||
41
src/auto_archiver/databases/api_db.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import requests, os
|
||||
from loguru import logger
|
||||
|
||||
from . import Database
|
||||
from ..core import Metadata
|
||||
|
||||
|
||||
class AAApiDb(Database):
|
||||
"""
|
||||
Connects to auto-archiver-api instance
|
||||
"""
|
||||
name = "auto_archiver_api_db"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
self.assert_valid_string("api_endpoint")
|
||||
self.assert_valid_string("api_secret")
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
|
||||
"api_secret": {"default": None, "help": "API authentication secret"},
|
||||
"public": {"default": False, "help": "whether the URL should be publicly available via the API"},
|
||||
"author_id": {"default": None, "help": "which email to assign as author"},
|
||||
"group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
|
||||
"tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
|
||||
}
|
||||
|
||||
def done(self, item: Metadata) -> None:
|
||||
"""archival result ready - should be saved to DB"""
|
||||
logger.info(f"saving archive of {item.get_url()} to the AA API.")
|
||||
|
||||
payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
|
||||
response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, auth=("abc", self.api_secret))
|
||||
|
||||
if response.status_code == 200:
|
||||
logger.success(f"AA API: {response.json()}")
|
||||
else:
|
||||
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
|
||||
@@ -2,13 +2,10 @@ from typing import Union, Tuple
|
||||
import datetime
|
||||
from urllib.parse import quote
|
||||
|
||||
# from metadata import Metadata
|
||||
from loguru import logger
|
||||
|
||||
# from . import Enricher
|
||||
from . import Database
|
||||
from ..core import Metadata
|
||||
from ..core import Media
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
from ..utils import GWorksheet
|
||||
|
||||
|
||||
@@ -61,13 +58,14 @@ class GsheetsDb(Database):
|
||||
cell_updates.append((row, 'status', item.status))
|
||||
|
||||
media: Media = item.get_final_media()
|
||||
|
||||
batch_if_valid('archive', "\n".join(media.urls))
|
||||
if hasattr(media, "urls"):
|
||||
batch_if_valid('archive', "\n".join(media.urls))
|
||||
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
|
||||
batch_if_valid('title', item.get_title())
|
||||
batch_if_valid('text', item.get("content", "")[:500])
|
||||
batch_if_valid('text', item.get("content", ""))
|
||||
batch_if_valid('timestamp', item.get_timestamp())
|
||||
if (screenshot := item.get_media_by_id("screenshot")):
|
||||
batch_if_valid('hash', media.get("hash", "not-calculated"))
|
||||
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
|
||||
batch_if_valid('screenshot', "\n".join(screenshot.urls))
|
||||
|
||||
if (thumbnail := item.get_first_image("thumbnail")):
|
||||
@@ -88,7 +86,7 @@ class GsheetsDb(Database):
|
||||
logger.debug(f"Unable to update sheet: {e}")
|
||||
|
||||
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
||||
# TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now
|
||||
gw: GWorksheet = item.get("gsheet").get("worksheet")
|
||||
row: int = item.get("gsheet").get("row")
|
||||
# TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
|
||||
gw: GWorksheet = ArchivingContext.get("gsheet").get("worksheet")
|
||||
row: int = ArchivingContext.get("gsheet").get("row")
|
||||
return gw, row
|
||||
|
||||
@@ -3,4 +3,5 @@ from .screenshot_enricher import ScreenshotEnricher
|
||||
from .wayback_enricher import WaybackArchiverEnricher
|
||||
from .hash_enricher import HashEnricher
|
||||
from .thumbnail_enricher import ThumbnailEnricher
|
||||
from .wacz_enricher import WaczEnricher
|
||||
from .wacz_enricher import WaczEnricher
|
||||
from .whisper_enricher import WhisperEnricher
|
||||
@@ -2,7 +2,7 @@ import hashlib
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata
|
||||
from ..core import Metadata, ArchivingContext
|
||||
|
||||
|
||||
class HashEnricher(Enricher):
|
||||
@@ -16,11 +16,14 @@ class HashEnricher(Enricher):
|
||||
super().__init__(config)
|
||||
algo_choices = self.configs()["algorithm"]["choices"]
|
||||
assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
|
||||
self.chunksize = int(self.chunksize)
|
||||
ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}
|
||||
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
|
||||
"chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
|
||||
}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
@@ -28,12 +31,19 @@ class HashEnricher(Enricher):
|
||||
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
|
||||
|
||||
for i, m in enumerate(to_enrich.media):
|
||||
with open(m.filename, "rb") as f:
|
||||
bytes = f.read() # read entire file as bytes
|
||||
hash = None
|
||||
if self.algorithm == "SHA-256":
|
||||
hash = hashlib.sha256(bytes)
|
||||
elif self.algorithm == "SHA3-512":
|
||||
hash = hashlib.sha3_512(bytes)
|
||||
else: continue
|
||||
to_enrich.media[i].set("hash", f"{self.algorithm}:{hash.hexdigest()}")
|
||||
if len(hd := self.calculate_hash(m.filename)):
|
||||
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
|
||||
|
||||
def calculate_hash(self, filename):
|
||||
hash = None
|
||||
if self.algorithm == "SHA-256":
|
||||
hash = hashlib.sha256()
|
||||
elif self.algorithm == "SHA3-512":
|
||||
hash = hashlib.sha3_512()
|
||||
else: return ""
|
||||
with open(filename, "rb") as f:
|
||||
while True:
|
||||
buf = f.read(self.chunksize)
|
||||
if not buf: break
|
||||
hash.update(buf)
|
||||
return hash.hexdigest()
|
||||
|
||||
@@ -3,8 +3,8 @@ import time, uuid, os
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
|
||||
from . import Enricher
|
||||
from ..utils import Webdriver
|
||||
from ..core import Media, Metadata
|
||||
from ..utils import Webdriver, UrlUtil
|
||||
from ..core import Media, Metadata, ArchivingContext
|
||||
|
||||
class ScreenshotEnricher(Enricher):
|
||||
name = "screenshot_enricher"
|
||||
@@ -14,21 +14,25 @@ class ScreenshotEnricher(Enricher):
|
||||
return {
|
||||
"width": {"default": 1280, "help": "width of the screenshots"},
|
||||
"height": {"default": 720, "help": "height of the screenshots"},
|
||||
"timeout": {"default": 60, "help": "timeout for taking the screenshot"}
|
||||
"timeout": {"default": 60, "help": "timeout for taking the screenshot"},
|
||||
"sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}
|
||||
}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
if UrlUtil.is_auth_wall(url):
|
||||
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
|
||||
return
|
||||
|
||||
logger.debug(f"Enriching screenshot for {url=}")
|
||||
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url) as driver:
|
||||
try:
|
||||
driver.get(url)
|
||||
time.sleep(2)
|
||||
screenshot_file = os.path.join(to_enrich.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
|
||||
time.sleep(int(self.sleep_before_screenshot))
|
||||
screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{str(uuid.uuid4())[0:8]}.png")
|
||||
driver.save_screenshot(screenshot_file)
|
||||
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
|
||||
except TimeoutException:
|
||||
logger.info("TimeoutException loading page for screenshot")
|
||||
except Exception as e:
|
||||
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")
|
||||
# return None
|
||||
|
||||
@@ -2,7 +2,7 @@ import ffmpeg, os, uuid
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Media, Metadata
|
||||
from ..core import Media, Metadata, ArchivingContext
|
||||
|
||||
|
||||
class ThumbnailEnricher(Enricher):
|
||||
@@ -23,7 +23,7 @@ class ThumbnailEnricher(Enricher):
|
||||
logger.debug(f"generating thumbnails")
|
||||
for i, m in enumerate(to_enrich.media[::]):
|
||||
if m.is_video():
|
||||
folder = os.path.join(to_enrich.get_tmp_dir(), str(uuid.uuid4()))
|
||||
folder = os.path.join(ArchivingContext.get_tmp_dir(), str(uuid.uuid4()))
|
||||
os.makedirs(folder, exist_ok=True)
|
||||
logger.debug(f"generating thumbnails for {m.filename}")
|
||||
fps, duration = 0.5, m.get("duration")
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
import os, shutil, subprocess, uuid
|
||||
from loguru import logger
|
||||
|
||||
from ..core import Media, Metadata
|
||||
from ..core import Media, Metadata, ArchivingContext
|
||||
from . import Enricher
|
||||
from ..utils import UrlUtil
|
||||
|
||||
|
||||
class WaczEnricher(Enricher):
|
||||
@@ -20,35 +21,56 @@ class WaczEnricher(Enricher):
|
||||
return {
|
||||
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
|
||||
"timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"},
|
||||
"ignore_auth_wall": {"default": True, "help": "skip URL if it is behind authentication wall, set to False if you have browsertrix profile configured for private content."},
|
||||
}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> bool:
|
||||
# TODO: figure out support for browsertrix in docker
|
||||
url = to_enrich.get_url()
|
||||
logger.debug(f"generating WACZ for {url=}")
|
||||
|
||||
collection = str(uuid.uuid4())[0:8]
|
||||
browsertrix_home = os.path.abspath(to_enrich.get_tmp_dir())
|
||||
cmd = [
|
||||
"docker", "run",
|
||||
"--rm", # delete container once it has completed running
|
||||
"-v", f"{browsertrix_home}:/crawls/",
|
||||
# "-it", # this leads to "the input device is not a TTY"
|
||||
"webrecorder/browsertrix-crawler", "crawl",
|
||||
"--url", url,
|
||||
"--scopeType", "page",
|
||||
"--generateWACZ",
|
||||
"--text",
|
||||
"--collection", collection,
|
||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||
"--behaviorTimeout", str(self.timeout),
|
||||
"--timeout", str(self.timeout)
|
||||
]
|
||||
if self.profile:
|
||||
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
|
||||
shutil.copyfile(self.profile, profile_fn)
|
||||
# TODO: test which is right
|
||||
cmd.extend(["--profile", profile_fn])
|
||||
# cmd.extend(["--profile", "/crawls/profile.tar.gz"])
|
||||
browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
|
||||
|
||||
if os.getenv('RUNNING_IN_DOCKER'):
|
||||
logger.debug(f"generating WACZ without Docker for {url=}")
|
||||
|
||||
cmd = [
|
||||
"crawl",
|
||||
"--url", url,
|
||||
"--scopeType", "page",
|
||||
"--generateWACZ",
|
||||
"--text",
|
||||
"--collection", collection,
|
||||
"--id", collection,
|
||||
"--saveState", "never",
|
||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||
"--behaviorTimeout", str(self.timeout),
|
||||
"--timeout", str(self.timeout)]
|
||||
|
||||
if self.profile:
|
||||
cmd.extend(["--profile", os.path.join("/app", str(self.profile))])
|
||||
else:
|
||||
logger.debug(f"generating WACZ in Docker for {url=}")
|
||||
|
||||
cmd = [
|
||||
"docker", "run",
|
||||
"--rm", # delete container once it has completed running
|
||||
"-v", f"{browsertrix_home}:/crawls/",
|
||||
# "-it", # this leads to "the input device is not a TTY"
|
||||
"webrecorder/browsertrix-crawler", "crawl",
|
||||
"--url", url,
|
||||
"--scopeType", "page",
|
||||
"--generateWACZ",
|
||||
"--text",
|
||||
"--collection", collection,
|
||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||
"--behaviorTimeout", str(self.timeout),
|
||||
"--timeout", str(self.timeout)
|
||||
]
|
||||
|
||||
if self.profile:
|
||||
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
|
||||
shutil.copyfile(self.profile, profile_fn)
|
||||
cmd.extend(["--profile", os.path.join("/crawls", "profile.tar.gz")])
|
||||
|
||||
try:
|
||||
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
|
||||
@@ -57,7 +79,13 @@ class WaczEnricher(Enricher):
|
||||
logger.error(f"WACZ generation failed: {e}")
|
||||
return False
|
||||
|
||||
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
|
||||
|
||||
|
||||
if os.getenv('RUNNING_IN_DOCKER'):
|
||||
filename = os.path.join("collections", collection, f"{collection}.wacz")
|
||||
else:
|
||||
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
|
||||
|
||||
if not os.path.exists(filename):
|
||||
logger.warning(f"Unable to locate and upload WACZ {filename=}")
|
||||
return False
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
from loguru import logger
|
||||
import time, requests
|
||||
|
||||
|
||||
from . import Enricher
|
||||
from ..archivers import Archiver
|
||||
from ..utils import UrlUtil
|
||||
from ..core import Metadata
|
||||
|
||||
class WaybackArchiverEnricher(Enricher, Archiver):
|
||||
@@ -33,6 +35,10 @@ class WaybackArchiverEnricher(Enricher, Archiver):
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> bool:
|
||||
url = to_enrich.get_url()
|
||||
if UrlUtil.is_auth_wall(url):
|
||||
logger.debug(f"[SKIP] WAYBACK since url is behind AUTH WALL: {url=}")
|
||||
return
|
||||
|
||||
logger.debug(f"calling wayback for {url=}")
|
||||
|
||||
if to_enrich.get("wayback"):
|
||||
|
||||
130
src/auto_archiver/enrichers/whisper_enricher.py
Normal file
@@ -0,0 +1,130 @@
|
||||
import traceback
|
||||
import requests, time
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
from ..storages import S3Storage
|
||||
|
||||
|
||||
class WhisperEnricher(Enricher):
|
||||
"""
|
||||
Connects with a Whisper API service to get texts out of audio
|
||||
whisper API repository: TODO
|
||||
Only works if an S3 compatible storage is used
|
||||
"""
|
||||
name = "whisper_enricher"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
assert type(self.api_key) == str and len(self.api_key) > 0, "please provide a value for the whisper_enricher api_key"
|
||||
self.timeout = int(self.timeout)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"api_endpoint": {"default": "https://whisper.spoettel.dev/api/v1", "help": "WhisperApi api endpoint"},
|
||||
"api_key": {"default": None, "help": "WhisperApi api key for authentication"},
|
||||
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
|
||||
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
|
||||
"action": {"default": "translation", "help": "which Whisper operation to execute", "choices": ["transcript", "translation", "language_detection"]},
|
||||
|
||||
}
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
if not self._get_s3_storage():
|
||||
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
|
||||
return
|
||||
|
||||
url = to_enrich.get_url()
|
||||
logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")
|
||||
|
||||
job_results = {}
|
||||
for i, m in enumerate(to_enrich.media):
|
||||
if m.is_video() or m.is_audio():
|
||||
m.store(url=url)
|
||||
try:
|
||||
job_id = self.submit_job(m)
|
||||
job_results[job_id] = False
|
||||
logger.debug(f"JOB SUBMITTED: {job_id=} for {m.key=}")
|
||||
to_enrich.media[i].set("whisper_model", {"job_id": job_id})
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to submit whisper job for {m.filename=} with error {e}\n{traceback.format_exc()}")
|
||||
|
||||
job_results = self.check_jobs(job_results)
|
||||
|
||||
for i, m in enumerate(to_enrich.media):
|
||||
if m.is_video() or m.is_audio():
|
||||
job_id = to_enrich.media[i].get("whisper_model")["job_id"]
|
||||
to_enrich.media[i].set("whisper_model", {
|
||||
"job_id": job_id,
|
||||
**(job_results[job_id] if job_results[job_id] else {"result": "incomplete or failed job"})
|
||||
})
|
||||
# append the extracted text to the content of the post so it gets written to the DBs like gsheets text column
|
||||
if job_results[job_id]:
|
||||
for k,v in job_results[job_id].items():
|
||||
if "_text" in k and len(v):
|
||||
to_enrich.set_content(f"\n[automatic video transcript]: {v}")
|
||||
|
||||
def submit_job(self, media: Media):
|
||||
s3 = self._get_s3_storage()
|
||||
s3_url = s3.get_cdn_url(media)
|
||||
assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls "
|
||||
payload = {
|
||||
"url": s3_url,
|
||||
"type": self.action,
|
||||
# "language": "string" # may be a config
|
||||
}
|
||||
response = requests.post(f'{self.api_endpoint}/jobs', json=payload, headers={'Authorization': f'Bearer {self.api_key}'})
|
||||
assert response.status_code == 201, f"calling the whisper api {self.api_endpoint} returned a non-success code: {response.status_code}"
|
||||
logger.debug(response.json())
|
||||
return response.json()['id']
|
||||
|
||||
def check_jobs(self, job_results: dict):
|
||||
start_time = time.time()
|
||||
all_completed = False
|
||||
while not all_completed and (time.time() - start_time) <= self.timeout:
|
||||
all_completed = True
|
||||
for job_id in job_results:
|
||||
if job_results[job_id] != False: continue
|
||||
all_completed = False # at least one not ready
|
||||
try: job_results[job_id] = self.check_job(job_id)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to check {job_id=} with error {e}\n{traceback.format_exc()}")
|
||||
if not all_completed: time.sleep(3)
|
||||
return job_results
|
||||
|
||||
def check_job(self, job_id):
|
||||
r = requests.get(f'{self.api_endpoint}/jobs/{job_id}', headers={'Authorization': f'Bearer {self.api_key}'})
|
||||
assert r.status_code == 200, f"Job status did not respond with 200, instead with: {r.status_code}"
|
||||
j = r.json()
|
||||
logger.debug(f"Checked job {job_id=} with status='{j['status']}'")
|
||||
if j['status'] == "processing": return False
|
||||
elif j['status'] == "error": return f"Error: {j['meta']['error']}"
|
||||
elif j['status'] == "success":
|
||||
r_res = requests.get(f'{self.api_endpoint}/jobs/{job_id}/artifacts', headers={'Authorization': f'Bearer {self.api_key}'})
|
||||
assert r_res.status_code == 200, f"Job artifacts did not respond with 200, instead with: {r_res.status_code}"
|
||||
logger.success(r_res.json())
|
||||
result = {}
|
||||
for art_id, artifact in enumerate(r_res.json()):
|
||||
subtitle = []
|
||||
full_text = []
|
||||
for i, d in enumerate(artifact.get("data")):
|
||||
subtitle.append(f"{i+1}\n{d.get('start')} --> {d.get('end')}\n{d.get('text').strip()}")
|
||||
full_text.append(d.get('text').strip())
|
||||
if not len(subtitle): continue
|
||||
if self.include_srt: result[f"artifact_{art_id}_subtitle"] = "\n".join(subtitle)
|
||||
result[f"artifact_{art_id}_text"] = "\n".join(full_text)
|
||||
# call /delete endpoint on timely success
|
||||
r_del = requests.delete(f'{self.api_endpoint}/jobs/{job_id}', headers={'Authorization': f'Bearer {self.api_key}'})
|
||||
logger.debug(f"DELETE whisper {job_id=} result: {r_del.status_code}")
|
||||
return result
|
||||
return False
|
||||
|
||||
def _get_s3_storage(self) -> S3Storage:
|
||||
try:
|
||||
return next(s for s in ArchivingContext.get("storages") if s.__class__ == S3Storage)
|
||||
except:
|
||||
logger.warning("No S3Storage instance found in storages")
|
||||
return
|
||||
@@ -1,7 +1,7 @@
|
||||
from loguru import logger
|
||||
|
||||
from . import Feeder
|
||||
from ..core import Metadata
|
||||
from ..core import Metadata, ArchivingContext
|
||||
|
||||
|
||||
class CLIFeeder(Feeder):
|
||||
@@ -26,5 +26,7 @@ class CLIFeeder(Feeder):
|
||||
def __iter__(self) -> Metadata:
|
||||
for url in self.urls:
|
||||
logger.debug(f"Processing {url}")
|
||||
yield Metadata().set_url(url).set("folder", "cli", True)
|
||||
yield Metadata().set_url(url)
|
||||
ArchivingContext.set("folder", "cli")
|
||||
|
||||
logger.success(f"Processed {len(self.urls)} URL(s)")
|
||||
|
||||
@@ -5,9 +5,10 @@ from slugify import slugify
|
||||
|
||||
# from . import Enricher
|
||||
from . import Feeder
|
||||
from ..core import Metadata
|
||||
from ..core import Metadata, ArchivingContext
|
||||
from ..utils import Gsheets, GWorksheet
|
||||
|
||||
|
||||
class GsheetsFeeder(Gsheets, Feeder):
|
||||
name = "gsheet_feeder"
|
||||
|
||||
@@ -31,14 +32,14 @@ class GsheetsFeeder(Gsheets, Feeder):
|
||||
"help": "(CSV) explicitly block some worksheets from being processed",
|
||||
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
|
||||
},
|
||||
"use_sheet_names_in_stored_paths":{
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": True,
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
}
|
||||
})
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
sh = self.gsheets_client.open(self.sheet)
|
||||
sh = self.open_sheet()
|
||||
for ii, wks in enumerate(sh.worksheets()):
|
||||
if not self.should_process_sheet(wks.title):
|
||||
logger.debug(f"SKIPPED worksheet '{wks.title}' due to allow/block rules")
|
||||
@@ -61,11 +62,20 @@ class GsheetsFeeder(Gsheets, Feeder):
|
||||
if status not in ['', None]: continue
|
||||
|
||||
# All checks done - archival process starts here
|
||||
m = Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)
|
||||
if self.use_sheet_names_in_stored_paths:
|
||||
m.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
|
||||
m = Metadata().set_url(url)
|
||||
ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
|
||||
if gw.get_cell_or_default(row, 'folder', "") is None:
|
||||
folder = ''
|
||||
else:
|
||||
folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
|
||||
if len(folder):
|
||||
if self.use_sheet_names_in_stored_paths:
|
||||
ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True)
|
||||
else:
|
||||
ArchivingContext.set("folder", folder, True)
|
||||
|
||||
yield m
|
||||
|
||||
|
||||
logger.success(f'Finished worksheet {wks.title}')
|
||||
|
||||
def should_process_sheet(self, sheet_name: str) -> bool:
|
||||
|
||||
@@ -3,9 +3,12 @@ from dataclasses import dataclass
|
||||
import mimetypes, uuid, os, pathlib
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
from urllib.parse import quote
|
||||
from loguru import logger
|
||||
|
||||
from ..core import Metadata, Media
|
||||
from ..version import __version__
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
from . import Formatter
|
||||
from ..enrichers import HashEnricher
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -25,25 +28,35 @@ class HtmlFormatter(Formatter):
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"},
|
||||
|
||||
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
|
||||
}
|
||||
|
||||
def format(self, item: Metadata) -> Media:
|
||||
url = item.get_url()
|
||||
if item.is_empty():
|
||||
logger.debug(f"[SKIP] FORMAT there is no media or metadata to format: {url=}")
|
||||
return
|
||||
|
||||
content = self.template.render(
|
||||
url=item.get_url(),
|
||||
url=url,
|
||||
title=item.get_title(),
|
||||
media=item.media,
|
||||
metadata=item.get_clean_metadata()
|
||||
metadata=item.metadata,
|
||||
version=__version__
|
||||
)
|
||||
html_path = os.path.join(item.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
|
||||
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
|
||||
with open(html_path, mode="w", encoding="utf-8") as outf:
|
||||
outf.write(content)
|
||||
return Media(filename=html_path)
|
||||
final_media = Media(filename=html_path)
|
||||
|
||||
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
|
||||
if len(hd := he.calculate_hash(final_media.filename)):
|
||||
final_media.set("hash", f"{he.algorithm}:{hd}")
|
||||
|
||||
return final_media
|
||||
|
||||
|
||||
# JINJA helper filters
|
||||
|
||||
class JinjaHelpers:
|
||||
@staticmethod
|
||||
def is_list(v) -> bool:
|
||||
|
||||
0
src/auto_archiver/formatters/templates/__init__.py
Normal file
@@ -29,7 +29,7 @@
|
||||
margin: auto;
|
||||
border: 1px solid;
|
||||
border-collapse: collapse;
|
||||
vertical-align:top;
|
||||
vertical-align: top;
|
||||
}
|
||||
|
||||
table.metadata td:first-child {
|
||||
@@ -42,7 +42,7 @@
|
||||
}
|
||||
|
||||
.copy:hover {
|
||||
font-weight: 600;
|
||||
background: aliceblue;
|
||||
cursor: copy;
|
||||
}
|
||||
|
||||
@@ -162,7 +162,7 @@
|
||||
{% endfor %}
|
||||
</table>
|
||||
|
||||
<p style="text-align:center;">Made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a></p>
|
||||
<p style="text-align:center;">Made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a> v{{ version }}</p>
|
||||
</body>
|
||||
<script defer>
|
||||
// notification logic
|
||||
@@ -185,7 +185,11 @@
|
||||
el.addEventListener("copy", (e) => {
|
||||
e.preventDefault();
|
||||
if (e.clipboardData) {
|
||||
e.clipboardData.setData("text/plain", el.textContent);
|
||||
if (el.hasAttribute("copy-value")) {
|
||||
e.clipboardData.setData("text/plain", el.getAttribute("copy-value"));
|
||||
} else {
|
||||
e.clipboardData.setData("text/plain", el.textContent);
|
||||
}
|
||||
console.log(e.clipboardData.getData("text"))
|
||||
showNotification("copied!")
|
||||
}
|
||||
|
||||
@@ -46,14 +46,16 @@ No preview available for {{ m.key }}.
|
||||
{% endif %}
|
||||
{% if links %}
|
||||
<a href="{{ url }}">open</a> or
|
||||
<a href="{{ url }}" download="">download</a>
|
||||
<a href="{{ url }}" download="">download</a> or
|
||||
{{ copy_urlize(url, "copy") }}
|
||||
|
||||
<br>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
|
||||
{%- endmacro -%}
|
||||
|
||||
{% macro copy_urlize(val) -%}
|
||||
{% macro copy_urlize(val, href_text) -%}
|
||||
|
||||
{% if val is mapping %}
|
||||
<ul>
|
||||
@@ -65,7 +67,11 @@ No preview available for {{ m.key }}.
|
||||
</ul>
|
||||
|
||||
{% else %}
|
||||
{% if href_text | length == 0 %}
|
||||
<span class="copy">{{ val | string | urlize }}</span>
|
||||
{% else %}
|
||||
<span class="copy" copy-value="{{val}}">{{ href_text | string | urlize }}</span>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
{%- endmacro -%}
|
||||
@@ -1,10 +1,10 @@
|
||||
from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from dataclasses import dataclass
|
||||
import hashlib
|
||||
from typing import IO, Any
|
||||
from typing import IO
|
||||
|
||||
from ..core import Media, Metadata, Step
|
||||
from ..core import Media, Step, ArchivingContext
|
||||
from ..enrichers import HashEnricher
|
||||
from loguru import logger
|
||||
import os, uuid
|
||||
from slugify import slugify
|
||||
@@ -41,8 +41,11 @@ class Storage(Step):
|
||||
# only for typing...
|
||||
return Step.init(name, config, Storage)
|
||||
|
||||
def store(self, media: Media, item: Metadata) -> None:
|
||||
self.set_key(media, item)
|
||||
def store(self, media: Media, url: str) -> None:
|
||||
if media.is_stored():
|
||||
logger.debug(f"{media.key} already stored, skipping")
|
||||
return
|
||||
self.set_key(media, url)
|
||||
self.upload(media)
|
||||
media.add_url(self.get_cdn_url(media))
|
||||
|
||||
@@ -57,25 +60,25 @@ class Storage(Step):
|
||||
with open(media.filename, 'rb') as f:
|
||||
return self.uploadf(f, media, **kwargs)
|
||||
|
||||
def set_key(self, media: Media, item: Metadata) -> None:
|
||||
def set_key(self, media: Media, url) -> None:
|
||||
"""takes the media and optionally item info and generates a key"""
|
||||
if media.key is not None and len(media.key) > 0: return
|
||||
folder = item.get("folder", "")
|
||||
folder = ArchivingContext.get("folder", "")
|
||||
filename, ext = os.path.splitext(media.filename)
|
||||
|
||||
# path_generator logic
|
||||
if self.path_generator == "flat":
|
||||
if self.path_generator == "flat":
|
||||
path = ""
|
||||
filename = slugify(filename) # in case it comes with os.sep
|
||||
elif self.path_generator == "url": path = slugify(item.get_url())
|
||||
filename = slugify(filename) # in case it comes with os.sep
|
||||
elif self.path_generator == "url": path = slugify(url)
|
||||
elif self.path_generator == "random":
|
||||
path = item.get("random_path", str(uuid.uuid4())[:16], True)
|
||||
path = ArchivingContext.get("random_path", str(uuid.uuid4())[:16], True)
|
||||
|
||||
# filename_generator logic
|
||||
if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
|
||||
elif self.filename_generator == "static":
|
||||
with open(media.filename, "rb") as f:
|
||||
bytes = f.read() # read entire file as bytes
|
||||
filename = hashlib.sha256(bytes).hexdigest()[:24]
|
||||
elif self.filename_generator == "static":
|
||||
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
|
||||
hd = he.calculate_hash(media.filename)
|
||||
filename = hd[:24]
|
||||
|
||||
media.key = os.path.join(folder, path, f"{filename}{ext}")
|
||||
media.key = os.path.join(folder, path, f"{filename}{ext}")
|
||||
|
||||
@@ -2,4 +2,5 @@
|
||||
from .gworksheet import GWorksheet
|
||||
from .misc import *
|
||||
from .webdriver import Webdriver
|
||||
from .gsheet import Gsheets
|
||||
from .gsheet import Gsheets
|
||||
from .url import UrlUtil
|
||||
@@ -10,16 +10,17 @@ class Gsheets(Step):
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
||||
#TODO: config should be responsible for conversions
|
||||
# TODO: config should be responsible for conversions
|
||||
try: self.header = int(self.header)
|
||||
except: pass
|
||||
assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}"
|
||||
assert self.sheet is not None, "You need to define a sheet name in your orchestration file when using gsheets."
|
||||
assert self.sheet is not None or self.sheet_id is not None, "You need to define either a 'sheet' name or a 'sheet_id' in your orchestration file when using gsheets."
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"sheet": {"default": None, "help": "name of the sheet to archive"},
|
||||
"sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"},
|
||||
"header": {"default": 1, "help": "index of the header row (starts at 1)"},
|
||||
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
|
||||
"columns": {
|
||||
@@ -30,11 +31,9 @@ class Gsheets(Step):
|
||||
'archive': 'archive location',
|
||||
'date': 'archive date',
|
||||
'thumbnail': 'thumbnail',
|
||||
'thumbnail_index': 'thumbnail index',
|
||||
'timestamp': 'upload timestamp',
|
||||
'title': 'upload title',
|
||||
'text': 'text content',
|
||||
'duration': 'duration',
|
||||
'screenshot': 'screenshot',
|
||||
'hash': 'hash',
|
||||
'wacz': 'wacz',
|
||||
@@ -43,4 +42,10 @@ class Gsheets(Step):
|
||||
"help": "names of columns in the google sheet (stringified JSON object)",
|
||||
"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
def open_sheet(self):
|
||||
if self.sheet:
|
||||
return self.gsheets_client.open(self.sheet)
|
||||
else: # self.sheet_id
|
||||
return self.gsheets_client.open_by_key(self.sheet_id)
|
||||
|
||||
@@ -15,10 +15,8 @@ class GWorksheet:
|
||||
'archive': 'archive location',
|
||||
'date': 'archive date',
|
||||
'thumbnail': 'thumbnail',
|
||||
'thumbnail_index': 'thumbnail index',
|
||||
'timestamp': 'upload timestamp',
|
||||
'title': 'upload title',
|
||||
'duration': 'duration',
|
||||
'screenshot': 'screenshot',
|
||||
'hash': 'hash',
|
||||
'wacz': 'wacz',
|
||||
@@ -40,11 +38,11 @@ class GWorksheet:
|
||||
|
||||
def _col_index(self, col: str):
|
||||
self._check_col_exists(col)
|
||||
return self.headers.index(self.columns[col])
|
||||
return self.headers.index(self.columns[col].lower())
|
||||
|
||||
def col_exists(self, col: str):
|
||||
self._check_col_exists(col)
|
||||
return self.columns[col] in self.headers
|
||||
return self.columns[col].lower() in self.headers
|
||||
|
||||
def count_rows(self):
|
||||
return len(self.values)
|
||||
@@ -98,7 +96,7 @@ class GWorksheet:
|
||||
cell_updates = [
|
||||
{
|
||||
'range': self.to_a1(row, col),
|
||||
'values': [[val]]
|
||||
'values': [[str(val)[0:49999]]]
|
||||
}
|
||||
for row, col, val in cell_updates
|
||||
]
|
||||
|
||||
@@ -40,3 +40,12 @@ class DateTimeEncoder(json.JSONEncoder):
|
||||
|
||||
def dump_payload(p):
|
||||
return json.dumps(p, ensure_ascii=False, indent=4, cls=DateTimeEncoder)
|
||||
|
||||
|
||||
def update_nested_dict(dictionary, update_dict):
|
||||
# takes 2 dicts and overwrites the first with the second only on the changed balues
|
||||
for key, value in update_dict.items():
|
||||
if key in dictionary and isinstance(value, dict) and isinstance(dictionary[key], dict):
|
||||
update_nested_dict(dictionary[key], value)
|
||||
else:
|
||||
dictionary[key] = value
|
||||
|
||||
19
src/auto_archiver/utils/url.py
Normal file
@@ -0,0 +1,19 @@
|
||||
import re
|
||||
|
||||
class UrlUtil:
|
||||
telegram_private = re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)")
|
||||
is_istagram = re.compile(r"https:\/\/www\.instagram\.com")
|
||||
|
||||
@staticmethod
|
||||
def clean(url): return url
|
||||
|
||||
@staticmethod
|
||||
def is_auth_wall(url):
|
||||
"""
|
||||
checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
|
||||
"""
|
||||
if UrlUtil.telegram_private.match(url): return True
|
||||
if UrlUtil.is_istagram.match(url): return True
|
||||
|
||||
return False
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
|
||||
_MAJOR = "0"
|
||||
_MINOR = "2"
|
||||
_MINOR = "5"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "19"
|
||||
_PATCH = "20"
|
||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||
_SUFFIX = ""
|
||||
|
||||
VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
|
||||
VERSION = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX)
|
||||
__version__ = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX)
|
||||