mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-10 12:18:30 +03:00
Compare commits
40 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b7c69c0f0d | ||
|
|
c98991cdfb | ||
|
|
45b982ec38 | ||
|
|
e11be449e8 | ||
|
|
134bf09257 | ||
|
|
417ca9ef51 | ||
|
|
5b79dcb80c | ||
|
|
52d7b4a016 | ||
|
|
31f6aae7b9 | ||
|
|
26373d4545 | ||
|
|
7a34915f8e | ||
|
|
b67a7b818a | ||
|
|
2e63cb8411 | ||
|
|
9cb73c073f | ||
|
|
9d078a648f | ||
|
|
e150370657 | ||
|
|
4116c90168 | ||
|
|
2c5b115fbe | ||
|
|
bda812f850 | ||
|
|
ac82764ffc | ||
|
|
0fae7d96fb | ||
|
|
2f7181ced6 | ||
|
|
9c25b33f1c | ||
|
|
ae3e607705 | ||
|
|
c1a60fde8a | ||
|
|
875e1de589 | ||
|
|
8f3d4e05c3 | ||
|
|
3bd6bed825 | ||
|
|
2659675f06 | ||
|
|
9d44f4b207 | ||
|
|
5b0bff612e | ||
|
|
ae7ceba0e5 | ||
|
|
97821a81bc | ||
|
|
9191b38cf2 | ||
|
|
567edfc35e | ||
|
|
8c22a9df72 | ||
|
|
d2d6db162b | ||
|
|
5cfbcc0137 | ||
|
|
5fdaa6c739 | ||
|
|
3d389ee05b |
15
.github/workflows/docker-publish.yaml
vendored
15
.github/workflows/docker-publish.yaml
vendored
@@ -26,6 +26,14 @@ jobs:
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v1
|
||||
# https://github.com/docker/setup-buildx-action
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
id: buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
|
||||
@@ -40,9 +48,10 @@ jobs:
|
||||
images: bellingcat/auto-archiver
|
||||
|
||||
- name: Build and push Docker image
|
||||
uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
|
||||
uses: docker/build-push-action@v2
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
platforms: linux/amd64,linux/arm64
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
|
||||
33
Dockerfile
33
Dockerfile
@@ -1,35 +1,36 @@
|
||||
# stage 1 - all dependencies
|
||||
From python:3.10
|
||||
FROM webrecorder/browsertrix-crawler:latest
|
||||
|
||||
ENV RUNNING_IN_DOCKER=1
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# TODO: use custom ffmpeg builds instead of apt-get install
|
||||
RUN pip install --upgrade pip && \
|
||||
pip install pipenv && \
|
||||
add-apt-repository ppa:mozillateam/ppa && \
|
||||
apt-get update && \
|
||||
apt-get install -y gcc ffmpeg fonts-noto firefox-esr && \
|
||||
wget https://github.com/mozilla/geckodriver/releases/download/v0.32.0/geckodriver-v0.32.0-linux64.tar.gz && \
|
||||
apt-get install -y gcc ffmpeg fonts-noto && \
|
||||
apt-get install -y --no-install-recommends firefox-esr && \
|
||||
ln -s /usr/bin/firefox-esr /usr/bin/firefox && \
|
||||
wget https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-linux64.tar.gz && \
|
||||
tar -xvzf geckodriver* -C /usr/local/bin && \
|
||||
chmod +x /usr/local/bin/geckodriver && \
|
||||
rm geckodriver-v*
|
||||
rm geckodriver-v*
|
||||
|
||||
|
||||
# install docker for WACZ
|
||||
# TODO: currently disabled see https://github.com/bellingcat/auto-archiver/issues/66
|
||||
# RUN curl -fsSL https://get.docker.com | sh
|
||||
|
||||
# TODO: avoid copying unnecessary files, including .git
|
||||
COPY Pipfile Pipfile.lock ./
|
||||
RUN pipenv install --python=3.10 --system --deploy
|
||||
# ENV IS_DOCKER=1
|
||||
COPY Pipfile* ./
|
||||
RUN pipenv install
|
||||
|
||||
# doing this at the end helps during development, builds are quick
|
||||
COPY ./src/ .
|
||||
|
||||
# TODO: figure out how to make volumes not be root, does it depend on host or dockerfile?
|
||||
# RUN useradd --system --groups sudo --shell /bin/bash archiver && chown -R archiver:sudo .
|
||||
# USER archiver
|
||||
ENTRYPOINT ["python"]
|
||||
# ENTRYPOINT ["docker-entrypoint.sh"]
|
||||
|
||||
# should be executed with 2 volumes (3 if local_storage)
|
||||
# docker run -v /var/run/docker.sock:/var/run/docker.sock -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --help
|
||||
|
||||
ENTRYPOINT ["pipenv", "run", "python3", "-m", "auto_archiver"]
|
||||
|
||||
# should be executed with 2 volumes (3 if local_storage is used)
|
||||
# docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml
|
||||
6
Pipfile
6
Pipfile
@@ -30,9 +30,13 @@ cryptography = "==38.0.4"
|
||||
dataclasses-json = "*"
|
||||
yt-dlp = ">=2023.2.17"
|
||||
vk-url-scraper = "*"
|
||||
uwsgi = "*"
|
||||
requests = {extras = ["socks"], version = "*"}
|
||||
# wacz = "==0.4.8"
|
||||
pywb = ">=2.7.3"
|
||||
|
||||
[requires]
|
||||
python_version = "3.9"
|
||||
python_version = "3.10"
|
||||
|
||||
[dev-packages]
|
||||
autopep8 = "*"
|
||||
|
||||
1653
Pipfile.lock
generated
1653
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
14
README.md
14
README.md
@@ -33,7 +33,7 @@ Docker works like a virtual machine running inside your computer, it isolates ev
|
||||
|
||||
1. install [docker](https://docs.docker.com/get-docker/)
|
||||
2. pull the auto-archiver docker [image](https://hub.docker.com/r/bellingcat/auto-archiver) with `docker pull bellingcat/auto-archiver`
|
||||
3. run the docker image locally in a container: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver -m auto_archiver --config secrets/orchestration.yaml` breaking this command down:
|
||||
3. run the docker image locally in a container: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --config secrets/orchestration.yaml` breaking this command down:
|
||||
1. `docker run` tells docker to start a new container (an instance of the image)
|
||||
2. `--rm` makes sure this container is removed after execution (less garbage locally)
|
||||
3. `-v $PWD/secrets:/app/secrets` - your secrets folder
|
||||
@@ -87,11 +87,9 @@ The archiver work is orchestrated by the following workflow (we call each a **st
|
||||
4. **Formatter** creates a report from all the archived content (HTML, PDF, ...)
|
||||
5. **Database** knows what's been archived and also stores the archive result (spreadsheet, CSV, or just the console)
|
||||
|
||||
To check all available steps (which archivers, storages, databses, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
|
||||
To setup an auto-archiver instance, instance, create an `orchestration.yaml` which contains the workflow you would like. We advise you put this file into a `secrets/` folder and do not share it with others because it will contain passwords and other secrets.
|
||||
|
||||
The great thing is you configure all the workflow in your `orchestration.yaml` file which we advise you put into a `secrets/` folder and don't share it with others because it will contain passwords and other secrets.
|
||||
|
||||
The structure of orchestration file is split into 2 parts: `steps` (what **steps** to use) and `configs` (how those steps should behave), here's a simplification:
|
||||
The structure of orchestration file is split into 2 parts: `steps` (what **steps** to use) and `configurations` (how those steps should behave), here's a simplification:
|
||||
```yaml
|
||||
# orchestration.yaml content
|
||||
steps:
|
||||
@@ -113,10 +111,12 @@ configurations:
|
||||
# ... configurations for the other steps here ...
|
||||
```
|
||||
|
||||
To see all available `steps` (which archivers, storages, databses, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
|
||||
|
||||
All the `configurations` in the `orchestration.yaml` file (you can name it differently but need to pass it in the `--config FILENAME` argument) can be seen in the console by using the `--help` flag. They can also be overwritten, for example if you are using the `cli_feeder` to archive from the command line and want to provide the URLs you should do:
|
||||
|
||||
```bash
|
||||
auto-archiver --config orchestration.yaml --cli_feeder.urls="url1,url2,url3"
|
||||
auto-archiver --config secrets/orchestration.yaml --cli_feeder.urls="url1,url2,url3"
|
||||
```
|
||||
|
||||
Here's the complete workflow that the auto-archiver goes through:
|
||||
@@ -193,7 +193,7 @@ Use `python -m src.auto_archiver --config secrets/orchestration.yaml` to run fro
|
||||
#### Docker development
|
||||
working with docker locally:
|
||||
* `docker build . -t auto-archiver` to build a local image
|
||||
* `docker run --rm -v $PWD/secrets:/app/secrets aa --config secrets/config.yaml`
|
||||
* `docker run --rm -v $PWD/secrets:/app/secrets aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml`
|
||||
* to use local archive, also create a volume `-v` for it by adding `-v $PWD/local_archive:/app/local_archive`
|
||||
|
||||
|
||||
|
||||
@@ -45,11 +45,9 @@ configurations:
|
||||
archive: archive location
|
||||
date: archive date
|
||||
thumbnail: thumbnail
|
||||
thumbnail_index: thumbnail index
|
||||
timestamp: upload timestamp
|
||||
title: upload title
|
||||
text: textual content
|
||||
duration: duration
|
||||
screenshot: screenshot
|
||||
hash: hash
|
||||
wacz: wacz
|
||||
|
||||
@@ -10,6 +10,7 @@ from googleapiclient.errors import HttpError
|
||||
# You can run this code to get a new token and verify it belongs to the correct user
|
||||
# This token will be refresh automatically by the auto-archiver
|
||||
# Code below from https://developers.google.com/drive/api/quickstart/python
|
||||
# Example invocation: py scripts/create_update_gdrive_oauth_token.py -c secrets/credentials.json -t secrets/gd-token.json
|
||||
|
||||
SCOPES = ['https://www.googleapis.com/auth/drive']
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ class InstagramTbotArchiver(Archiver):
|
||||
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
|
||||
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
|
||||
"session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
|
||||
"timeout": {"default": 15, "help": "timeout to fetch the instagram content in seconds."},
|
||||
"timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
|
||||
}
|
||||
|
||||
def setup(self) -> None:
|
||||
@@ -52,9 +52,9 @@ class InstagramTbotArchiver(Archiver):
|
||||
attempts = 0
|
||||
seen_media = []
|
||||
message = ""
|
||||
time.sleep(4)
|
||||
time.sleep(3)
|
||||
# media is added before text by the bot so it can be used as a stop-logic mechanism
|
||||
while attempts < self.timeout and (not message or not len(seen_media)):
|
||||
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
|
||||
attempts += 1
|
||||
time.sleep(1)
|
||||
for post in self.client.iter_messages(chat, min_id=since_id):
|
||||
|
||||
@@ -1,34 +0,0 @@
|
||||
|
||||
#TODO: refactor GDriveStorage before merging to main
|
||||
# is it possible to have something like this with the new pipeline?
|
||||
|
||||
|
||||
# # import tempfile
|
||||
# import auto_archive
|
||||
# from loguru import logger
|
||||
# from configs import Config
|
||||
# from storages import Storage
|
||||
|
||||
|
||||
# def main():
|
||||
# c = Config()
|
||||
# c.parse()
|
||||
# logger.info(f'Opening document {c.sheet} to look for sheet names to archive')
|
||||
|
||||
# gc = c.gsheets_client
|
||||
# sh = gc.open(c.sheet)
|
||||
|
||||
# wks = sh.get_worksheet(0)
|
||||
# values = wks.get_all_values()
|
||||
|
||||
# with tempfile.TemporaryDirectory(dir="./") as tmpdir:
|
||||
# Storage.TMP_FOLDER = tmpdir
|
||||
# for i in range(11, len(values)):
|
||||
# c.sheet = values[i][0]
|
||||
# logger.info(f"Processing {c.sheet}")
|
||||
# auto_archive.process_sheet(c)
|
||||
# c.destroy_webdriver()
|
||||
|
||||
|
||||
# if __name__ == "__main__":
|
||||
# main()
|
||||
@@ -27,7 +27,6 @@ class ArchivingContext:
|
||||
|
||||
@staticmethod
|
||||
def set(key, value, keep_on_reset: bool = False):
|
||||
logger.error(f"SET [{key}]={value}")
|
||||
ac = ArchivingContext.get_instance()
|
||||
ac.configs[key] = value
|
||||
if keep_on_reset: ac.keep_on_reset.add(key)
|
||||
|
||||
@@ -19,7 +19,7 @@ class Media:
|
||||
urls: List[str] = field(default_factory=list)
|
||||
properties: dict = field(default_factory=dict)
|
||||
_mimetype: str = None # eg: image/jpeg
|
||||
_stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True)) # always exclude
|
||||
_stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True)) # always exclude
|
||||
|
||||
def store(self: Media, override_storages: List = None, url: str = "url-not-available"):
|
||||
# stores the media into the provided/available storages [Storage]
|
||||
@@ -42,7 +42,7 @@ class Media:
|
||||
s.store(prop_media, url)
|
||||
|
||||
def is_stored(self) -> bool:
|
||||
return len(self.urls) > 0
|
||||
return len(self.urls) > 0 and len(self.urls) == len(ArchivingContext.get("storages"))
|
||||
|
||||
def set(self, key: str, value: Any) -> Media:
|
||||
self.properties[key] = value
|
||||
|
||||
@@ -47,7 +47,7 @@ class Metadata:
|
||||
# calls .store for all contained media. storages [Storage]
|
||||
storages = override_storages or ArchivingContext.get("storages")
|
||||
for media in self.media:
|
||||
media.store(override_storages=storages)
|
||||
media.store(override_storages=storages, url=self.get_url())
|
||||
|
||||
def set(self, key: str, val: Any) -> Metadata:
|
||||
self.metadata[key] = val
|
||||
@@ -89,7 +89,8 @@ class Metadata:
|
||||
|
||||
def set_content(self, content: str) -> Metadata:
|
||||
# a dump with all the relevant content
|
||||
return self.set("content", content)
|
||||
append_content = (self.get("content", "") + content + "\n").strip()
|
||||
return self.set("content", append_content)
|
||||
|
||||
def set_title(self, title: str) -> Metadata:
|
||||
return self.set("title", title)
|
||||
|
||||
@@ -93,7 +93,7 @@ class ArchivingOrchestrator:
|
||||
# Q: should this be refactored so it's just a.download(result)?
|
||||
result.merge(a.download(result))
|
||||
if result.is_success(): break
|
||||
except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}")
|
||||
except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}: {traceback.format_exc()}")
|
||||
|
||||
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
|
||||
# should it call the HTMLgenerator as if it's not an enrichment?
|
||||
@@ -105,7 +105,7 @@ class ArchivingOrchestrator:
|
||||
# eg: screenshot, wacz, webarchive, thumbnails
|
||||
for e in self.enrichers:
|
||||
try: e.enrich(result)
|
||||
except Exception as exc: logger.error(f"Unexpected error with enricher {e.name}: {exc}")
|
||||
except Exception as exc: logger.error(f"Unexpected error with enricher {e.name}: {exc}: {traceback.format_exc()}")
|
||||
|
||||
# 5 - store media
|
||||
# looks for Media in result.media and also result.media[x].properties (as list or dict values)
|
||||
@@ -114,7 +114,7 @@ class ArchivingOrchestrator:
|
||||
# 6 - format and store formatted if needed
|
||||
# enrichers typically need access to already stored URLs etc
|
||||
if (final_media := self.formatter.format(result)):
|
||||
final_media.store()
|
||||
final_media.store(url=url)
|
||||
result.set_final_media(final_media)
|
||||
|
||||
if result.is_empty():
|
||||
|
||||
@@ -62,8 +62,9 @@ class GsheetsDb(Database):
|
||||
batch_if_valid('archive', "\n".join(media.urls))
|
||||
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
|
||||
batch_if_valid('title', item.get_title())
|
||||
batch_if_valid('text', item.get("content", "")[:500])
|
||||
batch_if_valid('text', item.get("content", ""))
|
||||
batch_if_valid('timestamp', item.get_timestamp())
|
||||
batch_if_valid('hash', media.get("hash", "not-calculated"))
|
||||
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
|
||||
batch_if_valid('screenshot', "\n".join(screenshot.urls))
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ import hashlib
|
||||
from loguru import logger
|
||||
|
||||
from . import Enricher
|
||||
from ..core import Metadata
|
||||
from ..core import Metadata, ArchivingContext
|
||||
|
||||
|
||||
class HashEnricher(Enricher):
|
||||
@@ -17,6 +17,7 @@ class HashEnricher(Enricher):
|
||||
algo_choices = self.configs()["algorithm"]["choices"]
|
||||
assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
|
||||
self.chunksize = int(self.chunksize)
|
||||
ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
|
||||
@@ -26,36 +26,58 @@ class WaczEnricher(Enricher):
|
||||
|
||||
def enrich(self, to_enrich: Metadata) -> bool:
|
||||
# TODO: figure out support for browsertrix in docker
|
||||
|
||||
url = to_enrich.get_url()
|
||||
|
||||
if UrlUtil.is_auth_wall(url):
|
||||
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
|
||||
return
|
||||
|
||||
logger.debug(f"generating WACZ for {url=}")
|
||||
|
||||
collection = str(uuid.uuid4())[0:8]
|
||||
browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
|
||||
cmd = [
|
||||
"docker", "run",
|
||||
"--rm", # delete container once it has completed running
|
||||
"-v", f"{browsertrix_home}:/crawls/",
|
||||
# "-it", # this leads to "the input device is not a TTY"
|
||||
"webrecorder/browsertrix-crawler", "crawl",
|
||||
"--url", url,
|
||||
"--scopeType", "page",
|
||||
"--generateWACZ",
|
||||
"--text",
|
||||
"--collection", collection,
|
||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||
"--behaviorTimeout", str(self.timeout),
|
||||
"--timeout", str(self.timeout)
|
||||
]
|
||||
if self.profile:
|
||||
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
|
||||
shutil.copyfile(self.profile, profile_fn)
|
||||
# TODO: test which is right
|
||||
cmd.extend(["--profile", profile_fn])
|
||||
# cmd.extend(["--profile", "/crawls/profile.tar.gz"])
|
||||
|
||||
if os.getenv('RUNNING_IN_DOCKER'):
|
||||
logger.debug(f"generating WACZ without Docker for {url=}")
|
||||
|
||||
cmd = [
|
||||
"crawl",
|
||||
"--url", url,
|
||||
"--scopeType", "page",
|
||||
"--generateWACZ",
|
||||
"--text",
|
||||
"--collection", collection,
|
||||
"--id", collection,
|
||||
"--saveState", "never",
|
||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||
"--behaviorTimeout", str(self.timeout),
|
||||
"--timeout", str(self.timeout),
|
||||
"--profile", str(self.profile)
|
||||
]
|
||||
else:
|
||||
logger.debug(f"generating WACZ in Docker for {url=}")
|
||||
|
||||
cmd = [
|
||||
"docker", "run",
|
||||
"--rm", # delete container once it has completed running
|
||||
"-v", f"{browsertrix_home}:/crawls/",
|
||||
# "-it", # this leads to "the input device is not a TTY"
|
||||
"webrecorder/browsertrix-crawler", "crawl",
|
||||
"--url", url,
|
||||
"--scopeType", "page",
|
||||
"--generateWACZ",
|
||||
"--text",
|
||||
"--collection", collection,
|
||||
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
|
||||
"--behaviorTimeout", str(self.timeout),
|
||||
"--timeout", str(self.timeout)
|
||||
]
|
||||
|
||||
if self.profile:
|
||||
profile_fn = os.path.join(browsertrix_home, "profile.tar.gz")
|
||||
shutil.copyfile(self.profile, profile_fn)
|
||||
# TODO: test which is right
|
||||
cmd.extend(["--profile", profile_fn])
|
||||
# cmd.extend(["--profile", "/crawls/profile.tar.gz"])
|
||||
|
||||
try:
|
||||
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
|
||||
@@ -64,7 +86,13 @@ class WaczEnricher(Enricher):
|
||||
logger.error(f"WACZ generation failed: {e}")
|
||||
return False
|
||||
|
||||
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
|
||||
|
||||
|
||||
if os.getenv('RUNNING_IN_DOCKER'):
|
||||
filename = os.path.join("collections", collection, f"{collection}.wacz")
|
||||
else:
|
||||
filename = os.path.join(browsertrix_home, "collections", collection, f"{collection}.wacz")
|
||||
|
||||
if not os.path.exists(filename):
|
||||
logger.warning(f"Unable to locate and upload WACZ {filename=}")
|
||||
return False
|
||||
|
||||
@@ -26,6 +26,7 @@ class WhisperEnricher(Enricher):
|
||||
return {
|
||||
"api_endpoint": {"default": "https://whisper.spoettel.dev/api/v1", "help": "WhisperApi api endpoint"},
|
||||
"api_key": {"default": None, "help": "WhisperApi api key for authentication"},
|
||||
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
|
||||
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
|
||||
"action": {"default": "translation", "help": "which Whisper operation to execute", "choices": ["transcript", "translation", "language_detection"]},
|
||||
|
||||
@@ -42,7 +43,7 @@ class WhisperEnricher(Enricher):
|
||||
job_results = {}
|
||||
for i, m in enumerate(to_enrich.media):
|
||||
if m.is_video() or m.is_audio():
|
||||
m.store()
|
||||
m.store(url=url)
|
||||
try:
|
||||
job_id = self.submit_job(m)
|
||||
job_results[job_id] = False
|
||||
@@ -58,8 +59,13 @@ class WhisperEnricher(Enricher):
|
||||
job_id = to_enrich.media[i].get("whisper_model")["job_id"]
|
||||
to_enrich.media[i].set("whisper_model", {
|
||||
"job_id": job_id,
|
||||
self.action: job_results[job_id]
|
||||
**(job_results[job_id] if job_results[job_id] else {"result": "incomplete or failed job"})
|
||||
})
|
||||
# append the extracted text to the content of the post so it gets written to the DBs like gsheets text column
|
||||
if job_results[job_id]:
|
||||
for k,v in job_results[job_id].items():
|
||||
if "_text" in k and len(v):
|
||||
to_enrich.set_content(f"\n[automatic video transcript]: {v}")
|
||||
|
||||
def submit_job(self, media: Media):
|
||||
s3 = self._get_s3_storage()
|
||||
@@ -81,7 +87,7 @@ class WhisperEnricher(Enricher):
|
||||
while not all_completed and (time.time() - start_time) <= self.timeout:
|
||||
all_completed = True
|
||||
for job_id in job_results:
|
||||
if job_results[job_id]: continue
|
||||
if job_results[job_id] != False: continue
|
||||
all_completed = False # at least one not ready
|
||||
try: job_results[job_id] = self.check_job(job_id)
|
||||
except Exception as e:
|
||||
@@ -100,18 +106,19 @@ class WhisperEnricher(Enricher):
|
||||
r_res = requests.get(f'{self.api_endpoint}/jobs/{job_id}/artifacts', headers={'Authorization': f'Bearer {self.api_key}'})
|
||||
assert r_res.status_code == 200, f"Job artifacts did not respond with 200, instead with: {r_res.status_code}"
|
||||
logger.success(r_res.json())
|
||||
result = []
|
||||
for artifact in r_res.json():
|
||||
result = {}
|
||||
for art_id, artifact in enumerate(r_res.json()):
|
||||
subtitle = []
|
||||
full_text = []
|
||||
for i, d in enumerate(artifact.get("data")):
|
||||
subtitle.append(f"{i+1}\n{d.get('start')} --> {d.get('end')}\n{d.get('text').strip()}")
|
||||
full_text.append(d.get('text').strip())
|
||||
if not len(subtitle): continue
|
||||
result.append({
|
||||
"subtitle": "\n".join(subtitle),
|
||||
"full_text": "\n".join(full_text),
|
||||
})
|
||||
if self.include_srt: result[f"artifact_{art_id}_subtitle"] = "\n".join(subtitle)
|
||||
result[f"artifact_{art_id}_text"] = "\n".join(full_text)
|
||||
# call /delete endpoint on timely success
|
||||
r_del = requests.delete(f'{self.api_endpoint}/jobs/{job_id}', headers={'Authorization': f'Bearer {self.api_key}'})
|
||||
logger.debug(f"DELETE whisper {job_id=} result: {r_del.status_code}")
|
||||
return result
|
||||
return False
|
||||
|
||||
|
||||
@@ -64,8 +64,13 @@ class GsheetsFeeder(Gsheets, Feeder):
|
||||
# All checks done - archival process starts here
|
||||
m = Metadata().set_url(url)
|
||||
ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
|
||||
if self.use_sheet_names_in_stored_paths:
|
||||
ArchivingContext.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
|
||||
folder = slugify(gw.get_cell(row, 'folder').strip())
|
||||
if len(folder):
|
||||
if self.use_sheet_names_in_stored_paths:
|
||||
ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True)
|
||||
else:
|
||||
ArchivingContext.set("folder", folder, True)
|
||||
|
||||
yield m
|
||||
|
||||
logger.success(f'Finished worksheet {wks.title}')
|
||||
|
||||
@@ -8,6 +8,7 @@ from loguru import logger
|
||||
from ..version import __version__
|
||||
from ..core import Metadata, Media, ArchivingContext
|
||||
from . import Formatter
|
||||
from ..enrichers import HashEnricher
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -46,11 +47,16 @@ class HtmlFormatter(Formatter):
|
||||
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{str(uuid.uuid4())}.html")
|
||||
with open(html_path, mode="w", encoding="utf-8") as outf:
|
||||
outf.write(content)
|
||||
return Media(filename=html_path)
|
||||
final_media = Media(filename=html_path)
|
||||
|
||||
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
|
||||
if len(hd := he.calculate_hash(final_media.filename)):
|
||||
final_media.set("hash", f"{he.algorithm}:{hd}")
|
||||
|
||||
return final_media
|
||||
|
||||
|
||||
# JINJA helper filters
|
||||
|
||||
class JinjaHelpers:
|
||||
@staticmethod
|
||||
def is_list(v) -> bool:
|
||||
|
||||
@@ -42,7 +42,7 @@
|
||||
}
|
||||
|
||||
.copy:hover {
|
||||
font-weight: 600;
|
||||
background: aliceblue;
|
||||
cursor: copy;
|
||||
}
|
||||
|
||||
|
||||
@@ -43,7 +43,7 @@ class Storage(Step):
|
||||
|
||||
def store(self, media: Media, url: str) -> None:
|
||||
if media.is_stored():
|
||||
logger.debug(f"{self.key} already stored, skipping")
|
||||
logger.debug(f"{media.key} already stored, skipping")
|
||||
return
|
||||
self.set_key(media, url)
|
||||
self.upload(media)
|
||||
@@ -77,7 +77,7 @@ class Storage(Step):
|
||||
# filename_generator logic
|
||||
if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
|
||||
elif self.filename_generator == "static":
|
||||
he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
|
||||
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
|
||||
hd = he.calculate_hash(media.filename)
|
||||
filename = hd[:24]
|
||||
|
||||
|
||||
@@ -30,11 +30,9 @@ class Gsheets(Step):
|
||||
'archive': 'archive location',
|
||||
'date': 'archive date',
|
||||
'thumbnail': 'thumbnail',
|
||||
'thumbnail_index': 'thumbnail index',
|
||||
'timestamp': 'upload timestamp',
|
||||
'title': 'upload title',
|
||||
'text': 'text content',
|
||||
'duration': 'duration',
|
||||
'screenshot': 'screenshot',
|
||||
'hash': 'hash',
|
||||
'wacz': 'wacz',
|
||||
|
||||
@@ -15,10 +15,8 @@ class GWorksheet:
|
||||
'archive': 'archive location',
|
||||
'date': 'archive date',
|
||||
'thumbnail': 'thumbnail',
|
||||
'thumbnail_index': 'thumbnail index',
|
||||
'timestamp': 'upload timestamp',
|
||||
'title': 'upload title',
|
||||
'duration': 'duration',
|
||||
'screenshot': 'screenshot',
|
||||
'hash': 'hash',
|
||||
'wacz': 'wacz',
|
||||
@@ -98,7 +96,7 @@ class GWorksheet:
|
||||
cell_updates = [
|
||||
{
|
||||
'range': self.to_a1(row, col),
|
||||
'values': [[val]]
|
||||
'values': [[str(val)[0:49999]]]
|
||||
}
|
||||
for row, col, val in cell_updates
|
||||
]
|
||||
|
||||
@@ -3,7 +3,7 @@ _MAJOR = "0"
|
||||
_MINOR = "5"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "6"
|
||||
_PATCH = "12"
|
||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||
_SUFFIX = ""
|
||||
|
||||
Reference in New Issue
Block a user