Merge branch 'load_modules' into timestamping_rewrite

This commit is contained in:
Patrick Robertson
2025-02-11 15:21:31 +00:00
122 changed files with 3281 additions and 1011 deletions

View File

@@ -1 +1 @@
from api_db import AAApiDb
from .api_db import AAApiDb

View File

@@ -1,28 +1,49 @@
{
"name": "Auto-Archiver API Database",
"type": ["database"],
"entry_point": "api_db:AAApiDb",
"entry_point": "api_db::AAApiDb",
"requires_setup": True,
"external_dependencies": {
"python": ["requests",
"loguru"],
"dependencies": {
"python": ["requests", "loguru"],
},
"configs": {
"api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
"api_token": {"default": None, "help": "API Bearer token."},
"public": {"default": False, "help": "whether the URL should be publicly available via the API"},
"author_id": {"default": None, "help": "which email to assign as author"},
"group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
"allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived", "type": "bool",},
"store_results": {"default": True, "help": "when set, will send the results to the API database.", "type": "bool",},
"tags": {"default": [], "help": "what tags to add to the archived URL",}
"api_endpoint": {
"required": True,
"help": "API endpoint where calls are made to",
},
"api_token": {"default": None,
"help": "API Bearer token."},
"public": {
"default": False,
"type": "bool",
"help": "whether the URL should be publicly available via the API",
},
"author_id": {"default": None, "help": "which email to assign as author"},
"group_id": {
"default": None,
"help": "which group of users have access to the archive in case public=false as author",
},
"use_api_cache": {
"default": True,
"type": "bool",
"help": "if True then the API database will be queried prior to any archiving operations and stop if the link has already been archived",
},
"store_results": {
"default": True,
"type": "bool",
"help": "when set, will send the results to the API database.",
},
"tags": {
"default": [],
"help": "what tags to add to the archived URL",
},
},
"description": """
Provides integration with the Auto-Archiver API for querying and storing archival data.
### Features
- **API Integration**: Supports querying for existing archives and submitting results.
- **Duplicate Prevention**: Avoids redundant archiving when `allow_rearchive` is disabled.
- **Duplicate Prevention**: Avoids redundant archiving when `use_api_cache` is enabled.
- **Configurable**: Supports settings like API endpoint, authentication token, tags, and permissions.
- **Tagging and Metadata**: Adds tags and manages metadata for archives.
- **Optional Storage**: Archives results conditionally based on configuration.

View File

@@ -1,5 +1,7 @@
from typing import Union
import requests, os
import os
import requests
from loguru import logger
from auto_archiver.core import Database
@@ -7,27 +9,17 @@ from auto_archiver.core import Metadata
class AAApiDb(Database):
"""
Connects to auto-archiver-api instance
"""
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
self.allow_rearchive = bool(self.allow_rearchive)
self.store_results = bool(self.store_results)
self.assert_valid_string("api_endpoint")
"""Connects to auto-archiver-api instance"""
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
""" query the database for the existence of this item.
Helps avoid re-archiving the same URL multiple times.
"""
if not self.allow_rearchive: return
if not self.use_api_cache: return
params = {"url": item.get_url(), "limit": 15}
headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers)
response = requests.get(os.path.join(self.api_endpoint, "url/search"), params=params, headers=headers)
if response.status_code == 200:
if len(response.json()):
@@ -38,21 +30,26 @@ class AAApiDb(Database):
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
return False
def done(self, item: Metadata, cached: bool=False) -> None:
def done(self, item: Metadata, cached: bool = False) -> None:
"""archival result ready - should be saved to DB"""
if not self.store_results: return
if cached:
if cached:
logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
return
logger.debug(f"saving archive of {item.get_url()} to the AA API.")
payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
payload = {
'author_id': self.author_id,
'url': item.get_url(),
'public': self.public,
'group_id': self.group_id,
'tags': list(self.tags),
'result': item.to_json(),
}
headers = {"Authorization": f"Bearer {self.api_token}"}
response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers)
response = requests.post(os.path.join(self.api_endpoint, "interop/submit-archive"), json=payload, headers=headers)
if response.status_code == 200:
if response.status_code == 201:
logger.success(f"AA API: {response.json()}")
else:
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")

View File

@@ -1 +0,0 @@
from .atlos import AtlosStorage

View File

@@ -1,40 +0,0 @@
{
"name": "atlos_storage",
"type": ["storage"],
"requires_setup": True,
"external_dependencies": {"python": ["loguru", "requests"], "bin": [""]},
"configs": {
"path_generator": {
"default": "url",
"help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
},
"filename_generator": {
"default": "random",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
},
"api_token": {
"default": None,
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
"type": "str",
},
"atlos_url": {
"default": "https://platform.atlos.org",
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
"type": "str",
},
},
"description": """
AtlosStorage: A storage module for saving media files to the Atlos platform.
### Features
- Uploads media files to Atlos using Atlos-specific APIs.
- Automatically calculates SHA-256 hashes of media files for integrity verification.
- Skips uploads for files that already exist on Atlos with the same hash.
- Supports attaching metadata, such as `atlos_id`, to the uploaded files.
- Provides CDN-like URLs for accessing uploaded media.
### Notes
- Requires Atlos API configuration, including `atlos_url` and `api_token`.
- Files are linked to an `atlos_id` in the metadata, ensuring proper association with Atlos source materials.
""",
}

View File

@@ -1,9 +1,9 @@
{
"name": "Atlos Database",
"type": ["database"],
"entry_point": "atlos_db:AtlosDb",
"entry_point": "atlos_db::AtlosDb",
"requires_setup": True,
"external_dependencies":
"dependencies":
{"python": ["loguru",
""],
"bin": [""]},

View File

@@ -1,14 +1,10 @@
import os
from typing import Union
from loguru import logger
from csv import DictWriter
from dataclasses import asdict
import requests
from loguru import logger
from auto_archiver.core import Database
from auto_archiver.core import Metadata
from auto_archiver.utils import get_atlos_config_options
class AtlosDb(Database):

View File

@@ -2,14 +2,14 @@
"name": "Atlos Feeder",
"type": ["feeder"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": ["loguru", "requests"],
},
"configs": {
"api_token": {
"default": None,
"type": "str",
"required": True,
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
"type": "str"
},
"atlos_url": {
"default": "https://platform.atlos.org",

View File

@@ -1,19 +1,12 @@
from loguru import logger
import requests
from loguru import logger
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.utils import get_atlos_config_options
from auto_archiver.core import Metadata
class AtlosFeeder(Feeder):
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
if type(self.api_token) != str:
raise Exception("Atlos Feeder did not receive an Atlos API token")
def __iter__(self) -> Metadata:
# Get all the urls from the Atlos API
count = 0
@@ -47,5 +40,3 @@ class AtlosFeeder(Feeder):
if len(data["results"]) == 0 or cursor is None:
break
logger.success(f"Processed {count} URL(s)")

View File

@@ -1,12 +1,12 @@
import os
from typing import IO, List, Optional
from loguru import logger
import requests
import hashlib
import os
from typing import IO, Optional
import requests
from loguru import logger
from auto_archiver.core import Media, Metadata
from auto_archiver.core import Storage
from auto_archiver.utils import get_atlos_config_options
class AtlosStorage(Storage):

View File

@@ -1 +0,0 @@
from .cli_feeder import CLIFeeder

View File

@@ -1,27 +0,0 @@
{
"name": "CLI Feeder",
"type": ["feeder"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru"],
},
'entry_point': 'cli_feeder::CLIFeeder',
"configs": {
"urls": {
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
"nargs": "+",
"required": True,
"do_not_store": True,
"metavar": "INPUT URLS",
},
},
"description": """
Processes URLs to archive passed via the command line and feeds them into the archiving pipeline.
### Features
- Takes a single URL or a list of URLs provided via the command line.
- Converts each URL into a `Metadata` object and yields it for processing.
- Ensures URLs are processed only if they are explicitly provided.
"""
}

View File

@@ -1,15 +0,0 @@
from loguru import logger
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext
class CLIFeeder(Feeder):
    """Feeder that turns the URLs held in ``self.urls`` into ``Metadata`` items."""

    def __iter__(self) -> Metadata:
        """Yield one ``Metadata`` object per entry in ``self.urls``.

        Each URL is logged at debug level before it is yielded; a success
        message with the number of URLs is logged once iteration finishes.
        """
        for cli_url in self.urls:
            logger.debug(f"Processing URL: '{cli_url}'")
            item = Metadata().set_url(cli_url)
            yield item
            # Runs only after the consumer resumes the generator, i.e. once
            # the yielded item has been handed over.
            # NOTE(review): assumed to sit inside the loop -- confirm.
            ArchivingContext.set("folder", "cli")

        total = len(self.urls)
        logger.success(f"Processed {total} URL(s)")

View File

@@ -2,7 +2,7 @@
"name": "Console Database",
"type": ["database"],
"requires_setup": False,
"external_dependencies": {
"dependencies": {
"python": ["loguru"],
},
"description": """

View File

@@ -2,7 +2,7 @@
"name": "CSV Database",
"type": ["database"],
"requires_setup": False,
"external_dependencies": {"python": ["loguru"]
"dependencies": {"python": ["loguru"]
},
'entry_point': 'csv_db::CSVDb',
"configs": {

View File

@@ -2,7 +2,7 @@
"name": "CSV Feeder",
"type": ["feeder"],
"requires_setup": False,
"external_dependencies": {
"dependencies": {
"python": ["loguru"],
"bin": [""]
},
@@ -13,6 +13,9 @@
"default": None,
"help": "Path to the input file(s) to read the URLs from, comma separated. \
Input files should be formatted with one URL per line",
"required": True,
"type": "valid_file",
"nargs": "+",
},
"column": {
"default": None,
@@ -26,9 +29,9 @@
- Supports reading URLs from multiple input files, specified as a comma-separated list.
- Allows specifying the column number or name to extract URLs from.
- Skips header rows if the first value is not a valid URL.
- Integrates with the `ArchivingContext` to manage URL feeding.
### Setu N
- Input files should be formatted with one URL per line.
### Setup
- Input files should be formatted with one URL per line, with or without a header row.
- If you have a header row, you can specify the column number or name to read URLs from using the 'column' config option.
"""
}

View File

@@ -2,24 +2,37 @@ from loguru import logger
import csv
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.core import Metadata
from auto_archiver.utils import url_or_none
class CSVFeeder(Feeder):
column = None
def __iter__(self) -> Metadata:
url_column = self.column or 0
for file in self.files:
with open(file, "r") as f:
reader = csv.reader(f)
first_row = next(reader)
if not(url_or_none(first_row[url_column])):
# it's a header row, skip it
url_column = self.column or 0
if isinstance(url_column, str):
try:
url_column = first_row.index(url_column)
except ValueError:
logger.error(f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?")
return
elif not(url_or_none(first_row[url_column])):
# it's a header row, but we've been given a column number already
logger.debug(f"Skipping header row: {first_row}")
for row in reader:
url = row[0]
logger.debug(f"Processing {url}")
yield Metadata().set_url(url)
ArchivingContext.set("folder", "cli")
else:
# first row isn't a header row, rewind the file
f.seek(0)
logger.success(f"Processed {len(self.urls)} URL(s)")
for row in reader:
if not url_or_none(row[url_column]):
logger.warning(f"Not a valid URL in row: {row}, skipping")
continue
url = row[url_column]
logger.debug(f"Processing {url}")
yield Metadata().set_url(url)

View File

@@ -1,14 +1,14 @@
{
"name": "Google Drive Storage",
"type": ["storage"],
"author": "Dave Mateer",
"entry_point": "gdrive_storage::GDriveStorage",
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": [
"loguru",
"google-api-python-client",
"google-auth",
"google-auth-oauthlib",
"google-auth-httplib2"
"googleapiclient",
"google",
],
},
"configs": {
@@ -18,17 +18,23 @@
"choices": ["flat", "url", "random"],
},
"filename_generator": {
"default": "random",
"default": "static",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
"choices": ["random", "static"],
},
"root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
"oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
"root_folder_id": {"required": True,
"help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
"oauth_token": {"default": None,
"help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."},
},
"description": """
GDriveStorage: A storage module for saving archived content to Google Drive.
Author: Dave Mateer, (And maintained by: )
Source Documentation: https://davemateer.com/2022/04/28/google-drive-with-python
### Features
- Saves media files to Google Drive, organizing them into folders based on the provided path structure.
- Supports OAuth token-based authentication or service account credentials for API access.
@@ -39,5 +45,55 @@
- Requires setup with either a Google OAuth token or a service account JSON file.
- Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure.
- Automatically handles Google Drive API token refreshes for long-running jobs.
"""
## Overview
This module integrates Google Drive as a storage backend, enabling automatic folder creation and file uploads. It supports authentication via **service accounts** (recommended for automation) or **OAuth tokens** (for user-based authentication).
## Features
- Saves files to Google Drive, organizing them into structured folders.
- Supports both **service account** and **OAuth token** authentication.
- Automatically creates folders if they don't exist.
- Generates public URLs for easy file sharing.
## Setup Guide
1. **Enable Google Drive API**
- Create a Google Cloud project at [Google Cloud Console](https://console.cloud.google.com/)
- Enable the **Google Drive API**.
2. **Set Up a Google Drive Folder**
- Create a folder in **Google Drive** and copy its **folder ID** from the URL.
- Add the **folder ID** to your configuration (`orchestration.yaml`):
```yaml
root_folder_id: "FOLDER_ID"
```
3. **Authentication Options**
- **Option 1: Service Account (Recommended)**
- Create a **service account** in Google Cloud IAM.
- Download the JSON key file and save it as:
```
secrets/service_account.json
```
- **Share your Drive folder** with the service account's `client_email` (found in the JSON file).
- **Option 2: OAuth Token (User Authentication)**
- Create OAuth **Desktop App credentials** in Google Cloud.
- Save the credentials as:
```
secrets/oauth_credentials.json
```
- Generate an OAuth token by running:
```sh
python scripts/create_update_gdrive_oauth_token.py -c secrets/oauth_credentials.json
```
Notes on the OAuth token:
Tokens are refreshed after 1 hour; however, they keep working for 7 days (tbc),
so as long as the job doesn't last for 7 days, this method of refreshing only once per run will work.
see this link for details on the token:
https://davemateer.com/2022/04/28/google-drive-with-python#tokens
"""
}

View File

@@ -1,68 +1,67 @@
import shutil, os, time, json
import json
import os
import time
from typing import IO
from loguru import logger
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google.auth.transport.requests import Request
from google.oauth2 import service_account
from google.oauth2.credentials import Credentials
from google.auth.transport.requests import Request
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from loguru import logger
from auto_archiver.core import Media
from auto_archiver.core import Storage
class GDriveStorage(Storage):
def __init__(self, config: dict) -> None:
super().__init__(config)
def setup(self) -> None:
self.scopes = ['https://www.googleapis.com/auth/drive']
# Initialize Google Drive service
self._setup_google_drive_service()
SCOPES = ['https://www.googleapis.com/auth/drive']
if self.oauth_token is not None:
"""
Tokens are refreshed after 1 hour
however keep working for 7 days (tbc)
so as long as the job doesn't last for 7 days
then this method of refreshing only once per run will work
see this link for details on the token
https://davemateer.com/2022/04/28/google-drive-with-python#tokens
"""
logger.debug(f'Using GD OAuth token {self.oauth_token}')
# workaround for missing 'refresh_token' in from_authorized_user_file
with open(self.oauth_token, 'r') as stream:
creds_json = json.load(stream)
creds_json['refresh_token'] = creds_json.get("refresh_token", "")
creds = Credentials.from_authorized_user_info(creds_json, SCOPES)
# creds = Credentials.from_authorized_user_file(self.oauth_token, SCOPES)
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
logger.debug('Requesting new GD OAuth token')
creds.refresh(Request())
else:
raise Exception("Problem with creds - create the token again")
# Save the credentials for the next run
with open(self.oauth_token, 'w') as token:
logger.debug('Saving new GD OAuth token')
token.write(creds.to_json())
else:
logger.debug('GD OAuth Token valid')
def _setup_google_drive_service(self):
"""Initialize Google Drive service based on provided credentials."""
if self.oauth_token:
logger.debug(f"Using Google Drive OAuth token: {self.oauth_token}")
self.service = self._initialize_with_oauth_token()
elif self.service_account:
logger.debug(f"Using Google Drive service account: {self.service_account}")
self.service = self._initialize_with_service_account()
else:
gd_service_account = self.service_account
logger.debug(f'Using GD Service Account {gd_service_account}')
creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES)
raise ValueError("Missing credentials: either `oauth_token` or `service_account` must be provided.")
self.service = build('drive', 'v3', credentials=creds)
def _initialize_with_oauth_token(self):
"""Initialize Google Drive service with OAuth token."""
with open(self.oauth_token, 'r') as stream:
creds_json = json.load(stream)
creds_json['refresh_token'] = creds_json.get("refresh_token", "")
creds = Credentials.from_authorized_user_info(creds_json, self.scopes)
if not creds.valid and creds.expired and creds.refresh_token:
creds.refresh(Request())
with open(self.oauth_token, 'w') as token_file:
logger.debug("Saving refreshed OAuth token.")
token_file.write(creds.to_json())
elif not creds.valid:
raise ValueError("Invalid OAuth token. Please regenerate the token.")
return build('drive', 'v3', credentials=creds)
def _initialize_with_service_account(self):
"""Initialize Google Drive service with service account."""
creds = service_account.Credentials.from_service_account_file(self.service_account, scopes=self.scopes)
return build('drive', 'v3', credentials=creds)
def get_cdn_url(self, media: Media) -> str:
"""
only support files saved in a folder for GD
S3 supports folder and all stored in the root
"""
# full_name = os.path.join(self.folder, media.key)
parent_id, folder_id = self.root_folder_id, None
path_parts = media.key.split(os.path.sep)
@@ -71,13 +70,16 @@ class GDriveStorage(Storage):
for folder in path_parts[0:-1]:
folder_id = self._get_id_from_parent_and_name(parent_id, folder, use_mime_type=True, raise_on_missing=True)
parent_id = folder_id
# get id of file inside folder (or sub folder)
file_id = self._get_id_from_parent_and_name(folder_id, filename)
file_id = self._get_id_from_parent_and_name(folder_id, filename, raise_on_missing=True)
if not file_id:
#
logger.info(f"file {filename} not found in folder {folder_id}")
return None
return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
def upload(self, media: Media, **kwargs) -> bool:
logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}')
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
"""
1. for each sub-folder in the path check if exists or create
2. upload file to root_id/other_paths.../filename
@@ -105,7 +107,13 @@ class GDriveStorage(Storage):
# must be implemented even if unused
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False):
def _get_id_from_parent_and_name(self, parent_id: str,
name: str,
retries: int = 1,
sleep_seconds: int = 10,
use_mime_type: bool = False,
raise_on_missing: bool = True,
use_cache=False):
"""
Retrieves the id of a folder or file from its @name and the @parent_id folder
Optionally does multiple @retries and sleeps @sleep_seconds between them
@@ -168,8 +176,3 @@ class GDriveStorage(Storage):
gd_folder = self.service.files().create(supportsAllDrives=True, body=file_metadata, fields='id').execute()
return gd_folder.get('id')
# def exists(self, key):
# try:
# self.get_cdn_url(key)
# return True
# except: return False

View File

@@ -20,6 +20,7 @@ the broader archiving framework.
- Retrieves metadata like titles, descriptions, upload dates, and durations.
- Downloads subtitles and comments when enabled.
- Configurable options for handling live streams, proxies, and more.
- Supports authentication of websites using the 'authentication' settings from your orchestration.
### Dropins
- For websites supported by `yt-dlp` that also contain posts in addition to videos
@@ -29,10 +30,6 @@ custom dropins can be created to handle additional websites and passed to the ar
via the command line using the `--dropins` option (TODO!).
""",
"configs": {
"facebook_cookie": {
"default": None,
"help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'",
},
"subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"},
"comments": {
"default": False,
@@ -67,14 +64,5 @@ via the command line using the `--dropins` option (TODO!).
"default": "inf",
"help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
},
"cookies_from_browser": {
"default": None,
"type": "str",
"help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale",
},
"cookie_file": {
"default": None,
"help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp",
},
},
}

View File

@@ -23,19 +23,8 @@ class Bluesky(GenericDropin):
def extract_post(self, url: str, ie_instance: InfoExtractor) -> dict:
# TODO: If/when this PR (https://github.com/yt-dlp/yt-dlp/pull/12098) is merged on ytdlp, remove the comments and delete the code below
# handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
# return ie_instance._extract_post(handle=handle, post_id=video_id)
handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
return ie_instance._download_json(
'https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread',
video_id, query={
'uri': f'at://{handle}/app.bsky.feed.post/{video_id}',
'depth': 0,
'parentHeight': 0,
})['thread']['post']
return ie_instance._extract_post(handle=handle, post_id=video_id)
def _download_bsky_embeds(self, post: dict, archiver: Extractor) -> list[Media]:
"""

View File

@@ -0,0 +1,17 @@
from .dropin import GenericDropin
class Facebook(GenericDropin):
    """Dropin that extracts post data for Facebook URLs through a yt-dlp extractor instance."""

    def extract_post(self, url: str, ie_instance):
        """Download the page for ``url`` and return the metadata extracted from it.

        Args:
            url: the Facebook URL being archived.
            ie_instance: the extractor object whose ``_match_valid_url``,
                ``_download_webpage`` and ``_extract_from_url`` members are used.

        Returns:
            Whatever ``extract_metadata`` produces for the downloaded page.
        """
        video_id = ie_instance._match_valid_url(url).group('id')

        # NOTE(review): the response of this first request (mobile host swapped
        # for the www host) is discarded; only the second download is parsed.
        # Confirm whether the normalised page was meant to be the one used.
        desktop_url = url.replace('://m.facebook.com/', '://www.facebook.com/')
        ie_instance._download_webpage(desktop_url, video_id)

        page_id = ie_instance._match_valid_url(url).group('id')
        webpage = ie_instance._download_webpage(url, page_id)

        # NOTE(review): this reads `extract_metadata` as an attribute of
        # `_extract_from_url` -- verify the extractor actually exposes it.
        return ie_instance._extract_from_url.extract_metadata(webpage)

    def create_metadata(self, post: dict, ie_instance, archiver, url):
        """Build the archiver metadata for ``url`` from the extracted ``post`` dict.

        The post's ``title`` and ``description`` entries become the title and
        content, and the whole ``post`` is passed to ``set_post_data``.
        """
        metadata = archiver.create_metadata(url)
        with_title = metadata.set_title(post.get('title'))
        with_content = with_title.set_content(post.get('description'))
        with_content.set_post_data(post)
        return metadata

View File

@@ -6,7 +6,7 @@ from yt_dlp.extractor.common import InfoExtractor
from loguru import logger
from auto_archiver.core.extractor import Extractor
from ...core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
class GenericExtractor(Extractor):
_dropins = {}
@@ -266,19 +266,30 @@ class GenericExtractor(Extractor):
def download(self, item: Metadata) -> Metadata:
url = item.get_url()
if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
logger.debug('Using Facebook cookie')
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
if item.netloc in ['youtube.com', 'www.youtube.com']:
if self.cookies_from_browser:
logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
elif self.cookie_file:
logger.debug(f'Using cookies from file {self.cookie_file}')
ydl_options['cookiefile'] = self.cookie_file
ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'),
'quiet': False, 'noplaylist': not self.allow_playlist ,
'writesubtitles': self.subtitles,'writeautomaticsub': self.subtitles,
"live_from_start": self.live_from_start, "proxy": self.proxy,
"max_downloads": self.max_downloads, "playlistend": self.max_downloads}
# set up auth
auth = self.auth_for_site(url, extract_cookies=False)
# order of importance: username/password -> api_key -> cookie -> cookie_from_browser -> cookies_file
if auth:
if 'username' in auth and 'password' in auth:
logger.debug(f'Using provided auth username and password for {url}')
ydl_options['username'] = auth['username']
ydl_options['password'] = auth['password']
elif 'cookie' in auth:
logger.debug(f'Using provided auth cookie for {url}')
yt_dlp.utils.std_headers['cookie'] = auth['cookie']
elif 'cookie_from_browser' in auth:
logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}')
ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
elif 'cookies_file' in auth:
logger.debug(f'Using cookies from file {self.cookie_file} for {url}')
ydl_options['cookiesfile'] = auth['cookies_file']
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"

View File

@@ -5,7 +5,7 @@ from loguru import logger
from slugify import slugify
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.utils import UrlUtil
from auto_archiver.utils import url as UrlUtil
from auto_archiver.core.extractor import Extractor
from .dropin import GenericDropin, InfoExtractor

View File

@@ -3,8 +3,8 @@
"type": ["database"],
"entry_point": "gsheet_db::GsheetsDb",
"requires_setup": True,
"external_dependencies": {
"python": ["loguru", "gspread", "python-slugify"],
"dependencies": {
"python": ["loguru", "gspread", "slugify"],
},
"configs": {
"allow_worksheets": {
@@ -17,6 +17,7 @@
},
"use_sheet_names_in_stored_paths": {
"default": True,
"type": "bool",
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
}
},

View File

@@ -1,39 +1,38 @@
from typing import Union, Tuple
import datetime
from urllib.parse import quote
from loguru import logger
from auto_archiver.core import Database
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.gsheet_feeder import GWorksheet
from auto_archiver.utils.misc import get_current_timestamp
class GsheetsDb(Database):
"""
NB: only works if GsheetFeeder is used.
could be updated in the future to support non-GsheetFeeder metadata
NB: only works if GsheetFeeder is used.
could be updated in the future to support non-GsheetFeeder metadata
"""
def started(self, item: Metadata) -> None:
logger.warning(f"STARTED {item}")
gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, 'status', 'Archive in progress')
gw.set_cell(row, "status", "Archive in progress")
def failed(self, item: Metadata, reason:str) -> None:
def failed(self, item: Metadata, reason: str) -> None:
logger.error(f"FAILED {item}")
self._safe_status_update(item, f'Archive failed {reason}')
self._safe_status_update(item, f"Archive failed {reason}")
def aborted(self, item: Metadata) -> None:
logger.warning(f"ABORTED {item}")
self._safe_status_update(item, '')
self._safe_status_update(item, "")
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
"""check if the given item has been archived already"""
return False
def done(self, item: Metadata, cached: bool=False) -> None:
def done(self, item: Metadata, cached: bool = False) -> None:
"""archival result ready - should be saved to DB"""
logger.success(f"DONE {item.get_url()}")
gw, row = self._retrieve_gsheet(item)
@@ -45,23 +44,25 @@ class GsheetsDb(Database):
def batch_if_valid(col, val, final_value=None):
final_value = final_value or val
try:
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == "":
cell_updates.append((row, col, final_value))
except Exception as e:
logger.error(f"Unable to batch {col}={final_value} due to {e}")
status_message = item.status
if cached:
status_message = f"[cached] {status_message}"
cell_updates.append((row, 'status', status_message))
cell_updates.append((row, "status", status_message))
media: Media = item.get_final_media()
if hasattr(media, "urls"):
batch_if_valid('archive', "\n".join(media.urls))
batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", ""))
batch_if_valid('timestamp', item.get_timestamp())
if media: batch_if_valid('hash', media.get("hash", "not-calculated"))
batch_if_valid("archive", "\n".join(media.urls))
batch_if_valid("date", True, get_current_timestamp())
batch_if_valid("title", item.get_title())
batch_if_valid("text", item.get("content", ""))
batch_if_valid("timestamp", item.get_timestamp())
if media:
batch_if_valid("hash", media.get("hash", "not-calculated"))
# merge all pdq hashes into a single string, if present
pdq_hashes = []
@@ -70,34 +71,44 @@ class GsheetsDb(Database):
if pdq := m.get("pdq_hash"):
pdq_hashes.append(pdq)
if len(pdq_hashes):
batch_if_valid('pdq_hash', ",".join(pdq_hashes))
batch_if_valid("pdq_hash", ",".join(pdq_hashes))
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
batch_if_valid('screenshot', "\n".join(screenshot.urls))
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(
screenshot, "urls"
):
batch_if_valid("screenshot", "\n".join(screenshot.urls))
if (thumbnail := item.get_first_image("thumbnail")):
if thumbnail := item.get_first_image("thumbnail"):
if hasattr(thumbnail, "urls"):
batch_if_valid('thumbnail', f'=IMAGE("{thumbnail.urls[0]}")')
batch_if_valid("thumbnail", f'=IMAGE("{thumbnail.urls[0]}")')
if (browsertrix := item.get_media_by_id("browsertrix")):
batch_if_valid('wacz', "\n".join(browsertrix.urls))
batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls]))
if browsertrix := item.get_media_by_id("browsertrix"):
batch_if_valid("wacz", "\n".join(browsertrix.urls))
batch_if_valid(
"replaywebpage",
"\n".join(
[
f"https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}"
for wacz in browsertrix.urls
]
),
)
gw.batch_set_cell(cell_updates)
def _safe_status_update(self, item: Metadata, new_status: str) -> None:
try:
gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, 'status', new_status)
gw.set_cell(row, "status", new_status)
except Exception as e:
logger.debug(f"Unable to update sheet: {e}")
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
    """
    Return the worksheet wrapper and the row number stored on `item`.

    The values are read from the item's "gsheet" context entry, using its
    "worksheet" and "row" keys.

    Raises:
        ValueError: if the item carries no "gsheet" context. Previously this
            path fell through to `return gw, row` with both names unbound.
    """
    # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to
    # fetch "gsheet" from the item context and, if missing, manage its own singleton (e.g. via self.sheet_id)
    # - not needed for now
    gsheet = item.get_context("gsheet")
    if not gsheet:
        logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")
        raise ValueError(f"No gsheet context available for {item.get_url()}")

    gw: GWorksheet = gsheet.get("worksheet")
    row: int = gsheet.get("row")
    return gw, row

View File

@@ -3,8 +3,8 @@
"type": ["feeder"],
"entry_point": "gsheet_feeder::GsheetsFeeder",
"requires_setup": True,
"external_dependencies": {
"python": ["loguru", "gspread", "python-slugify"],
"dependencies": {
"python": ["loguru", "gspread", "slugify"],
},
"configs": {
"sheet": {"default": None, "help": "name of the sheet to archive"},

View File

@@ -15,14 +15,13 @@ from loguru import logger
from slugify import slugify
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.core import Metadata
from . import GWorksheet
class GsheetsFeeder(Feeder):
def setup(self, config: dict):
super().setup(config)
def setup(self) -> None:
self.gsheets_client = gspread.service_account(filename=self.service_account)
# TODO mv to validators
assert self.sheet or self.sheet_id, (
@@ -37,43 +36,48 @@ class GsheetsFeeder(Feeder):
def __iter__(self) -> Metadata:
sh = self.open_sheet()
for ii, wks in enumerate(sh.worksheets()):
if not self.should_process_sheet(wks.title):
logger.debug(f"SKIPPED worksheet '{wks.title}' due to allow/block rules")
for ii, worksheet in enumerate(sh.worksheets()):
if not self.should_process_sheet(worksheet.title):
logger.debug(f"SKIPPED worksheet '{worksheet.title}' due to allow/block rules")
continue
logger.info(f'Opening worksheet {ii=}: {wks.title=} header={self.header}')
gw = GWorksheet(wks, header_row=self.header, columns=self.columns)
logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
if len(missing_cols := self.missing_required_columns(gw)):
logger.warning(f"SKIPPED worksheet '{wks.title}' due to missing required column(s) for {missing_cols}")
logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
continue
for row in range(1 + self.header, gw.count_rows() + 1):
url = gw.get_cell(row, 'url').strip()
if not len(url): continue
# process and yield metadata here:
yield from self._process_rows(gw)
logger.success(f'Finished worksheet {worksheet.title}')
original_status = gw.get_cell(row, 'status')
status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
# TODO: custom status parser(?) aka should_retry_from_status
if status not in ['', None]: continue
def _process_rows(self, gw: GWorksheet):
for row in range(1 + self.header, gw.count_rows() + 1):
url = gw.get_cell(row, 'url').strip()
if not len(url): continue
original_status = gw.get_cell(row, 'status')
status = gw.get_cell(row, 'status', fresh=original_status in ['', None])
# TODO: custom status parser(?) aka should_retry_from_status
if status not in ['', None]: continue
# All checks done - archival process starts here
m = Metadata().set_url(url)
ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
if gw.get_cell_or_default(row, 'folder', "") is None:
folder = ''
else:
folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
if len(folder):
if self.use_sheet_names_in_stored_paths:
ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True)
else:
ArchivingContext.set("folder", folder, True)
# All checks done - archival process starts here
m = Metadata().set_url(url)
self._set_context(m, gw, row)
yield m
yield m
def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
    """
    Attach the spreadsheet location and, when present, the storage folder to `m`.

    Args:
        m: the metadata item being prepared for archiving.
        gw: worksheet wrapper the row was read from.
        row: row number of the item inside `gw`.

    Returns:
        The same `m` instance, so the declared return type is honoured.
    """
    # TODO: Check folder value not being recognised
    m.set_context("gsheet", {"row": row, "worksheet": gw})

    # read the cell once; a None value is handled like an empty folder
    raw_folder = gw.get_cell_or_default(row, 'folder', "")
    folder = '' if raw_folder is None else slugify(raw_folder.strip())

    if folder:
        if self.use_sheet_names_in_stored_paths:
            folder = os.path.join(folder, slugify(self.sheet), slugify(gw.wks.title))
        m.set_context("folder", folder)
    return m
logger.success(f'Finished worksheet {wks.title}')
def should_process_sheet(self, sheet_name: str) -> bool:
if len(self.allow_worksheets) and sheet_name not in self.allow_worksheets:

View File

@@ -2,7 +2,7 @@
"name": "Hash Enricher",
"type": ["enricher"],
"requires_setup": False,
"external_dependencies": {
"dependencies": {
"python": ["loguru"],
},
"configs": {

View File

@@ -11,7 +11,8 @@ import hashlib
from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.core import Metadata
from auto_archiver.utils.misc import calculate_file_hash
class HashEnricher(Enricher):
@@ -19,16 +20,6 @@ class HashEnricher(Enricher):
Calculates hashes for Media instances
"""
def __init__(self, config: dict = None):
"""
Initialize the HashEnricher with a configuration dictionary.
"""
super().__init__()
# TODO set these from the manifest?
# Set default values
self.algorithm = config.get("algorithm", "SHA-256") if config else "SHA-256"
self.chunksize = config.get("chunksize", int(1.6e7)) if config else int(1.6e7)
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
@@ -39,15 +30,10 @@ class HashEnricher(Enricher):
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
def calculate_hash(self, filename) -> str:
hash = None
hash_algo = None
if self.algorithm == "SHA-256":
hash = hashlib.sha256()
hash_algo = hashlib.sha256
elif self.algorithm == "SHA3-512":
hash = hashlib.sha3_512()
hash_algo = hashlib.sha3_512
else: return ""
with open(filename, "rb") as f:
while True:
buf = f.read(self.chunksize)
if not buf: break
hash.update(buf)
return hash.hexdigest()
return calculate_file_hash(filename, hash_algo, self.chunksize)

View File

@@ -2,8 +2,8 @@
"name": "HTML Formatter",
"type": ["formatter"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru", "jinja2"],
"dependencies": {
"python": ["hash_enricher", "loguru", "jinja2"],
"bin": [""]
},
"configs": {

View File

@@ -1,5 +1,4 @@
from __future__ import annotations
from dataclasses import dataclass
import mimetypes, os, pathlib
from jinja2 import Environment, FileSystemLoader
from urllib.parse import quote
@@ -8,20 +7,18 @@ import json
import base64
from auto_archiver.version import __version__
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.core import Formatter
from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.utils.misc import random_str
from auto_archiver.core.module import get_module
@dataclass
class HtmlFormatter(Formatter):
environment: Environment = None
template: any = None
def setup(self, config: dict) -> None:
def setup(self) -> None:
"""Sets up the Jinja2 environment and loads the template."""
super().setup(config) # Ensure the base class logic is executed
template_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")
self.environment = Environment(loader=FileSystemLoader(template_dir), autoescape=True)
@@ -48,12 +45,13 @@ class HtmlFormatter(Formatter):
version=__version__
)
html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
html_path = os.path.join(self.tmp_dir, f"formatted{random_str(24)}.html")
with open(html_path, mode="w", encoding="utf-8") as outf:
outf.write(content)
final_media = Media(filename=html_path, _mimetype="text/html")
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
# get the already instantiated hash_enricher module
he = get_module('hash_enricher', self.config)
if len(hd := he.calculate_hash(final_media.filename)):
final_media.set("hash", f"{he.algorithm}:{hd}")

View File

@@ -1,7 +1,8 @@
{
"name": "Instagram API Extractor",
"type": ["extractor"],
"external_dependencies":
"entry_point": "instagram_api_extractor::InstagramAPIExtractor",
"dependencies":
{"python": ["requests",
"loguru",
"retrying",
@@ -9,24 +10,31 @@
},
"requires_setup": True,
"configs": {
"access_token": {"default": None, "help": "a valid instagrapi-api token"},
"api_endpoint": {"default": None, "help": "API endpoint to use"},
"access_token": {"default": None,
"help": "a valid instagrapi-api token"},
"api_endpoint": {"required": True,
"help": "API endpoint to use"},
"full_profile": {
"default": False,
"type": "bool",
"help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.",
},
"full_profile_max_posts": {
"default": 0,
"type": "int",
"help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights",
},
"minimize_json_output": {
"default": True,
"type": "bool",
"help": "if true, will remove empty values from the json output",
},
},
"description": """
Archives various types of Instagram content using the Instagrapi API.
Requires setting up an Instagrapi API deployment and providing an access token and API endpoint.
### Features
- Connects to an Instagrapi API deployment to fetch Instagram profiles, posts, stories, highlights, reels, and tagged content.
- Supports advanced configuration options, including:

View File

@@ -28,20 +28,14 @@ class InstagramAPIExtractor(Extractor):
# TODO: improvement collect aggregates of locations[0].location and mentions for all posts
"""
global_pattern = re.compile(
valid_url = re.compile(
r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
)
def __init__(self, config: dict) -> None:
super().__init__(config)
self.assert_valid_string("access_token")
self.assert_valid_string("api_endpoint")
self.full_profile_max_posts = int(self.full_profile_max_posts)
def setup(self) -> None:
if self.api_endpoint[-1] == "/":
self.api_endpoint = self.api_endpoint[:-1]
self.full_profile = bool(self.full_profile)
self.minimize_json_output = bool(self.minimize_json_output)
def download(self, item: Metadata) -> Metadata:
url = item.get_url()
@@ -49,7 +43,7 @@ class InstagramAPIExtractor(Extractor):
url.replace("instagr.com", "instagram.com").replace(
"instagr.am", "instagram.com"
)
insta_matches = self.global_pattern.findall(url)
insta_matches = self.valid_url.findall(url)
logger.info(f"{insta_matches=}")
if not len(insta_matches) or len(insta_matches[0]) != 3:
return

View File

@@ -1,7 +1,7 @@
{
"name": "Instagram Extractor",
"type": ["extractor"],
"external_dependencies": {
"dependencies": {
"python": [
"instaloader",
"loguru",
@@ -9,9 +9,10 @@
},
"requires_setup": True,
"configs": {
"username": {"default": None, "help": "a valid Instagram username"},
"username": {"required": True,
"help": "a valid Instagram username"},
"password": {
"default": None,
"required": True,
"help": "the corresponding Instagram account password",
},
"download_folder": {
@@ -25,9 +26,11 @@
# TODO: fine-grain
# "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
},
"description": """Uses the Instaloader library to download content from Instagram. This class handles both individual posts
and user profiles, downloading as much information as possible, including images, videos, text, stories,
highlights, and tagged posts. Authentication is required via username/password or a session file.
"description": """
Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts
and user profiles, downloading as much information as possible, including images, videos, text, stories,
highlights, and tagged posts.
Authentication is required via username/password or a session file.
""",
}

View File

@@ -4,7 +4,7 @@
"""
import re, os, shutil, traceback
import instaloader # https://instaloader.github.io/as-module.html
import instaloader
from loguru import logger
from auto_archiver.core import Extractor
@@ -16,19 +16,17 @@ class InstagramExtractor(Extractor):
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
"""
# NB: post regex should be tested before profile
valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/")
# The derived patterns below interpolate `valid_url.pattern` (the raw regex text):
# formatting the compiled object itself would embed its repr, "re.compile('...')",
# into the new expression and the result would never match a real URL.
# https://regex101.com/r/MGPquX/1
post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url.pattern))
# https://regex101.com/r/6Wbsxa/1
profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url.pattern))
# TODO: links to stories
def __init__(self, config: dict) -> None:
super().__init__(config)
# TODO: refactor how configuration validation is done
self.assert_valid_string("username")
self.assert_valid_string("password")
self.assert_valid_string("download_folder")
self.assert_valid_string("session_file")
def setup(self) -> None:
self.insta = instaloader.Instaloader(
download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}"
)

View File

@@ -1,15 +1,16 @@
{
"name": "Instagram Telegram Bot Extractor",
"type": ["extractor"],
"external_dependencies": {"python": ["loguru",
"telethon",],
"dependencies": {"python": ["loguru", "telethon",],
},
"requires_setup": True,
"configs": {
"api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
"api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
"session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
"timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
"timeout": {"default": 45,
"type": "int",
"help": "timeout to fetch the instagram content in seconds."},
},
"description": """
The `InstagramTbotExtractor` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content,
@@ -28,6 +29,12 @@ returned as part of a `Metadata` object.
To use the `InstagramTbotExtractor`, you need to provide the following configuration settings:
- **API ID and Hash**: Telegram API credentials obtained from [my.telegram.org/apps](https://my.telegram.org/apps).
- **Session File**: Optional path to store the Telegram session file for future use.
- The session file is created automatically and should be unique for each instance.
- You may need to enter your Telegram credentials (phone) and use a 2FA code sent to you the first time you run the extractor:
```2025-01-30 00:43:49.348 | INFO | auto_archiver.modules.instagram_tbot_extractor.instagram_tbot_extractor:setup:36 - SETUP instagram_tbot_extractor checking login...
Please enter your phone (or bot token): +447123456789
Please enter the code you received: 00000
Signed in successfully as E C; remember to not break the ToS or you will risk an account ban!
```
""",
}

View File

@@ -16,7 +16,7 @@ from loguru import logger
from telethon.sync import TelegramClient
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.utils import random_str
@@ -33,17 +33,30 @@ class InstagramTbotExtractor(Extractor):
2. checks if the session file is valid
"""
logger.info(f"SETUP {self.name} checking login...")
self._prepare_session_file()
self._initialize_telegram_client()
# make a copy of the session that is used exclusively with this archiver instance
def _prepare_session_file(self):
    """
    Copy the configured session file so this archiver instance has its own.

    The copy is written under "secrets/" with a dated, randomised name and
    `self.session_file` is re-pointed at it, without the ".session" extension.

    Raises:
        FileNotFoundError: if "<session_file>.session" does not exist.
    """
    # the destination name is generated first, before the source is checked
    copy_name = f"instabot-{time.strftime('%Y-%m-%d')}{random_str(8)}.session"
    copy_path = os.path.join("secrets/", copy_name)

    source_path = f"{self.session_file}.session"
    if not os.path.exists(source_path):
        raise FileNotFoundError(f"Session file {self.session_file}.session not found.")

    shutil.copy(source_path, copy_path)
    # store the path without its extension, the same form as the configured value
    self.session_file = copy_path.replace(".session", "")
def _initialize_telegram_client(self):
    """Initializes the Telegram client and checks that the session can log in."""
    try:
        self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
    except OperationalError as e:
        # Adjacent string literals are concatenated, but each fragment that
        # interpolates a value needs its own f-prefix; without it the literal
        # text "{e}" was logged instead of the exception.
        logger.error(
            f"Unable to access the {self.session_file} session. "
            "Ensure that you don't use the same session file here and in telethon_extractor. "
            f"If you do, disable at least one of the archivers for the first-time setup of the telethon session: {e}"
        )
        # NOTE(review): after this failure self.client is not assigned by this
        # method, so the login check below will fail - confirm whether the
        # error should be re-raised here instead.

    with self.client.start():
        logger.success(f"SETUP {self.name} login works.")
@@ -58,34 +71,51 @@ class InstagramTbotExtractor(Extractor):
if not "instagram.com" in url: return False
result = Metadata()
tmp_dir = ArchivingContext.get_tmp_dir()
tmp_dir = self.tmp_dir
with self.client.start():
chat = self.client.get_entity("instagram_load_bot")
since_id = self.client.send_message(entity=chat, message=url).id
attempts = 0
seen_media = []
message = ""
time.sleep(3)
# media is added before text by the bot so it can be used as a stop-logic mechanism
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
attempts += 1
time.sleep(1)
for post in self.client.iter_messages(chat, min_id=since_id):
since_id = max(since_id, post.id)
if post.media and post.id not in seen_media:
filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
media = self.client.download_media(post.media, filename_dest)
if media:
result.add_media(Media(media))
seen_media.append(post.id)
if post.message: message += post.message
chat, since_id = self._send_url_to_bot(url)
message = self._process_messages(chat, since_id, tmp_dir, result)
if "You must enter a URL to a post" in message:
if "You must enter a URL to a post" in message:
logger.debug(f"invalid link {url=} for {self.name}: {message}")
return False
# # TODO: It currently returns this as a success - is that intentional?
# if "Media not found or unavailable" in message:
# logger.debug(f"invalid link {url=} for {self.name}: {message}")
# return False
if message:
result.set_content(message).set_title(message[:128])
return result.success("insta-via-bot")
def _send_url_to_bot(self, url: str):
"""
Sends the URL to the 'instagram_load_bot' and returns (chat, since_id).
"""
chat = self.client.get_entity("instagram_load_bot")
since_message = self.client.send_message(entity=chat, message=url)
return chat, since_message.id
def _process_messages(self, chat, since_id, tmp_dir, result):
attempts = 0
seen_media = []
message = ""
time.sleep(3)
# media is added before text by the bot so it can be used as a stop-logic mechanism
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
attempts += 1
time.sleep(1)
for post in self.client.iter_messages(chat, min_id=since_id):
since_id = max(since_id, post.id)
# Skip known filler message:
if post.message == 'The bot receives information through https://hikerapi.com/p/hJqpppqi':
continue
if post.media and post.id not in seen_media:
filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
media = self.client.download_media(post.media, filename_dest)
if media:
result.add_media(Media(media))
seen_media.append(post.id)
if post.message: message += post.message
return message.strip()

View File

@@ -2,7 +2,7 @@
"name": "Local Storage",
"type": ["storage"],
"requires_setup": False,
"external_dependencies": {
"dependencies": {
"python": ["loguru"],
},
"configs": {

View File

@@ -2,7 +2,7 @@
"name": "Archive Metadata Enricher",
"type": ["enricher"],
"requires_setup": False,
"external_dependencies": {
"dependencies": {
"python": ["loguru"],
},
"description": """

View File

@@ -2,7 +2,7 @@
"name": "Media Metadata Enricher",
"type": ["enricher"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": ["loguru"],
"bin": ["exiftool"]
},

View File

@@ -2,7 +2,7 @@
"name": "Mute Formatter",
"type": ["formatter"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
},
"description": """ Default formatter.
""",

View File

@@ -1,11 +1,9 @@
from __future__ import annotations
from dataclasses import dataclass
from auto_archiver.core import Metadata, Media
from auto_archiver.core import Formatter
@dataclass
class MuteFormatter(Formatter):
def format(self, item: Metadata) -> Media: return None

View File

@@ -2,8 +2,8 @@
"name": "PDQ Hash Enricher",
"type": ["enricher"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru", "pdqhash", "numpy", "Pillow"],
"dependencies": {
"python": ["loguru", "pdqhash", "numpy", "PIL"],
},
"description": """
PDQ Hash Enricher for generating perceptual hashes of media files.

View File

@@ -1 +1 @@
from .s3 import S3Storage
from .s3_storage import S3Storage

View File

@@ -2,17 +2,17 @@
"name": "S3 Storage",
"type": ["storage"],
"requires_setup": True,
"external_dependencies": {
"python": ["boto3", "loguru"],
"dependencies": {
"python": ["hash_enricher", "boto3", "loguru"],
},
"configs": {
"path_generator": {
"default": "url",
"default": "flat",
"help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
"choices": ["flat", "url", "random"],
},
"filename_generator": {
"default": "random",
"default": "static",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
"choices": ["random", "static"],
},
@@ -20,7 +20,9 @@
"region": {"default": None, "help": "S3 region name"},
"key": {"default": None, "help": "S3 API key"},
"secret": {"default": None, "help": "S3 API secret"},
"random_no_duplicate": {"default": False, "help": "if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`"},
"random_no_duplicate": {"default": False,
"type": "bool",
"help": "if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`"},
"endpoint_url": {
"default": 'https://{region}.digitaloceanspaces.com',
"help": "S3 bucket endpoint, {region} are inserted at runtime"
@@ -29,7 +31,9 @@
"default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
"help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
},
"private": {"default": False, "help": "if true S3 files will not be readable online"},
"private": {"default": False,
"type": "bool",
"help": "if true S3 files will not be readable online"},
},
"description": """
S3Storage: A storage module for saving media files to an S3-compatible object storage.
@@ -45,5 +49,6 @@
- Requires S3 credentials (API key and secret) and a bucket name to function.
- The `random_no_duplicate` option ensures no duplicate uploads by leveraging hash-based folder structures.
- Uses `boto3` for interaction with the S3 API.
- Depends on the `HashEnricher` module for hash calculation.
"""
}

View File

@@ -1,19 +1,19 @@
from typing import IO
import boto3, os
from auto_archiver.utils.misc import random_str
from auto_archiver.core import Media
from auto_archiver.core import Storage
from auto_archiver.modules.hash_enricher import HashEnricher
import boto3
import os
from loguru import logger
from auto_archiver.core import Media
from auto_archiver.core import Storage
from auto_archiver.utils.misc import calculate_file_hash, random_str
NO_DUPLICATES_FOLDER = "no-dups/"
class S3Storage(Storage):
def __init__(self, config: dict) -> None:
super().__init__(config)
def setup(self) -> None:
self.s3 = boto3.client(
's3',
region_name=self.region,
@@ -21,7 +21,6 @@ class S3Storage(Storage):
aws_access_key_id=self.key,
aws_secret_access_key=self.secret
)
self.random_no_duplicate = bool(self.random_no_duplicate)
if self.random_no_duplicate:
logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.")
@@ -41,15 +40,13 @@ class S3Storage(Storage):
extra_args['ContentType'] = media.mimetype
except Exception as e:
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
return True
def is_upload_needed(self, media: Media) -> bool:
if self.random_no_duplicate:
# checks if a folder with the hash already exists, if so it skips the upload
he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
hd = he.calculate_hash(media.filename)
hd = calculate_file_hash(media.filename)
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
if existing_key:=self.file_in_folder(path):
@@ -61,8 +58,7 @@ class S3Storage(Storage):
_, ext = os.path.splitext(media.key)
media.key = os.path.join(path, f"{random_str(24)}{ext}")
return True
def file_in_folder(self, path:str) -> str:
# checks if path exists and is not an empty folder
if not path.endswith('/'):

View File

@@ -2,7 +2,7 @@
"name": "Screenshot Enricher",
"type": ["enricher"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": ["loguru", "selenium"],
"bin": ["chromedriver"]
},

View File

@@ -6,8 +6,8 @@ from selenium.common.exceptions import TimeoutException
from auto_archiver.core import Enricher
from auto_archiver.utils import Webdriver, UrlUtil, random_str
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.utils import Webdriver, url as UrlUtil, random_str
from auto_archiver.core import Media, Metadata
class ScreenshotEnricher(Enricher):
@@ -19,15 +19,17 @@ class ScreenshotEnricher(Enricher):
return
logger.debug(f"Enriching screenshot for {url=}")
with Webdriver(self.width, self.height, self.timeout, 'facebook.com' in url, http_proxy=self.http_proxy, print_options=self.print_options) as driver:
auth = self.auth_for_site(url)
with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver:
try:
driver.get(url)
time.sleep(int(self.sleep_before_screenshot))
screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png")
screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png")
driver.save_screenshot(screenshot_file)
to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
if self.save_to_pdf:
pdf_file = os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf")
pdf_file = os.path.join(self.tmp_dir, f"pdf_{random_str(8)}.pdf")
pdf = driver.print_page(driver.print_options)
with open(pdf_file, "wb") as f:
f.write(base64.b64decode(pdf))

View File

@@ -2,8 +2,8 @@
"name": "SSL Certificate Enricher",
"type": ["enricher"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru", "python-slugify"],
"dependencies": {
"python": ["loguru", "slugify"],
},
'entry_point': 'ssl_enricher::SSLEnricher',
"configs": {

View File

@@ -4,7 +4,7 @@ from urllib.parse import urlparse
from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, ArchivingContext, Media
from auto_archiver.core import Metadata, Media
class SSLEnricher(Enricher):
@@ -23,6 +23,6 @@ class SSLEnricher(Enricher):
logger.debug(f"fetching SSL certificate for {domain=} in {url=}")
cert = ssl.get_server_certificate((domain, 443))
cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{slugify(domain)}.pem")
cert_fn = os.path.join(self.tmp_dir, f"{slugify(domain)}.pem")
with open(cert_fn, "w") as f: f.write(cert)
to_enrich.add_media(Media(filename=cert_fn), id="ssl_certificate")

View File

@@ -2,7 +2,7 @@
"name": "Telegram Extractor",
"type": ["extractor"],
"requires_setup": False,
"external_dependencies": {
"dependencies": {
"python": [
"requests",
"bs4",
@@ -13,7 +13,7 @@
The `TelegramExtractor` retrieves publicly available media content from Telegram message links without requiring login credentials.
It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata`
and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver`
is advised for more comprehensive functionality.
is advised for more comprehensive functionality and higher-quality media extraction.
### Features
- Extracts images and videos from public Telegram message links (`t.me`).

View File

@@ -1 +1 @@
from .telethon_extractor import TelethonArchiver
from .telethon_extractor import TelethonExtractor

View File

@@ -2,7 +2,7 @@
"name": "telethon_extractor",
"type": ["extractor"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": ["telethon",
"loguru",
"tqdm",

View File

@@ -6,19 +6,20 @@ from telethon.tl.functions.messages import ImportChatInviteRequest
from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError
from loguru import logger
from tqdm import tqdm
import re, time, json, os
import re, time, os
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.utils import random_str
class TelethonArchiver(Extractor):
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
class TelethonExtractor(Extractor):
valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
def setup(self) -> None:
"""
1. makes a copy of session_file that is removed in cleanup
2. trigger login process for telegram or proceed if already saved in a session file
@@ -92,7 +93,7 @@ class TelethonArchiver(Extractor):
"""
url = item.get_url()
# detect URLs that we definitely cannot handle
match = self.link_pattern.search(url)
match = self.valid_url.search(url)
logger.debug(f"TELETHON: {match=}")
if not match: return False
@@ -120,7 +121,7 @@ class TelethonArchiver(Extractor):
media_posts = self._get_media_posts_in_group(chat, post)
logger.debug(f'got {len(media_posts)=} for {url=}')
tmp_dir = ArchivingContext.get_tmp_dir()
tmp_dir = self.tmp_dir
group_id = post.grouped_id if post.grouped_id is not None else post.id
title = post.message

View File

@@ -2,8 +2,8 @@
"name": "Thumbnail Enricher",
"type": ["enricher"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru", "ffmpeg-python"],
"dependencies": {
"python": ["loguru", "ffmpeg"],
"bin": ["ffmpeg"]
},
"configs": {

View File

@@ -10,7 +10,7 @@ import ffmpeg, os
from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.core import Media, Metadata
from auto_archiver.utils.misc import random_str
@@ -28,7 +28,7 @@ class ThumbnailEnricher(Enricher):
logger.debug(f"generating thumbnails for {to_enrich.get_url()}")
for m_id, m in enumerate(to_enrich.media[::]):
if m.is_video():
folder = os.path.join(ArchivingContext.get_tmp_dir(), random_str(24))
folder = os.path.join(self.tmp_dir, random_str(24))
os.makedirs(folder, exist_ok=True)
logger.debug(f"generating thumbnails for {m.filename}")
duration = m.get("duration")

View File

@@ -2,7 +2,7 @@
"name": "Timestamping Enricher",
"type": ["enricher"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": [
"loguru",
"slugify",

View File

@@ -10,8 +10,7 @@ from asn1crypto.core import Asn1Value
import certifi
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, ArchivingContext, Media
from auto_archiver.core import Metadata, Media
class TimestampingEnricher(Enricher):
"""
@@ -33,7 +32,7 @@ class TimestampingEnricher(Enricher):
logger.warning(f"No hashes found in {url=}")
return
tmp_dir = ArchivingContext.get_tmp_dir()
tmp_dir = self.tmp_dir
hashes_fn = os.path.join(tmp_dir, "hashes.txt")
data_to_sign = "\n".join(hashes)
@@ -102,9 +101,9 @@ class TimestampingEnricher(Enricher):
cert_chain = []
for cert in path:
cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{str(cert.serial_number)[:20]}.crt")
cert_fn = os.path.join(self.tmp_dir, f"{str(cert.serial_number)[:20]}.crt")
with open(cert_fn, "wb") as f:
f.write(cert.dump())
cert_chain.append(Media(filename=cert_fn).set("subject", cert.subject.native["common_name"]))
return cert_chain
return cert_chain

View File

@@ -2,7 +2,7 @@
"name": "Twitter API Extractor",
"type": ["extractor"],
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": ["requests",
"loguru",
"pytwitter",

View File

@@ -9,14 +9,13 @@ from pytwitter import Api
from slugify import slugify
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata,Media
from auto_archiver.core import Metadata, Media
class TwitterApiExtractor(Extractor):
link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
def setup(self, config: dict) -> None:
super().setup(config)
valid_url: re.Pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
def setup(self) -> None:
self.api_index = 0
self.apis = []
if len(self.bearer_tokens):
@@ -54,7 +53,7 @@ class TwitterApiExtractor(Extractor):
def get_username_tweet_id(self, url):
# detect URLs that we definitely cannot handle
matches = self.link_pattern.findall(url)
matches = self.valid_url.findall(url)
if not len(matches): return False, False
username, tweet_id = matches[0] # only one URL supported

View File

@@ -3,15 +3,19 @@
"type": ["extractor"],
"requires_setup": True,
"depends": ["core", "utils"],
"external_dependencies": {
"python": ["loguru",
"vk_url_scraper"],
"dependencies": {
"python": ["loguru", "vk_url_scraper"],
},
"configs": {
"username": {"default": None, "help": "valid VKontakte username"},
"password": {"default": None, "help": "valid VKontakte password"},
"session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
"username": {"required": True,
"help": "valid VKontakte username"},
"password": {"required": True,
"help": "valid VKontakte password"},
"session_file": {
"default": "secrets/vk_config.v2.json",
"help": "valid VKontakte password",
},
},
"description": """
The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages.
This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract
@@ -31,6 +35,5 @@ To use the `VkArchiver`, you must provide valid VKontakte login credentials and
Credentials can be set in the configuration file or directly via environment variables. Ensure you
have access to the VKontakte API by creating an account at [VKontakte](https://vk.com/).
"""
,
""",
}

View File

@@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper
from auto_archiver.utils.misc import dump_payload
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
class VkExtractor(Extractor):
@@ -12,10 +12,7 @@ class VkExtractor(Extractor):
Currently only works for /wall posts
"""
def __init__(self, config: dict) -> None:
super().__init__(config)
self.assert_valid_string("username")
self.assert_valid_string("password")
def setup(self) -> None:
self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
def download(self, item: Metadata) -> Metadata:
@@ -37,7 +34,7 @@ class VkExtractor(Extractor):
result.set_content(dump_payload(vk_scrapes))
filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
filenames = self.vks.download_media(vk_scrapes, self.tmp_dir)
for filename in filenames:
result.add_media(Media(filename))

View File

@@ -1,8 +1,9 @@
{
"name": "WACZ Enricher",
"type": ["enricher", "archiver"],
"entry_point": "wacz_enricher::WaczExtractorEnricher",
"requires_setup": True,
"external_dependencies": {
"dependencies": {
"python": [
"loguru",
"jsonlines",
@@ -25,6 +26,7 @@
},
"description": """
Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving.
[Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format.
### Features
- Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`.
@@ -33,7 +35,7 @@
- Generates metadata from the archived page's content and structure (e.g., titles, text).
### Notes
- Requires Docker for running `browsertrix-crawler` unless explicitly disabled.
- Requires Docker for running `browsertrix-crawler`.
- Configurable via parameters for timeout, media extraction, screenshots, and proxy settings.
"""
}

View File

@@ -5,9 +5,9 @@ from zipfile import ZipFile
from loguru import logger
from warcio.archiveiterator import ArchiveIterator
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.core import Media, Metadata
from auto_archiver.core import Extractor, Enricher
from auto_archiver.utils import UrlUtil, random_str
from auto_archiver.utils import url as UrlUtil, random_str
class WaczExtractorEnricher(Enricher, Extractor):
@@ -19,6 +19,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
"""
def setup(self) -> None:
self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER')
@@ -49,7 +50,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
url = to_enrich.get_url()
collection = random_str(8)
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(ArchivingContext.get_tmp_dir())
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
cmd = [
@@ -152,7 +153,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
logger.info(f"WACZ extract_media or extract_screenshot flag is set, extracting media from {wacz_filename=}")
# unzipping the .wacz
tmp_dir = ArchivingContext.get_tmp_dir()
tmp_dir = self.tmp_dir
unzipped_dir = os.path.join(tmp_dir, "unzipped")
with ZipFile(wacz_filename, 'r') as z_obj:
z_obj.extractall(path=unzipped_dir)

View File

@@ -1 +0,0 @@
from .wayback_enricher import WaybackExtractorEnricher

View File

@@ -1,30 +0,0 @@
{
"name": "Wayback Machine Enricher",
"type": ["enricher", "archiver"],
"requires_setup": True,
"external_dependencies": {
"python": ["loguru", "requests"],
},
"entry_point": "wayback_enricher::WaybackExtractorEnricher",
"configs": {
"timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."},
"if_not_archived_within": {"default": None, "help": "only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA"},
"key": {"default": None, "required": True, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
"secret": {"default": None, "required": True, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"},
"proxy_http": {"default": None, "help": "http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port"},
"proxy_https": {"default": None, "help": "https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port"},
},
"description": """
Submits the current URL to the Wayback Machine for archiving and returns either a job ID or the completed archive URL.
### Features
- Archives URLs using the Internet Archive's Wayback Machine API.
- Supports conditional archiving based on the existence of prior archives within a specified time range.
- Provides proxies for HTTP and HTTPS requests.
- Fetches and confirms the archive URL or provides a job ID for later status checks.
### Notes
- Requires a valid Wayback Machine API key and secret.
- Handles rate-limiting by Wayback Machine and retries status checks with exponential backoff.
"""
}

View File

@@ -0,0 +1 @@
from .wayback_extractor_enricher import WaybackExtractorEnricher

View File

@@ -0,0 +1,56 @@
{
"name": "Wayback Machine Enricher",
"type": ["enricher", "archiver"],
"entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher",
"requires_setup": True,
"dependencies": {
"python": ["loguru", "requests"],
},
"configs": {
"timeout": {
"default": 15,
"help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.",
},
"if_not_archived_within": {
"default": None,
"help": "only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA",
},
"key": {
"required": True,
"help": "wayback API key. to get credentials visit https://archive.org/account/s3.php",
},
"secret": {
"required": True,
"help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php",
},
"proxy_http": {
"default": None,
"help": "http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port",
},
"proxy_https": {
"default": None,
"help": "https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port",
},
},
"description": """
Submits the current URL to the Wayback Machine for archiving and returns either a job ID or the completed archive URL.
### Features
- Archives URLs using the Internet Archive's Wayback Machine API.
- Supports conditional archiving based on the existence of prior archives within a specified time range.
- Provides proxies for HTTP and HTTPS requests.
- Fetches and confirms the archive URL or provides a job ID for later status checks.
### Notes
- Requires a valid Wayback Machine API key and secret.
- Handles rate-limiting by Wayback Machine and retries status checks with exponential backoff.
### Steps to Get a Wayback API Key:
- Sign up for an account at [Internet Archive](https://archive.org/account/signup).
- Log in to your account.
- Navigate to your [account settings](https://archive.org/account).
- or: https://archive.org/developers/tutorial-get-ia-credentials.html
- Under Wayback Machine API Keys, generate a new key.
- Note down your API key and secret, as they will be required for authentication.
""",
}

View File

@@ -3,7 +3,7 @@ from loguru import logger
import time, requests
from auto_archiver.core import Extractor, Enricher
from auto_archiver.utils import UrlUtil
from auto_archiver.utils import url as UrlUtil
from auto_archiver.core import Metadata
class WaybackExtractorEnricher(Enricher, Extractor):

View File

@@ -2,15 +2,19 @@
"name": "Whisper Enricher",
"type": ["enricher"],
"requires_setup": True,
"external_dependencies": {
"python": ["loguru", "requests"],
"dependencies": {
"python": ["s3_storage", "loguru", "requests"],
},
"configs": {
"api_endpoint": {"default": None, "help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
"api_key": {"default": None, "help": "WhisperApi api key for authentication"},
"api_endpoint": {"required": True,
"help": "WhisperApi api endpoint, eg: https://whisperbox-api.com/api/v1, a deployment of https://github.com/bellingcat/whisperbox-transcribe."},
"api_key": {"required": True,
"help": "WhisperApi api key for authentication"},
"include_srt": {"default": False, "help": "Whether to include a subtitle SRT (SubRip Subtitle file) for the video (can be used in video players)."},
"timeout": {"default": 90, "help": "How many seconds to wait at most for a successful job completion."},
"action": {"default": "translate", "help": "which Whisper operation to execute", "choices": ["transcribe", "translate", "language_detection"]},
"action": {"default": "translate",
"help": "which Whisper operation to execute",
"choices": ["transcribe", "translate", "language_detection"]},
},
"description": """
Integrates with a Whisper API service to transcribe, translate, or detect the language of audio and video files.
@@ -25,6 +29,7 @@
### Notes
- Requires a Whisper API endpoint and API key for authentication.
- Only compatible with S3-compatible storage systems for media file accessibility.
- **Note:** This stores the media files in S3 prior to enriching them, as Whisper requires public URLs to access the media files.
- Handles multiple jobs and retries for failed or incomplete processing.
"""
}

View File

@@ -3,9 +3,8 @@ import requests, time
from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.modules.s3_storage import S3Storage
from auto_archiver.core import Metadata, Media
from auto_archiver.core.module import get_module
class WhisperEnricher(Enricher):
"""
@@ -14,18 +13,25 @@ class WhisperEnricher(Enricher):
Only works if an S3 compatible storage is used
"""
def enrich(self, to_enrich: Metadata) -> None:
if not self._get_s3_storage():
def setup(self) -> None:
self.stores = self.config['steps']['storages']
self.s3 = get_module("s3_storage", self.config)
if not "s3_storage" in self.stores:
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
return
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
logger.debug(f"WHISPER[{self.action}]: iterating media items for {url=}.")
job_results = {}
for i, m in enumerate(to_enrich.media):
if m.is_video() or m.is_audio():
m.store(url=url, metadata=to_enrich)
# TODO: this used to pass all storage items to store now
# Now only passing S3, the rest will get added later in the usual order (?)
m.store(url=url, metadata=to_enrich, storages=[self.s3])
try:
job_id = self.submit_job(m)
job_results[job_id] = False
@@ -53,8 +59,8 @@ class WhisperEnricher(Enricher):
to_enrich.set_content(f"\n[automatic video transcript]: {v}")
def submit_job(self, media: Media):
s3 = self._get_s3_storage()
s3_url = s3.get_cdn_url(media)
s3_url = self.s3.get_cdn_url(media)
assert s3_url in media.urls, f"Could not find S3 url ({s3_url}) in list of stored media urls "
payload = {
"url": s3_url,
@@ -107,10 +113,3 @@ class WhisperEnricher(Enricher):
logger.debug(f"DELETE whisper {job_id=} result: {r_del.status_code}")
return result
return False
def _get_s3_storage(self) -> S3Storage:
try:
return next(s for s in ArchivingContext.get("storages") if s.__class__ == S3Storage)
except:
logger.warning("No S3Storage instance found in storages")
return