mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 12:48:28 +03:00
Merge branch 'main' into tests/add_module_tests
This commit is contained in:
@@ -48,6 +48,7 @@ authentication: {}
|
||||
|
||||
logging:
|
||||
level: INFO
|
||||
|
||||
""")
|
||||
# note: 'logging' is explicitly added above in order to better format the config file
|
||||
|
||||
|
||||
@@ -1,3 +1,8 @@
|
||||
"""
|
||||
Database module for the auto-archiver that defines the interface for implementing database modules
|
||||
in the media archiving framework.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from typing import Union
|
||||
@@ -5,6 +10,11 @@ from typing import Union
|
||||
from auto_archiver.core import Metadata, BaseModule
|
||||
|
||||
class Database(BaseModule):
|
||||
"""
|
||||
Base class for implementing database modules in the media archiving framework.
|
||||
|
||||
Subclasses must implement the `fetch` and `done` methods to define platform-specific behavior.
|
||||
"""
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
"""signals the DB that the given item archival has started"""
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
"""
|
||||
Enrichers are modular components that enhance archived content by adding
|
||||
Base module for Enrichers – modular components that enhance archived content by adding
|
||||
context, metadata, or additional processing.
|
||||
|
||||
These add additional information to the context, such as screenshots, hashes, and metadata.
|
||||
@@ -13,7 +13,16 @@ from abc import abstractmethod
|
||||
from auto_archiver.core import Metadata, BaseModule
|
||||
|
||||
class Enricher(BaseModule):
|
||||
"""Base classes and utilities for enrichers in the Auto-Archiver system."""
|
||||
"""Base classes and utilities for enrichers in the Auto-Archiver system.
|
||||
|
||||
Enricher modules must implement the `enrich` method to define their behavior.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def enrich(self, to_enrich: Metadata) -> None: pass
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
"""
|
||||
Enriches a Metadata object with additional information or context.
|
||||
|
||||
Takes the metadata object to enrich as an argument and modifies it in place, returning None.
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -29,14 +29,24 @@ class Extractor(BaseModule):
|
||||
valid_url: re.Pattern = None
|
||||
|
||||
def cleanup(self) -> None:
|
||||
# called when extractors are done, or upon errors, cleanup any resources
|
||||
"""
|
||||
Called when extractors are done, or upon errors, to clean up any resources.
|
||||
"""
|
||||
pass
|
||||
|
||||
def sanitize_url(self, url: str) -> str:
|
||||
# used to clean unnecessary URL parameters OR unfurl redirect links
|
||||
"""
|
||||
Used to remove unnecessary URL parameters or to unfurl redirect links.
|
||||
"""
|
||||
return url
|
||||
|
||||
def match_link(self, url: str) -> re.Match:
|
||||
"""
|
||||
Returns a match object if the given URL matches the valid_url pattern or False/None if not.
|
||||
|
||||
Normally used in the `suitable` method to check if the URL is supported by this extractor.
|
||||
|
||||
"""
|
||||
return self.valid_url.match(url)
|
||||
|
||||
def suitable(self, url: str) -> bool:
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
"""
|
||||
The feeder base module defines the interface for implementing feeders in the media archiving framework.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from auto_archiver.core import Metadata
|
||||
@@ -5,5 +9,17 @@ from auto_archiver.core import BaseModule
|
||||
|
||||
class Feeder(BaseModule):
|
||||
|
||||
"""
|
||||
Base class for implementing feeders in the media archiving framework.
|
||||
|
||||
Subclasses must implement the `__iter__` method to define platform-specific behavior.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def __iter__(self) -> Metadata: return None
|
||||
def __iter__(self) -> Metadata:
|
||||
"""
|
||||
Returns an iterator (use `yield`) over the items to be archived.
|
||||
|
||||
These should be instances of Metadata, typically created with Metadata().set_url(url).
|
||||
"""
|
||||
return None
|
||||
@@ -1,9 +1,24 @@
|
||||
"""
|
||||
Base module for formatters – modular components that format metadata into media objects for storage.
|
||||
|
||||
The most commonly used formatter is the HTML formatter, which takes metadata and formats it into an HTML file for storage.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from auto_archiver.core import Metadata, Media, BaseModule
|
||||
|
||||
|
||||
class Formatter(BaseModule):
|
||||
"""
|
||||
Base class for implementing formatters in the media archiving framework.
|
||||
|
||||
Subclasses must implement the `format` method to define their behavior.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def format(self, item: Metadata) -> Media: return None
|
||||
def format(self, item: Metadata) -> Media:
|
||||
"""
|
||||
Formats a Metadata object into a user-viewable format (e.g. HTML) and stores it if needed.
|
||||
"""
|
||||
return None
|
||||
@@ -1,3 +1,7 @@
|
||||
"""
|
||||
Base module for Storage modules – modular components that store media objects in various locations.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from typing import IO
|
||||
@@ -12,6 +16,12 @@ from auto_archiver.core import Media, BaseModule, Metadata
|
||||
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
|
||||
from auto_archiver.core.module import get_module
|
||||
class Storage(BaseModule):
|
||||
|
||||
"""
|
||||
Base class for implementing storage modules in the media archiving framework.
|
||||
|
||||
Subclasses must implement the `get_cdn_url` and `uploadf` methods to define their behavior.
|
||||
"""
|
||||
|
||||
def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
|
||||
if media.is_stored(in_storage=self):
|
||||
@@ -22,10 +32,18 @@ class Storage(BaseModule):
|
||||
media.add_url(self.get_cdn_url(media))
|
||||
|
||||
@abstractmethod
|
||||
def get_cdn_url(self, media: Media) -> str: pass
|
||||
def get_cdn_url(self, media: Media) -> str:
|
||||
"""
|
||||
Returns the URL of the media object stored in the CDN.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
||||
"""
|
||||
Uploads (or saves) a file to the storage service/location.
|
||||
"""
|
||||
pass
|
||||
|
||||
def upload(self, media: Media, **kwargs) -> bool:
|
||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
|
||||
|
||||
@@ -11,6 +11,8 @@
|
||||
"api_token": {
|
||||
"default": None,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"required": True,
|
||||
"type": "str",
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
|
||||
@@ -1,13 +0,0 @@
|
||||
def get_atlos_config_options():
|
||||
return {
|
||||
"api_token": {
|
||||
"default": None,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"type": str
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"type": str
|
||||
},
|
||||
}
|
||||
32
src/auto_archiver/modules/atlos_storage/__manifest__.py
Normal file
32
src/auto_archiver/modules/atlos_storage/__manifest__.py
Normal file
@@ -0,0 +1,32 @@
|
||||
{
|
||||
"name": "Atlos Storage",
|
||||
"type": ["storage"],
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "boto3"],
|
||||
"bin": []
|
||||
},
|
||||
"description": """
|
||||
Stores media files in [Atlos](https://www.atlos.org/).
|
||||
|
||||
### Features
|
||||
- Saves media files to Atlos, organizing them into folders based on the provided path structure.
|
||||
|
||||
### Notes
|
||||
- Requires setup with Atlos credentials.
|
||||
- Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure.
|
||||
""",
|
||||
"configs": {
|
||||
"api_token": {
|
||||
"default": None,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"required": True,
|
||||
"type": "str"
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"type": "str"
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -32,7 +32,6 @@
|
||||
|
||||
GDriveStorage: A storage module for saving archived content to Google Drive.
|
||||
|
||||
Author: Dave Mateer, (And maintained by: )
|
||||
Source Documentation: https://davemateer.com/2022/04/28/google-drive-with-python
|
||||
|
||||
### Features
|
||||
|
||||
@@ -20,5 +20,6 @@
|
||||
- Processes HTML content of messages to retrieve embedded media.
|
||||
- Sets structured metadata, including timestamps, content, and media details.
|
||||
- Does not require user authentication for Telegram.
|
||||
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"name": "telethon_extractor",
|
||||
"name": "Telethon Extractor",
|
||||
"type": ["extractor"],
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
@@ -40,5 +40,9 @@ To use the `TelethonExtractor`, you must configure the following:
|
||||
- **Bot Token**: Optional, allows access to additional content (e.g., large videos) but limits private channel archiving.
|
||||
- **Channel Invites**: Optional, specify a JSON string of invite links to join channels during setup.
|
||||
|
||||
### First Time Login
|
||||
The first time you run, you will be prompted to authenticate with the associated phone number; alternatively, you can put your `anon.session` file in the root.
|
||||
|
||||
|
||||
"""
|
||||
}
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "WACZ Enricher",
|
||||
"type": ["enricher", "archiver"],
|
||||
"type": ["enricher", "extractor"],
|
||||
"entry_point": "wacz_enricher::WaczExtractorEnricher",
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "Wayback Machine Enricher",
|
||||
"type": ["enricher", "archiver"],
|
||||
"type": ["enricher", "extractor"],
|
||||
"entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher",
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
# we need to explicitly expose the available imports here
|
||||
from .misc import *
|
||||
from .webdriver import Webdriver
|
||||
from .atlos import get_atlos_config_options
|
||||
|
||||
# handy utils from ytdlp
|
||||
from yt_dlp.utils import (clean_html, traverse_obj, strip_or_none, url_or_none)
|
||||
@@ -1,13 +0,0 @@
|
||||
def get_atlos_config_options():
|
||||
return {
|
||||
"api_token": {
|
||||
"default": None,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"cli_set": lambda cli_val, _: cli_val
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"cli_set": lambda cli_val, _: cli_val
|
||||
},
|
||||
}
|
||||
Reference in New Issue
Block a user