mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 05:08:28 +03:00
Basic docs structure for RTD
This commit is contained in:
@@ -1,3 +1,6 @@
|
||||
""" Core modules to handle things such as orchestration, metadata and configs.
|
||||
|
||||
"""
|
||||
from .metadata import Metadata
|
||||
from .media import Media
|
||||
from .step import Step
|
||||
|
||||
@@ -1,4 +1,9 @@
|
||||
"""
|
||||
The Config class initializes and parses configurations for all other steps.
|
||||
It supports CLI argument parsing, loading from YAML file, and overrides to allow
|
||||
flexible setup in various environments.
|
||||
|
||||
"""
|
||||
|
||||
import argparse, yaml
|
||||
from dataclasses import dataclass, field
|
||||
@@ -55,6 +60,7 @@ class Config:
|
||||
|
||||
parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml')
|
||||
|
||||
# Iterate over all step subclasses to gather default configs and CLI arguments
|
||||
for configurable in self.configurable_parents:
|
||||
child: Step
|
||||
for child in configurable.__subclasses__():
|
||||
|
||||
@@ -1,6 +1,21 @@
|
||||
""" ArchivingContext provides a global context for managing configurations and temporary data during the archiving process.
|
||||
|
||||
This singleton class allows for:
|
||||
- Storing and retrieving key-value pairs that are accessible throughout the application lifecycle.
|
||||
- Marking certain values to persist across resets using `keep_on_reset`.
|
||||
- Managing temporary directories and other shared data used during the archiving process.
|
||||
|
||||
### Key Features:
|
||||
- Creates a single global instance.
|
||||
- Reset functionality allows for clearing configurations, with options for partial or full resets.
|
||||
- Custom getters and setters for commonly used context values like temporary directories.
|
||||
|
||||
"""
|
||||
|
||||
class ArchivingContext:
|
||||
"""
|
||||
Singleton context class.
|
||||
Singleton context class for managing global configurations and temporary data.
|
||||
|
||||
ArchivingContext._get_instance() to retrieve it if needed
|
||||
otherwise just
|
||||
ArchivingContext.set(key, value)
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
"""
|
||||
Manages media files and their associated metadata, supporting storage,
|
||||
nested media retrieval, and type validation.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import os
|
||||
@@ -18,6 +22,16 @@ from loguru import logger
|
||||
@dataclass_json # annotation order matters
|
||||
@dataclass
|
||||
class Media:
|
||||
"""
|
||||
Represents a media file with associated properties and storage details.
|
||||
|
||||
Attributes:
|
||||
- filename: The file path of the media.
|
||||
- key: An optional identifier for the media.
|
||||
- urls: A list of URLs where the media is stored or accessible.
|
||||
- properties: Additional metadata or transformations for the media.
|
||||
- _mimetype: The media's mimetype (e.g., image/jpeg, video/mp4).
|
||||
"""
|
||||
filename: str
|
||||
key: str = None
|
||||
urls: List[str] = field(default_factory=list)
|
||||
@@ -40,8 +54,9 @@ class Media:
|
||||
s.store(any_media, url, metadata=metadata)
|
||||
|
||||
def all_inner_media(self, include_self=False):
|
||||
""" Media can be inside media properties, examples include transformations on original media.
|
||||
This function returns a generator for all the inner media.
|
||||
"""Retrieves all media, including nested media within properties or transformations on original media.
|
||||
This function returns a generator for all the inner media.
|
||||
|
||||
"""
|
||||
if include_self: yield self
|
||||
for prop in self.properties.values():
|
||||
|
||||
@@ -1,3 +1,13 @@
|
||||
"""
|
||||
Acts as a container for metadata and media objects associated with an archived item.
|
||||
|
||||
Key Functionalities:
|
||||
- Store and retrieve metadata and associated media.
|
||||
- Merge metadata objects with conflict resolution.
|
||||
- Validate properties like URLs and timestamps.
|
||||
- Manage and deduplicate media objects.
|
||||
- Support for flexible metadata querying and appending.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import hashlib
|
||||
@@ -25,7 +35,11 @@ class Metadata:
|
||||
|
||||
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
|
||||
"""
|
||||
merges two Metadata instances, will overwrite according to overwrite_left flag
|
||||
Merges another `Metadata` instance into this one.
|
||||
|
||||
Conflicts are resolved based on the `overwrite_left` flag:
|
||||
- If `True`, this instance's values are overwritten by `right`.
|
||||
- If `False`, the inverse applies.
|
||||
"""
|
||||
if not right: return self
|
||||
if overwrite_left:
|
||||
@@ -191,4 +205,4 @@ class Metadata:
|
||||
for r in results[1:]:
|
||||
if len(r.media) > len(most_complete.media): most_complete = r
|
||||
elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata): most_complete = r
|
||||
return most_complete
|
||||
return most_complete
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
""" Orchestrates all archiving steps, including feeding items,
|
||||
archiving them with specific archivers, enrichment, storage,
|
||||
formatting, database operations and clean up.
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Generator, Union, List
|
||||
from urllib.parse import urlparse
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
"""
|
||||
Defines the Step abstract base class, which acts as a blueprint for steps in the archiving pipeline
|
||||
by handling user configuration, validating the step's properties, and implementing dynamic instantiation.
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from inspect import ClassFoundException
|
||||
@@ -10,6 +16,7 @@ class Step(ABC):
|
||||
name: str = None
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
# Initialises each step by reading the relevant entries
|
||||
# reads the configs into object properties
|
||||
# self.config = config[self.name]
|
||||
for k, v in config.get(self.name, {}).items():
|
||||
@@ -20,7 +27,9 @@ class Step(ABC):
|
||||
|
||||
def init(name: str, config: dict, child: Type[Step]) -> Step:
|
||||
"""
|
||||
looks into direct subclasses of child for name and returns such an object
|
||||
Attempts to instantiate a subclass of the provided `child` type
|
||||
matching the given `name`.
|
||||
Raises ClassFoundException if no matching subclass is found.
|
||||
TODO: only direct subclasses of `child` are searched; indirect subclasses (subclasses of its subclasses) are not found
|
||||
"""
|
||||
for sub in child.__subclasses__():
|
||||
@@ -30,7 +39,9 @@ class Step(ABC):
|
||||
|
||||
def assert_valid_string(self, prop: str) -> None:
|
||||
"""
|
||||
receives a property name an ensures it exists and is a valid non-empty string, raises an exception if not
|
||||
Receives a property name and ensures it exists and is a valid non-empty string,
|
||||
raising an AssertionError if not.
|
||||
TODO: replace assertions with custom exceptions
|
||||
"""
|
||||
assert hasattr(self, prop), f"property {prop} not found"
|
||||
s = getattr(self, prop)
|
||||
|
||||
Reference in New Issue
Block a user