Basic docs structure for RTD

This commit is contained in:
erinhmclark
2025-01-15 21:45:29 +00:00
parent 05e0c9de93
commit d3eec5d90f
38 changed files with 1034 additions and 40 deletions

View File

@@ -1,3 +1,6 @@
""" Core modules to handle things such as orchestration, metadata and configs.
"""
from .metadata import Metadata
from .media import Media
from .step import Step

View File

@@ -1,4 +1,9 @@
"""
The Config class initializes and parses configurations for all other steps.
It supports CLI argument parsing, loading from a YAML file, and overrides to allow
flexible setup in various environments.
"""
import argparse, yaml
from dataclasses import dataclass, field
@@ -55,6 +60,7 @@ class Config:
parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml')
# Iterate over all step subclasses to gather default configs and CLI arguments
for configurable in self.configurable_parents:
child: Step
for child in configurable.__subclasses__():

View File

@@ -1,6 +1,21 @@
""" ArchivingContext provides a global context for managing configurations and temporary data during the archiving process.
This singleton class allows for:
- Storing and retrieving key-value pairs that are accessible throughout the application lifecycle.
- Marking certain values to persist across resets using `keep_on_reset`.
- Managing temporary directories and other shared data used during the archiving process.
### Key Features:
- Creates a single global instance.
- Reset functionality allows for clearing configurations, with options for partial or full resets.
- Custom getters and setters for commonly used context values like temporary directories.
"""
class ArchivingContext:
"""
Singleton context class.
Singleton context class for managing global configurations and temporary data.
Use ArchivingContext._get_instance() to retrieve the instance if needed;
otherwise just call
ArchivingContext.set(key, value)

View File

@@ -1,3 +1,7 @@
"""
Manages media files and their associated metadata, supporting storage,
nested media retrieval, and type validation.
"""
from __future__ import annotations
import os
@@ -18,6 +22,16 @@ from loguru import logger
@dataclass_json # annotation order matters
@dataclass
class Media:
"""
Represents a media file with associated properties and storage details.
Attributes:
- filename: The file path of the media.
- key: An optional identifier for the media.
- urls: A list of URLs where the media is stored or accessible.
- properties: Additional metadata or transformations for the media.
- _mimetype: The media's mimetype (e.g., image/jpeg, video/mp4).
"""
filename: str
key: str = None
urls: List[str] = field(default_factory=list)
@@ -40,8 +54,9 @@ class Media:
s.store(any_media, url, metadata=metadata)
def all_inner_media(self, include_self=False):
""" Media can be inside media properties, examples include transformations on original media.
This function returns a generator for all the inner media.
"""Retrieves all media, including nested media within properties or transformations on original media.
This function returns a generator for all the inner media.
"""
if include_self: yield self
for prop in self.properties.values():

View File

@@ -1,3 +1,13 @@
"""
Acts as a container for metadata and media objects associated with an archived item.
Key Functionalities:
- Store and retrieve metadata and associated media.
- Merge metadata objects with conflict resolution.
- Validate properties like URLs and timestamps.
- Manage and deduplicate media objects.
- Support flexible metadata querying and appending.
"""
from __future__ import annotations
import hashlib
@@ -25,7 +35,11 @@ class Metadata:
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
"""
merges two Metadata instances, will overwrite according to overwrite_left flag
Merges another `Metadata` instance into this one.
Conflicts are resolved based on the `overwrite_left` flag:
- If `True`, this instance's values are overwritten by `right`.
- If `False`, the inverse applies.
"""
if not right: return self
if overwrite_left:
@@ -191,4 +205,4 @@ class Metadata:
for r in results[1:]:
if len(r.media) > len(most_complete.media): most_complete = r
elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata): most_complete = r
return most_complete
return most_complete

View File

@@ -1,3 +1,9 @@
""" Orchestrates all archiving steps, including feeding items,
archiving them with specific archivers, enrichment, storage,
formatting, database operations and cleanup.
"""
from __future__ import annotations
from typing import Generator, Union, List
from urllib.parse import urlparse

View File

@@ -1,3 +1,9 @@
"""
Defines the Step abstract base class, which acts as a blueprint for steps in the archiving pipeline
by handling user configuration, validating the step's properties, and implementing dynamic instantiation.
"""
from __future__ import annotations
from dataclasses import dataclass
from inspect import ClassFoundException
@@ -10,6 +16,7 @@ class Step(ABC):
name: str = None
def __init__(self, config: dict) -> None:
# Initialises each step by reading the relevant entries
# reads the configs into object properties
# self.config = config[self.name]
for k, v in config.get(self.name, {}).items():
@@ -20,7 +27,9 @@ class Step(ABC):
def init(name: str, config: dict, child: Type[Step]) -> Step:
"""
looks into direct subclasses of child for name and returns such an object
Attempts to instantiate a subclass of the provided `child` type
matching the given `name`.
Raises ClassFoundException if no matching subclass is found.
TODO: cannot find subclasses of child.subclasses
"""
for sub in child.__subclasses__():
@@ -30,7 +39,9 @@ class Step(ABC):
def assert_valid_string(self, prop: str) -> None:
"""
receives a property name an ensures it exists and is a valid non-empty string, raises an exception if not
Receives a property name and ensures it exists and is a valid non-empty string,
raising an AssertionError if not.
TODO: replace assertions with custom exceptions
"""
assert hasattr(self, prop), f"property {prop} not found"
s = getattr(self, prop)