Basic docs structure for RTD

2026-06-11 20:58:29 +03:00 · 2025-01-15 21:45:29 +00:00
parent 05e0c9de93
commit d3eec5d90f
38 changed files with 1034 additions and 40 deletions
--- a/src/auto_archiver/__main__.py
+++ b/src/auto_archiver/__main__.py
@@ -1,3 +1,4 @@
+""" Entry point for the auto_archiver package. """
 from . import Config
 from . import ArchivingOrchestrator

--- a/src/auto_archiver/archivers/__init__.py
+++ b/src/auto_archiver/archivers/__init__.py
@@ -1,3 +1,10 @@
+"""
+Archivers are responsible for retrieving the content from various external platforms.
+They act as specialized modules, each tailored to interact with a specific platform,
+service, or data source. The archivers collectively enable the tool to comprehensively
+collect and preserve a variety of content types, such as posts, images, videos and metadata.
+
+"""
 from .archiver import Archiver
 from .telethon_archiver import TelethonArchiver
 from .twitter_archiver import TwitterArchiver
@@ -9,4 +16,4 @@ from .telegram_archiver import TelegramArchiver
 from .vk_archiver import VkArchiver
 from .youtubedl_archiver import YoutubeDLArchiver
 from .instagram_api_archiver import InstagramAPIArchiver
-from .bluesky_archiver import BlueskyArchiver
+from .bluesky_archiver import BlueskyArchiver
--- a/src/auto_archiver/archivers/archiver.py
+++ b/src/auto_archiver/archivers/archiver.py
@@ -1,3 +1,10 @@
+""" The `archiver` module defines the base functionality for implementing archivers in the media archiving framework.
+    Its `Archiver` base class provides common utility methods and a standard interface for archivers.
+
+    It also includes a factory method to initialize an archiver instance based on its name.
+
+
+"""
 from __future__ import annotations
 from abc import abstractmethod
 from dataclasses import dataclass
@@ -11,6 +18,11 @@ from ..core import Metadata, Step, ArchivingContext

@dataclass
 class Archiver(Step):
+    """
+    Base class for implementing archivers in the media archiving framework.
+    Subclasses must implement the `download` method to define platform-specific behavior.
+    """
+
    name = "archiver"

    def __init__(self, config: dict) -> None:
@@ -66,4 +78,5 @@ class Archiver(Step):
        return to_filename

    @abstractmethod
-    def download(self, item: Metadata) -> Metadata: pass
+    def download(self, item: Metadata) -> Metadata:
+        pass
--- a/src/auto_archiver/archivers/instagram_api_archiver.py
+++ b/src/auto_archiver/archivers/instagram_api_archiver.py
@@ -1,4 +1,15 @@
-import re, requests
+"""
+The `instagram_api_archiver` module provides tools for archiving various types of Instagram content
+using the [Instagrapi API](https://github.com/subzeroid/instagrapi).
+
+Connects to an Instagrapi API deployment and allows for downloading Instagram user profiles,
+posts, stories, highlights, and tagged content. It offers advanced configuration options for filtering
+data, reducing JSON output size, and handling large profiles.
+
+"""
+
+import re
+import requests
 from datetime import datetime
 from loguru import logger
 from retrying import retry
--- a/src/auto_archiver/archivers/instagram_archiver.py
+++ b/src/auto_archiver/archivers/instagram_archiver.py
@@ -1,3 +1,8 @@
+""" Uses the Instaloader library to download content from Instagram. This class handles both individual posts
+    and user profiles, downloading as much information as possible, including images, videos, text, stories,
+    highlights, and tagged posts. Authentication is required via username/password or a session file.
+
+"""
 import re, os, shutil, traceback
 import instaloader  # https://instaloader.github.io/as-module.html
 from loguru import logger
--- a/src/auto_archiver/archivers/instagram_tbot_archiver.py
+++ b/src/auto_archiver/archivers/instagram_tbot_archiver.py
@@ -1,3 +1,11 @@
+"""
+InstagramTbotArchiver Module
+
+This module provides functionality to archive Instagram content (posts, stories, etc.) using a Telegram bot (`instagram_load_bot`).
+It interacts with the Telegram API via the Telethon library to send Instagram URLs to the bot, which retrieves the
+relevant media and metadata. The fetched content is saved as `Media` objects in a temporary directory and returned as a
+`Metadata` object.
+"""

 import shutil
 from telethon.sync import TelegramClient
--- a/src/auto_archiver/archivers/tiktok_archiver.py
+++ b/src/auto_archiver/archivers/tiktok_archiver.py
@@ -1,4 +1,5 @@
 import json, os, traceback
+import tiktok_downloader
 from loguru import logger


--- a/src/auto_archiver/archivers/twitter_archiver.py
+++ b/src/auto_archiver/archivers/twitter_archiver.py
@@ -1,4 +1,4 @@
-import re, requests, mimetypes, json, math
+import re, requests, mimetypes, json
 from typing import Union
 from datetime import datetime
 from loguru import logger
--- a/src/auto_archiver/archivers/youtubedl_archiver.py
+++ b/src/auto_archiver/archivers/youtubedl_archiver.py
@@ -1,4 +1,23 @@
-import datetime, os, yt_dlp, pysubs2
+"""
+This defines an archiver implementation using `yt-dlp`.
+
+This module is responsible for downloading and processing media content from platforms
+supported by `yt-dlp`, such as YouTube, Facebook, and others. It provides functionality
+for retrieving videos, subtitles, comments, and other metadata, and it integrates with
+the broader archiving framework.
+
+### Features
+- Supports downloading videos and playlists.
+- Retrieves metadata like titles, descriptions, upload dates, and durations.
+- Downloads subtitles and comments when enabled.
+- Configurable options for handling live streams, proxies, and more.
+
+"""
+import datetime
+import os
+import pysubs2
+import yt_dlp
+
 from loguru import logger

 from . import Archiver
@@ -37,6 +56,7 @@ class YoutubeDLArchiver(Archiver):
    def download(self, item: Metadata) -> Metadata:
        url = item.get_url()

+        # Handle Facebook cookies if enabled
        if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
            logger.debug('Using Facebook cookie')
            yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
@@ -66,11 +86,12 @@ class YoutubeDLArchiver(Archiver):
            logger.debug(f'ytdlp exception which is normal for example a facebook page with images only will cause a IndexError: list index out of range. Exception is: \n  {e}')
            return False

-        # this time download
+        # This time download the content
        ydl = yt_dlp.YoutubeDL({**ydl_options, "getcomments": self.comments})
        #TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
        info = ydl.extract_info(url, download=True)

+        # Process entries (e.g., for playlists)
        if "entries" in info:
            entries = info.get("entries", [])
            if not len(entries):
@@ -78,9 +99,11 @@ class YoutubeDLArchiver(Archiver):
                return False
        else: entries = [info]

+        # Prepare enriched metadata
        result = Metadata()
        result.set_title(info.get("title"))
        if "description" in info: result.set_content(info["description"])
+        # Process individual entries
        for entry in entries:
            try:
                filename = ydl.prepare_filename(entry)
@@ -112,6 +135,7 @@ class YoutubeDLArchiver(Archiver):
                "timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz = datetime.timezone.utc)
            } for c in info.get("comments", [])])

+        # Set additional metadata
        if (timestamp := info.get("timestamp")):
            #TODO: fix deprecated timestamp, 
            timestamp = datetime.datetime.fromtimestamp(timestamp, tz = datetime.timezone.utc).isoformat()
@@ -120,6 +144,7 @@ class YoutubeDLArchiver(Archiver):
            upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
            result.set("upload_date", upload_date)

+        # Update status for success
        if self.end_means_success: result.success("yt-dlp")
        else: result.status = "yt-dlp"
        return result
--- a/src/auto_archiver/core/__init__.py
+++ b/src/auto_archiver/core/__init__.py
@@ -1,3 +1,6 @@
+""" Core modules to handle things such as orchestration, metadata and configs.
+
+"""
 from .metadata import Metadata
 from .media import Media
 from .step import Step
--- a/src/auto_archiver/core/config.py
+++ b/src/auto_archiver/core/config.py
@@ -1,4 +1,9 @@
+"""
+The Config class initializes and parses configurations for all other steps.
+It supports CLI argument parsing, loading from YAML file, and overrides to allow
+flexible setup in various environments.

+"""

 import argparse, yaml
 from dataclasses import dataclass, field
@@ -55,6 +60,7 @@ class Config:

            parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml')

+        # Iterate over all step subclasses to gather default configs and CLI arguments
        for configurable in self.configurable_parents:
            child: Step
            for child in configurable.__subclasses__():
--- a/src/auto_archiver/core/context.py
+++ b/src/auto_archiver/core/context.py
@@ -1,6 +1,21 @@
+""" ArchivingContext provides a global context for managing configurations and temporary data during the archiving process.
+
+This singleton class allows for:
+- Storing and retrieving key-value pairs that are accessible throughout the application lifecycle.
+- Marking certain values to persist across resets using `keep_on_reset`.
+- Managing temporary directories and other shared data used during the archiving process.
+
+### Key Features:
+- Creates a single global instance.
+- Reset functionality allows for clearing configurations, with options for partial or full resets.
+- Custom getters and setters for commonly used context values like temporary directories.
+
+"""
+
 class ArchivingContext:
    """
-    Singleton context class.
+    Singleton context class for managing global configurations and temporary data.
+
    ArchivingContext._get_instance() to retrieve it if needed
    otherwise just 
    ArchivingContext.set(key, value)
--- a/src/auto_archiver/core/media.py
+++ b/src/auto_archiver/core/media.py
@@ -1,3 +1,7 @@
+"""
+Manages media files and their associated metadata, supporting storage,
+nested media retrieval, and type validation.
+"""

 from __future__ import annotations
 import os
@@ -18,6 +22,16 @@ from loguru import logger
@dataclass_json  # annotation order matters
@dataclass
 class Media:
+    """
+    Represents a media file with associated properties and storage details.
+
+    Attributes:
+    - filename: The file path of the media.
+    - key: An optional identifier for the media.
+    - urls: A list of URLs where the media is stored or accessible.
+    - properties: Additional metadata or transformations for the media.
+    - _mimetype: The media's mimetype (e.g., image/jpeg, video/mp4).
+    """
    filename: str
    key: str = None
    urls: List[str] = field(default_factory=list)
@@ -40,8 +54,9 @@ class Media:
                s.store(any_media, url, metadata=metadata)

    def all_inner_media(self, include_self=False):
-        """ Media can be inside media properties, examples include transformations on original media.
-        This function returns a generator for all the inner media.        
+        """Retrieves all media, including nested media within properties or transformations on original media.
+        This function returns a generator for all the inner media.
+
        """
        if include_self: yield self
        for prop in self.properties.values():
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@@ -1,3 +1,13 @@
+"""
+Acts as a container for metadata and media objects associated with an archived item.
+
+Key Functionalities:
+- Store and retrieve metadata and associated media.
+- Merge metadata objects with conflict resolution.
+- Validate properties like URLs and timestamps.
+- Manage and deduplicate media objects.
+- Support for flexible metadata querying and appending.
+"""

 from __future__ import annotations
 import hashlib
@@ -25,7 +35,11 @@ class Metadata:

    def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
        """
-        merges two Metadata instances, will overwrite according to overwrite_left flag
+        Merges another `Metadata` instance into this one.
+
+        Conflicts are resolved based on the `overwrite_left` flag:
+        - If `True`, this instance's values are overwritten by `right`.
+        - If `False`, the inverse applies.
        """
        if not right: return self
        if overwrite_left:
@@ -191,4 +205,4 @@ class Metadata:
        for r in results[1:]:
            if len(r.media) > len(most_complete.media): most_complete = r
            elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata): most_complete = r
-        return most_complete
+        return most_complete
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -1,3 +1,9 @@
+""" Orchestrates all archiving steps, including feeding items,
+    archiving them with specific archivers, enrichment, storage,
+    formatting, database operations and clean up.
+
+"""
+
 from __future__ import annotations
 from typing import Generator, Union, List
 from urllib.parse import urlparse
--- a/src/auto_archiver/core/step.py
+++ b/src/auto_archiver/core/step.py
@@ -1,3 +1,9 @@
+"""
+Defines the Step abstract base class, which acts as a blueprint for steps in the archiving pipeline
+by handling user configuration, validating the step's properties, and implementing dynamic instantiation.
+
+"""
+
 from __future__ import annotations
 from dataclasses import dataclass
 from inspect import ClassFoundException
@@ -10,6 +16,7 @@ class Step(ABC):
    name: str = None

    def __init__(self, config: dict) -> None:
+        # Initialises each step by reading the relevant entries
        # reads the configs into object properties
        # self.config = config[self.name]
        for k, v in config.get(self.name, {}).items():
@@ -20,7 +27,9 @@ class Step(ABC):

    def init(name: str, config: dict, child: Type[Step]) -> Step:
        """
-        looks into direct subclasses of child for name and returns such an object
+        Attempts to instantiate a subclass of the provided `child` type
+        matching the given `name`.
+        Raises ClassFoundException if no matching subclass is found.
        TODO: cannot find subclasses of child.subclasses
        """
        for sub in child.__subclasses__():
@@ -30,7 +39,9 @@ class Step(ABC):

    def assert_valid_string(self, prop: str) -> None:
        """
-        receives a property name an ensures it exists and is a valid non-empty string, raises an exception if not
+        Receives a property name and ensures it exists and is a valid non-empty string,
+        raising an AssertionError if not.
+        TODO: replace assertions with custom exceptions
        """
        assert hasattr(self, prop), f"property {prop} not found"
        s = getattr(self, prop)
--- a/src/auto_archiver/databases/__init__.py
+++ b/src/auto_archiver/databases/__init__.py
@@ -1,3 +1,7 @@
+""" Databases are used to store the outputs from running the Auto Archiver.
+
+
+"""
 from .database import Database
 from .gsheet_db import GsheetsDb
 from .console_db import ConsoleDb
--- a/src/auto_archiver/databases/api_db.py
+++ b/src/auto_archiver/databases/api_db.py
@@ -32,7 +32,9 @@ class AAApiDb(Database):
            "tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
        }
    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
-        """ query the database for the existence of this item"""
+        """ query the database for the existence of this item.
+            Helps avoid re-archiving the same URL multiple times.
+        """
        if not self.allow_rearchive: return
        
        params = {"url": item.get_url(), "limit": 15}
--- a/src/auto_archiver/enrichers/__init__.py
+++ b/src/auto_archiver/enrichers/__init__.py
@@ -1,3 +1,15 @@
+"""
+Enrichers are modular components that enhance archived content by adding
+context, metadata, or additional processing.
+
+These add additional information to the context, such as screenshots, hashes, and metadata.
+They are designed to work within the archiving pipeline, operating on `Metadata` objects after
+the archiving step and before storage or formatting.
+
+Enrichers are optional but highly useful for making the archived data more powerful.
+
+
+"""
 from .enricher import Enricher
 from .screenshot_enricher import ScreenshotEnricher 
 from .wayback_enricher import WaybackArchiverEnricher
--- a/src/auto_archiver/enrichers/enricher.py
+++ b/src/auto_archiver/enrichers/enricher.py
@@ -1,3 +1,5 @@
+""" Base classes and utilities for enrichers in the Auto-Archiver system.
+"""
 from __future__ import annotations
 from dataclasses import dataclass
 from abc import abstractmethod, ABC
--- a/src/auto_archiver/enrichers/hash_enricher.py
+++ b/src/auto_archiver/enrichers/hash_enricher.py
@@ -1,3 +1,12 @@
+""" Hash Enricher for generating cryptographic hashes of media files.
+
+The `HashEnricher` calculates cryptographic hashes (e.g., SHA-256, SHA3-512)
+for media files stored in `Metadata` objects. These hashes are used for
+validating content integrity, ensuring data authenticity, and identifying
+exact duplicates. The hash is computed by reading the file's bytes in chunks,
+making it suitable for handling large files efficiently.
+
+"""
 import hashlib
 from loguru import logger

--- a/src/auto_archiver/enrichers/pdq_hash_enricher.py
+++ b/src/auto_archiver/enrichers/pdq_hash_enricher.py
@@ -1,3 +1,15 @@
+"""
+PDQ Hash Enricher for generating perceptual hashes of media files.
+
+The `PdqHashEnricher` processes media files (e.g., images) in `Metadata`
+objects and calculates perceptual hashes using the PDQ hashing algorithm.
+These hashes are designed specifically for images and can be used
+for detecting duplicate or near-duplicate visual content.
+
+This enricher is typically used after thumbnail or screenshot enrichers
+to ensure images are available for hashing.
+
+"""
 import traceback
 import pdqhash
 import numpy as np
--- a/src/auto_archiver/enrichers/thumbnail_enricher.py
+++ b/src/auto_archiver/enrichers/thumbnail_enricher.py
@@ -1,3 +1,11 @@
+"""Thumbnail Enricher for generating visual previews of video files.
+
+The `ThumbnailEnricher` processes video files in `Metadata` objects and
+creates evenly distributed thumbnail images. These thumbnails provide
+visual snapshots of the video's keyframes, helping users preview content
+and identify important moments without watching the entire video.
+
+"""
 import ffmpeg, os
 from loguru import logger

--- a/src/auto_archiver/feeders/__init__.py
+++ b/src/auto_archiver/feeders/__init__.py
@@ -1,3 +1,6 @@
+""" Feeders handle the input of media into the Auto Archiver.
+
+"""
 from.feeder import Feeder
 from .gsheet_feeder import GsheetsFeeder
 from .cli_feeder import CLIFeeder
--- a/src/auto_archiver/feeders/gsheet_feeder.py
+++ b/src/auto_archiver/feeders/gsheet_feeder.py
@@ -1,3 +1,13 @@
+"""
+GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver.
+
+This reads data from Google Sheets and filters rows based on user-defined rules.
+The filtered rows are processed into `Metadata` objects.
+
+### Key properties
+- Validates the sheet's structure and filters rows based on input configurations.
+- Ensures only rows with valid URLs and unprocessed statuses are included.
+"""
 import gspread, os

 from loguru import logger
--- a/src/auto_archiver/formatters/__init__.py
+++ b/src/auto_archiver/formatters/__init__.py
@@ -1,3 +1,4 @@
+""" Formatters for the output of the content. """
 from .formatter import Formatter
 from .html_formatter import HtmlFormatter
 from .mute_formatter import MuteFormatter
--- a/src/auto_archiver/storages/__init__.py
+++ b/src/auto_archiver/storages/__init__.py
@@ -1,3 +1,6 @@
+""" This module contains the storage classes for the auto-archiver.
+
+"""
 from .storage import Storage
 from .s3 import S3Storage
 from .local import LocalStorage
--- a/src/auto_archiver/utils/__init__.py
+++ b/src/auto_archiver/utils/__init__.py
@@ -1,3 +1,4 @@
+""" Auto Archiver Utilities. """
 # we need to explicitly expose the available imports here
 from .gworksheet import GWorksheet
 from .misc import *
--- a/src/auto_archiver/utils/webdriver.py
+++ b/src/auto_archiver/utils/webdriver.py
@@ -1,3 +1,4 @@
+""" This Webdriver class acts as a context manager for the selenium webdriver. """
 from __future__ import annotations
 from selenium import webdriver
 from selenium.common.exceptions import TimeoutException