mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 20:58:29 +03:00
Basic docs structure for RTD
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
""" Entry point for the auto_archiver package. """
|
||||
from . import Config
|
||||
from . import ArchivingOrchestrator
|
||||
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
"""
|
||||
Archivers are responsible for retrieving the content from various external platforms.
|
||||
They act as specialized modules, each tailored to interact with a specific platform,
|
||||
service, or data source. The archivers collectively enable the tool to comprehensively
|
||||
collect and preserve a variety of content types, such as posts, images, videos and metadata.
|
||||
|
||||
"""
|
||||
from .archiver import Archiver
|
||||
from .telethon_archiver import TelethonArchiver
|
||||
from .twitter_archiver import TwitterArchiver
|
||||
@@ -9,4 +16,4 @@ from .telegram_archiver import TelegramArchiver
|
||||
from .vk_archiver import VkArchiver
|
||||
from .youtubedl_archiver import YoutubeDLArchiver
|
||||
from .instagram_api_archiver import InstagramAPIArchiver
|
||||
from .bluesky_archiver import BlueskyArchiver
|
||||
from .bluesky_archiver import BlueskyArchiver
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
""" The `archiver` module defines the base functionality for implementing archivers in the media archiving framework.
|
||||
The `Archiver` base class provides common utility methods and a standard interface for archivers.
|
||||
|
||||
Factory method to initialize an archiver instance based on its name.
|
||||
|
||||
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from dataclasses import dataclass
|
||||
@@ -11,6 +18,11 @@ from ..core import Metadata, Step, ArchivingContext
|
||||
|
||||
@dataclass
|
||||
class Archiver(Step):
|
||||
"""
|
||||
Base class for implementing archivers in the media archiving framework.
|
||||
Subclasses must implement the `download` method to define platform-specific behavior.
|
||||
"""
|
||||
|
||||
name = "archiver"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
@@ -66,4 +78,5 @@ class Archiver(Step):
|
||||
return to_filename
|
||||
|
||||
@abstractmethod
|
||||
def download(self, item: Metadata) -> Metadata: pass
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
pass
|
||||
|
||||
@@ -1,4 +1,15 @@
|
||||
import re, requests
|
||||
"""
|
||||
The `instagram_api_archiver` module provides tools for archiving various types of Instagram content
|
||||
using the [Instagrapi API](https://github.com/subzeroid/instagrapi).
|
||||
|
||||
Connects to an Instagrapi API deployment and allows for downloading Instagram user profiles,
|
||||
posts, stories, highlights, and tagged content. It offers advanced configuration options for filtering
|
||||
data, reducing JSON output size, and handling large profiles.
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
import requests
|
||||
from datetime import datetime
|
||||
from loguru import logger
|
||||
from retrying import retry
|
||||
|
||||
@@ -1,3 +1,8 @@
|
||||
""" Uses the Instaloader library to download content from Instagram. This class handles both individual posts
|
||||
and user profiles, downloading as much information as possible, including images, videos, text, stories,
|
||||
highlights, and tagged posts. Authentication is required via username/password or a session file.
|
||||
|
||||
"""
|
||||
import re, os, shutil, traceback
|
||||
import instaloader # https://instaloader.github.io/as-module.html
|
||||
from loguru import logger
|
||||
|
||||
@@ -1,3 +1,11 @@
|
||||
"""
|
||||
InstagramTbotArchiver Module
|
||||
|
||||
This module provides functionality to archive Instagram content (posts, stories, etc.) using a Telegram bot (`instagram_load_bot`).
|
||||
It interacts with the Telegram API via the Telethon library to send Instagram URLs to the bot, which retrieves the
|
||||
relevant media and metadata. The fetched content is saved as `Media` objects in a temporary directory and returned as a
|
||||
`Metadata` object.
|
||||
"""
|
||||
|
||||
import shutil
|
||||
from telethon.sync import TelegramClient
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import json, os, traceback
|
||||
import tiktok_downloader
|
||||
from loguru import logger
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import re, requests, mimetypes, json, math
|
||||
import re, requests, mimetypes, json
|
||||
from typing import Union
|
||||
from datetime import datetime
|
||||
from loguru import logger
|
||||
|
||||
@@ -1,4 +1,23 @@
|
||||
import datetime, os, yt_dlp, pysubs2
|
||||
"""
|
||||
This defines an archiver implementation using `yt-dlp`.
|
||||
|
||||
This module is responsible for downloading and processing media content from platforms
|
||||
supported by `yt-dlp`, such as YouTube, Facebook, and others. It provides functionality
|
||||
for retrieving videos, subtitles, comments, and other metadata, and it integrates with
|
||||
the broader archiving framework.
|
||||
|
||||
### Features
|
||||
- Supports downloading videos and playlists.
|
||||
- Retrieves metadata like titles, descriptions, upload dates, and durations.
|
||||
- Downloads subtitles and comments when enabled.
|
||||
- Configurable options for handling live streams, proxies, and more.
|
||||
|
||||
"""
|
||||
import datetime
|
||||
import os
|
||||
import pysubs2
|
||||
import yt_dlp
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from . import Archiver
|
||||
@@ -37,6 +56,7 @@ class YoutubeDLArchiver(Archiver):
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
# Handle Facebook cookies if enabled
|
||||
if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
|
||||
logger.debug('Using Facebook cookie')
|
||||
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
|
||||
@@ -66,11 +86,12 @@ class YoutubeDLArchiver(Archiver):
|
||||
logger.debug(f'ytdlp exception which is normal for example a facebook page with images only will cause a IndexError: list index out of range. Exception is: \n {e}')
|
||||
return False
|
||||
|
||||
# this time download
|
||||
# This time download the content
|
||||
ydl = yt_dlp.YoutubeDL({**ydl_options, "getcomments": self.comments})
|
||||
#TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
|
||||
info = ydl.extract_info(url, download=True)
|
||||
|
||||
# Process entries (e.g., for playlists)
|
||||
if "entries" in info:
|
||||
entries = info.get("entries", [])
|
||||
if not len(entries):
|
||||
@@ -78,9 +99,11 @@ class YoutubeDLArchiver(Archiver):
|
||||
return False
|
||||
else: entries = [info]
|
||||
|
||||
# Prepare enriched metadata
|
||||
result = Metadata()
|
||||
result.set_title(info.get("title"))
|
||||
if "description" in info: result.set_content(info["description"])
|
||||
# Process individual entries
|
||||
for entry in entries:
|
||||
try:
|
||||
filename = ydl.prepare_filename(entry)
|
||||
@@ -112,6 +135,7 @@ class YoutubeDLArchiver(Archiver):
|
||||
"timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz = datetime.timezone.utc)
|
||||
} for c in info.get("comments", [])])
|
||||
|
||||
# Set additional metadata
|
||||
if (timestamp := info.get("timestamp")):
|
||||
#TODO: fix deprecated timestamp,
|
||||
timestamp = datetime.datetime.fromtimestamp(timestamp, tz = datetime.timezone.utc).isoformat()
|
||||
@@ -120,6 +144,7 @@ class YoutubeDLArchiver(Archiver):
|
||||
upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
|
||||
result.set("upload_date", upload_date)
|
||||
|
||||
# Update status for success
|
||||
if self.end_means_success: result.success("yt-dlp")
|
||||
else: result.status = "yt-dlp"
|
||||
return result
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
""" Core modules to handle things such as orchestration, metadata and configs..
|
||||
|
||||
"""
|
||||
from .metadata import Metadata
|
||||
from .media import Media
|
||||
from .step import Step
|
||||
|
||||
@@ -1,4 +1,9 @@
|
||||
"""
|
||||
The Config class initializes and parses configurations for all other steps.
|
||||
It supports CLI argument parsing, loading from YAML file, and overrides to allow
|
||||
flexible setup in various environments.
|
||||
|
||||
"""
|
||||
|
||||
import argparse, yaml
|
||||
from dataclasses import dataclass, field
|
||||
@@ -55,6 +60,7 @@ class Config:
|
||||
|
||||
parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml')
|
||||
|
||||
# Iterate over all step subclasses to gather default configs and CLI arguments
|
||||
for configurable in self.configurable_parents:
|
||||
child: Step
|
||||
for child in configurable.__subclasses__():
|
||||
|
||||
@@ -1,6 +1,21 @@
|
||||
""" ArchivingContext provides a global context for managing configurations and temporary data during the archiving process.
|
||||
|
||||
This singleton class allows for:
|
||||
- Storing and retrieving key-value pairs that are accessible throughout the application lifecycle.
|
||||
- Marking certain values to persist across resets using `keep_on_reset`.
|
||||
- Managing temporary directories and other shared data used during the archiving process.
|
||||
|
||||
### Key Features:
|
||||
- Creates a single global instance.
|
||||
- Reset functionality allows for clearing configurations, with options for partial or full resets.
|
||||
- Custom getters and setters for commonly used context values like temporary directories.
|
||||
|
||||
"""
|
||||
|
||||
class ArchivingContext:
|
||||
"""
|
||||
Singleton context class.
|
||||
Singleton context class for managing global configurations and temporary data.
|
||||
|
||||
ArchivingContext._get_instance() to retrieve it if needed
|
||||
otherwise just
|
||||
ArchivingContext.set(key, value)
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
"""
|
||||
Manages media files and their associated metadata, supporting storage,
|
||||
nested media retrieval, and type validation.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import os
|
||||
@@ -18,6 +22,16 @@ from loguru import logger
|
||||
@dataclass_json # annotation order matters
|
||||
@dataclass
|
||||
class Media:
|
||||
"""
|
||||
Represents a media file with associated properties and storage details.
|
||||
|
||||
Attributes:
|
||||
- filename: The file path of the media.
|
||||
- key: An optional identifier for the media.
|
||||
- urls: A list of URLs where the media is stored or accessible.
|
||||
- properties: Additional metadata or transformations for the media.
|
||||
- _mimetype: The media's mimetype (e.g., image/jpeg, video/mp4).
|
||||
"""
|
||||
filename: str
|
||||
key: str = None
|
||||
urls: List[str] = field(default_factory=list)
|
||||
@@ -40,8 +54,9 @@ class Media:
|
||||
s.store(any_media, url, metadata=metadata)
|
||||
|
||||
def all_inner_media(self, include_self=False):
|
||||
""" Media can be inside media properties, examples include transformations on original media.
|
||||
This function returns a generator for all the inner media.
|
||||
"""Retrieves all media, including nested media within properties or transformations on original media.
|
||||
This function returns a generator for all the inner media.
|
||||
|
||||
"""
|
||||
if include_self: yield self
|
||||
for prop in self.properties.values():
|
||||
|
||||
@@ -1,3 +1,13 @@
|
||||
"""
|
||||
Acts as a container for metadata and media objects associated with an archived item.
|
||||
|
||||
Key Functionalities:
|
||||
- Store and retrieve metadata and associated media.
|
||||
- Merge metadata objects with conflict resolution.
|
||||
- Validate properties like URLs and timestamps.
|
||||
- Manage and deduplicate media objects.
|
||||
- Support for flexible metadata querying and appending.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import hashlib
|
||||
@@ -25,7 +35,11 @@ class Metadata:
|
||||
|
||||
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
|
||||
"""
|
||||
merges two Metadata instances, will overwrite according to overwrite_left flag
|
||||
Merges another `Metadata` instance into this one.
|
||||
|
||||
Conflicts are resolved based on the `overwrite_left` flag:
|
||||
- If `True`, this instance's values are overwritten by `right`.
|
||||
- If `False`, the inverse applies.
|
||||
"""
|
||||
if not right: return self
|
||||
if overwrite_left:
|
||||
@@ -191,4 +205,4 @@ class Metadata:
|
||||
for r in results[1:]:
|
||||
if len(r.media) > len(most_complete.media): most_complete = r
|
||||
elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata): most_complete = r
|
||||
return most_complete
|
||||
return most_complete
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
""" Orchestrates all archiving steps, including feeding items,
|
||||
archiving them with specific archivers, enrichment, storage,
|
||||
formatting, database operations and clean up.
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Generator, Union, List
|
||||
from urllib.parse import urlparse
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
"""
|
||||
Defines the Step abstract base class, which acts as a blueprint for steps in the archiving pipeline
|
||||
by handling user configuration, validating the steps properties, and implementing dynamic instantiation.
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from inspect import ClassFoundException
|
||||
@@ -10,6 +16,7 @@ class Step(ABC):
|
||||
name: str = None
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
# Initialises each step by reading the relevant entries
|
||||
# reads the configs into object properties
|
||||
# self.config = config[self.name]
|
||||
for k, v in config.get(self.name, {}).items():
|
||||
@@ -20,7 +27,9 @@ class Step(ABC):
|
||||
|
||||
def init(name: str, config: dict, child: Type[Step]) -> Step:
|
||||
"""
|
||||
looks into direct subclasses of child for name and returns such an object
|
||||
Attempts to instantiate a subclass of the provided `child` type
|
||||
matching the given `name`.
|
||||
Raises ClassFoundException if no matching subclass is found.
|
||||
TODO: cannot find subclasses of child.subclasses
|
||||
"""
|
||||
for sub in child.__subclasses__():
|
||||
@@ -30,7 +39,9 @@ class Step(ABC):
|
||||
|
||||
def assert_valid_string(self, prop: str) -> None:
|
||||
"""
|
||||
receives a property name an ensures it exists and is a valid non-empty string, raises an exception if not
|
||||
Receives a property name and ensures it exists and is a valid non-empty string,
|
||||
raising an AssertionError if not.
|
||||
TODO: replace assertions with custom exceptions
|
||||
"""
|
||||
assert hasattr(self, prop), f"property {prop} not found"
|
||||
s = getattr(self, prop)
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
""" Databases are used to store the outputs from running the Autp Archiver.
|
||||
|
||||
|
||||
"""
|
||||
from .database import Database
|
||||
from .gsheet_db import GsheetsDb
|
||||
from .console_db import ConsoleDb
|
||||
|
||||
@@ -32,7 +32,9 @@ class AAApiDb(Database):
|
||||
"tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
|
||||
}
|
||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
||||
""" query the database for the existence of this item"""
|
||||
""" query the database for the existence of this item.
|
||||
Helps avoid re-archiving the same URL multiple times.
|
||||
"""
|
||||
if not self.allow_rearchive: return
|
||||
|
||||
params = {"url": item.get_url(), "limit": 15}
|
||||
|
||||
@@ -1,3 +1,15 @@
|
||||
"""
|
||||
Enrichers are modular components that enhance archived content by adding
|
||||
context, metadata, or additional processing.
|
||||
|
||||
These add additional information to the context, such as screenshots, hashes, and metadata.
|
||||
They are designed to work within the archiving pipeline, operating on `Metadata` objects after
|
||||
the archiving step and before storage or formatting.
|
||||
|
||||
Enrichers are optional but highly useful for making the archived data more powerful.
|
||||
|
||||
|
||||
"""
|
||||
from .enricher import Enricher
|
||||
from .screenshot_enricher import ScreenshotEnricher
|
||||
from .wayback_enricher import WaybackArchiverEnricher
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
""" Base classes and utilities for enrichers in the Auto-Archiver system.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from abc import abstractmethod, ABC
|
||||
|
||||
@@ -1,3 +1,12 @@
|
||||
""" Hash Enricher for generating cryptographic hashes of media files.
|
||||
|
||||
The `HashEnricher` calculates cryptographic hashes (e.g., SHA-256, SHA3-512)
|
||||
for media files stored in `Metadata` objects. These hashes are used for
|
||||
validating content integrity, ensuring data authenticity, and identifying
|
||||
exact duplicates. The hash is computed by reading the file's bytes in chunks,
|
||||
making it suitable for handling large files efficiently.
|
||||
|
||||
"""
|
||||
import hashlib
|
||||
from loguru import logger
|
||||
|
||||
|
||||
@@ -1,3 +1,15 @@
|
||||
"""
|
||||
PDQ Hash Enricher for generating perceptual hashes of media files.
|
||||
|
||||
The `PdqHashEnricher` processes media files (e.g., images) in `Metadata`
|
||||
objects and calculates perceptual hashes using the PDQ hashing algorithm.
|
||||
These hashes are designed specifically for images and can be used
|
||||
for detecting duplicate or near-duplicate visual content.
|
||||
|
||||
This enricher is typically used after thumbnail or screenshot enrichers
|
||||
to ensure images are available for hashing.
|
||||
|
||||
"""
|
||||
import traceback
|
||||
import pdqhash
|
||||
import numpy as np
|
||||
|
||||
@@ -1,3 +1,11 @@
|
||||
"""Thumbnail Enricher for generating visual previews of video files.
|
||||
|
||||
The `ThumbnailEnricher` processes video files in `Metadata` objects and
|
||||
creates evenly distributed thumbnail images. These thumbnails provide
|
||||
visual snapshots of the video's keyframes, helping users preview content
|
||||
and identify important moments without watching the entire video.
|
||||
|
||||
"""
|
||||
import ffmpeg, os
|
||||
from loguru import logger
|
||||
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
""" Feeders handle the input of media into the Auto Archiver.
|
||||
|
||||
"""
|
||||
from.feeder import Feeder
|
||||
from .gsheet_feeder import GsheetsFeeder
|
||||
from .cli_feeder import CLIFeeder
|
||||
|
||||
@@ -1,3 +1,13 @@
|
||||
"""
|
||||
GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver.
|
||||
|
||||
This reads data from Google Sheets and filters rows based on user-defined rules.
|
||||
The filtered rows are processed into `Metadata` objects.
|
||||
|
||||
### Key properties
|
||||
- Validates the sheet's structure and filters rows based on input configurations.
|
||||
- Ensures only rows with valid URLs and unprocessed statuses are included.
|
||||
"""
|
||||
import gspread, os
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
""" Formatters for the output of the content. """
|
||||
from .formatter import Formatter
|
||||
from .html_formatter import HtmlFormatter
|
||||
from .mute_formatter import MuteFormatter
|
||||
@@ -1,3 +1,6 @@
|
||||
""" This module contains the storage classes for the auto-archiver.
|
||||
|
||||
"""
|
||||
from .storage import Storage
|
||||
from .s3 import S3Storage
|
||||
from .local import LocalStorage
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
""" Auto Archiver Utilities. """
|
||||
# we need to explicitly expose the available imports here
|
||||
from .gworksheet import GWorksheet
|
||||
from .misc import *
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
""" This Webdriver class acts as a context manager for the selenium webdriver. """
|
||||
from __future__ import annotations
|
||||
from selenium import webdriver
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
|
||||
Reference in New Issue
Block a user