Further addition to docs: creating modules, configurations, installation

2026-06-08 03:18:28 +03:00 · 2025-02-11 13:49:30 +00:00
parent 7d87b858d6
commit 2f51d3917a
10 changed files with 266 additions and 204 deletions
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -21,13 +21,12 @@ language = 'en'
 # -- General configuration ---------------------------------------------------
 extensions = [
    "myst_parser",                  # Markdown support
    "autoapi.extension",            # Generate API documentation from docstrings
    "sphinxcontrib.mermaid",        # Mermaid diagrams
    "myst_parser",                  # Markdown support
    "sphinx.ext.viewcode",          # Source code links
    "sphinx.ext.napoleon",          # Google-style and NumPy-style docstrings
    "sphinx.ext.autosectionlabel",
    # "sphinx.ext.autodoc",           # Include custom docstrings
    # 'sphinx.ext.autosummary',       # Summarize module/class/function docs
 ]
@@ -55,7 +54,6 @@ autoapi_options = [
 # -- Markdown Support --------------------------------------------------------
 myst_enable_extensions = [
    "colon_fence",          # ::: fences
    "deflist",              # Definition lists
    "html_admonition",      # HTML-style admonitions
    "html_image",           # Inline HTML images
@@ -63,7 +61,6 @@ myst_enable_extensions = [
    "smartquotes",          # Smart quotes
    "linkify",              # Auto-detect links
    "substitution",         # Text substitutions
    "attrs_block",
 ]
 myst_heading_anchors = 2
 myst_fence_as_directive = ["mermaid"]
--- a/docs/source/development/creating_modules.md
+++ b/docs/source/development/creating_modules.md
@@ -0,0 +1,59 @@
 # Creating Your Own Modules
 Modules are what's used to extend `auto-archiver` to process different websites or media, and/or transform the data in a way that suits your needs. In most cases, the [](../core_modules.md) should be sufficient for everyday use, but the most common use-cases for making your own Modules include:
 1. Extracting data from a website which doesn't work with the current core extractors.
 2. Enriching or altering the data before saving with additional information that the core enrichers do not offer.
 3. Storing your data in a different format/location from what the core storage providers offer.
 ## Setting up the folder structure
 1. First, decide what type of module you wish to create. Check the types of modules on the [](../core_modules.md) page to decide what type you need. (Note: a module can be more than one type, more on that below)
 2. Create a new python package (a folder) with the name of your module (in this tutorial, we'll call it `awesome_extractor`).
 3. Create the `__manifest__.py` and the `awesome_extractor.py` files in this folder.
 When done, you should have a module structure as follows:
 ```
 .
 ├── awesome_extractor
 │   ├── __manifest__.py
 │   └── awesome_extractor.py
 ``` 
 Check out the [core modules](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules) in the `auto-archiver` repository for examples of the folder structure for real-world modules.
 ## Populating the Manifest File
 The manifest file is where you define the core information of your module. It is a Python dict containing important information. Here's an example file:
 ```{code} python
 :filename: myfile.py
 def setup():
   pass
 ```
 ```{include} ../../../tests/data/test_modules/example_module/__manifest__.py
 :name: __manifest__.py
 :literal:
 :parser: python
 ```
 ## Creating the Python Code
 The next step is to create your module code. First, create a class that subclasses one of the base module types from `auto_archiver.core`. Here's an example class for the `awesome_extractor` module, which is an `extractor`:
 ```{code-block} python
 :caption: awesome_extractor.py
 from auto_archiver.core import Extractor, Metadata
 class AwesomeExtractor(Extractor):
    def download(self, item: Metadata) -> Metadata | bool:
      url = item.get_url()
      # download the content and create the metadata object
      metadata = ...
      return metadata
 ```
--- a/docs/source/development/developer_guidelines.md
+++ b/docs/source/development/developer_guidelines.md
@@ -26,7 +26,7 @@ Install development packages (used for unit tests etc.) using:
 ```{toctree}
 :hidden:
-
+creating_modules
 docker_development
 testing
 docs
--- a/docs/source/example.orchestration.yaml
+++ b/docs/source/example.orchestration.yaml
@@ -0,0 +1,79 @@
 # Auto Archiver Configuration
 # Steps are the modules that will be run in the order they are defined
 steps:
  feeders:
  - cli_feeder
  extractors:
  - generic_extractor
  - telegram_extractor
  enrichers:
  - thumbnail_enricher
  - meta_enricher
  - pdq_hash_enricher
  - ssl_enricher
  - hash_enricher
  databases:
  - console_db
  - csv_db
  storages:
  - local_storage
  formatters:
  - html_formatter
 # Global configuration
 # Authentication
 # a dictionary of authentication information that can be used by extractors to log in to websites.
 # you can use a comma-separated list for multiple domains on the same line (common use case: x.com,twitter.com)
 # Common login 'types' are username/password, cookie, api key/token.
 # There are two special keys for using cookies, they are: cookies_file and cookies_from_browser. 
 # Some Examples:
 # facebook.com:
 #   username: "my_username"
 #   password: "my_password"
 # or for a site that uses an API key:
 # twitter.com,x.com:
 #   api_key: "my_api_key"
 #   api_secret: "my_api_secret"
 # youtube.com:
 #   cookie: "login_cookie=value ; other_cookie=123" # multiple 'key=value' pairs should be separated by ;
 authentication: {}
 # Logging settings for your project. See the logging settings with --help
 logging:
  level: INFO
 # These are the global configurations that are used by the modules
  file:
  rotation:
 local_storage:
  path_generator: flat
  filename_generator: static
  save_to: ./local_archive
  save_absolute: false
 html_formatter:
  detect_thumbnails: true
 thumbnail_enricher:
  thumbnails_per_minute: 60
  max_thumbnails: 16
 generic_extractor:
  subtitles: true
  comments: false
  livestreams: false
  live_from_start: false
  proxy: ''
  end_means_success: true
  allow_playlist: false
  max_downloads: inf
 csv_db:
  csv_file: db.csv
 ssl_enricher:
  skip_when_nothing_archived: true
 hash_enricher:
  algorithm: SHA-256
  chunksize: 16000000
--- a/docs/source/installation/configurations.md
+++ b/docs/source/installation/configurations.md
@@ -0,0 +1,100 @@
 # Configuring
 ```{toctree}
 :hidden:
 configurations
 ```
 This section of the documentation provides guidelines for configuring the tool.
 ## Configuring from the Command Line
 You can run auto-archiver directly from the command line, without the need for a configuration file. Command line arguments are parsed using the format `module_name.config_value`. For example, a config value of `api_key` in the `instagram_extractor` module would be passed on the command line with the flag `--instagram_extractor.api_key=API_KEY`.
 The command line arguments are useful for testing or editing config values and enabling/disabling modules on the fly. When you are happy with your settings, you can store them back in your configuration file by passing the `-s/--store` flag on the command line.
 ```bash
 auto-archiver --instagram_extractor.api_key=123 --other_module.setting --store
 # will store the new settings into the configuration file (default: orchestration.yaml)
 ```
 ## Configuring using a file
 The recommended way to configure auto-archiver for long-term and deployed projects is a configuration file, typically called `orchestration.yaml`. This is a YAML file containing all the settings for your entire workflow.
 A default `orchestration.yaml` will be created for you the first time you run auto-archiver (without any arguments). Here's what it looks like:
 <details>
 <summary>View example.orchestration.yaml</summary>
 ```{literalinclude} ../example.orchestration.yaml
   :language: yaml
   :caption: example.orchestration.yaml
 ```
 </details>
 ## Core Module Configuration
 View the configurable settings for the core modules on the individual doc pages for each [](../core_modules.md).
 You can also view all settings available for the modules you have on your system using the `--help` flag in auto-archiver.
 ```{code-block} console
 :caption: Example output when using the --help flag with auto-archiver
 $ auto-archiver --help
 ...
 Positional Arguments:
  urls                  URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml
 Options:
  --help, -h            show a full help message and exit
  --version             show program's version number and exit
  --config CONFIG_FILE  the filename of the YAML configuration file (defaults to 'config.yaml')
  --mode {simple,full}  the mode to run the archiver in
  -s, --store, --no-store
                        Store the created config in the config file
  --module_paths MODULE_PATHS [MODULE_PATHS ...]
                        additional paths to search for modules
  --feeders STEPS.FEEDERS [STEPS.FEEDERS ...]
                        the feeders to use
  --enrichers STEPS.ENRICHERS [STEPS.ENRICHERS ...]
                        the enrichers to use
  --extractors STEPS.EXTRACTORS [STEPS.EXTRACTORS ...]
                        the extractors to use
  --databases STEPS.DATABASES [STEPS.DATABASES ...]
                        the databases to use
  --storages STEPS.STORAGES [STEPS.STORAGES ...]
                        the storages to use
  --formatters STEPS.FORMATTERS [STEPS.FORMATTERS ...]
                        the formatter to use
  --authentication AUTHENTICATION
                        A dictionary of sites and their authentication methods (token, username etc.) that extractors can use to log into a website. If passing this on the command line, use a JSON string. You may
                        also pass a path to a valid JSON/YAML file which will be parsed.
  --logging.level {INFO,DEBUG,ERROR,WARNING}
                        the logging level to use
  --logging.file LOGGING.FILE
                        the logging file to write to
  --logging.rotation LOGGING.ROTATION
                        the logging rotation to use
 Wayback Machine Enricher:
  Submits the current URL to the Wayback Machine for archiving and returns either a job ID or the...
  --wayback_extractor_enricher.timeout TIMEOUT
                        seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.
  --wayback_extractor_enricher.if_not_archived_within IF_NOT_ARCHIVED_WITHIN
                        only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information:
                        https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA
  --wayback_extractor_enricher.key KEY
                        wayback API key. to get credentials visit https://archive.org/account/s3.php
  --wayback_extractor_enricher.secret SECRET
                        wayback API secret. to get credentials visit https://archive.org/account/s3.php
  --wayback_extractor_enricher.proxy_http PROXY_HTTP
                        http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port
  --wayback_extractor_enricher.proxy_https PROXY_HTTPS
                        https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port
 ```
--- a/docs/source/installation/configurations.rst
+++ b/docs/source/installation/configurations.rst
@@ -1,41 +0,0 @@
 Configurations
 ==============
 ```{toctree}
 :hidden:
 configurations
 ```
 This section of the documentation provides guidelines for configuring the tool.
 File Reference
 --------------
 Below is the content of the `example.orchestration.yaml` file:
 .. raw:: html
    <details>
    <summary>View example.orchestration.yaml</summary>
 .. literalinclude:: ../../example.orchestration.yaml
   :language: yaml
   :caption: example.orchestration.yaml
 .. raw:: html
    </details>
 Configs
 -------
 This section of the documentation will show the custom configurations for the individual steps of the tool.
 .. include:: _auto/configs.rst
--- a/docs/source/installation/installation.md
+++ b/docs/source/installation/installation.md
@@ -1,5 +1,10 @@
 # Installing Auto Archiver
 ```{toctree}
 :maxdepth: 1
 configurations.md
 ```
 There are 3 main ways to use the auto-archiver:
 1. Easiest: [via docker](#installing-with-docker)
--- a/example.orchestration.yaml
+++ b/example.orchestration.yaml
@@ -1,156 +0,0 @@
 steps:
  # only 1 feeder allowed
  feeder: gsheet_feeder # defaults to cli_feeder
  archivers: # order matters, uncomment to activate
    - bluesky_archiver
    # - vk_archiver
    # - telethon_archiver
    # - telegram_archiver
    # - twitter_archiver
    # - twitter_api_archiver
    # - instagram_api_archiver
    # - instagram_tbot_archiver
    # - instagram_archiver
    # - tiktok_archiver
    - youtubedl_archiver
    # - wayback_archiver_enricher
    # - wacz_archiver_enricher
  enrichers:
    - hash_enricher
    # - meta_enricher
    # - metadata_enricher
    # - screenshot_enricher
    # - pdq_hash_enricher
    # - ssl_enricher
    # - timestamping_enricher
    # - whisper_enricher
    # - thumbnail_enricher
    # - wayback_archiver_enricher
    # - wacz_archiver_enricher
    # - pdq_hash_enricher # if you want to calculate hashes for thumbnails, include this after thumbnail_enricher
  formatter: html_formatter # defaults to mute_formatter
  storages:
    - local_storage
    # - s3_storage
    # - gdrive_storage
  databases:
    - console_db
    # - csv_db
    # - gsheet_db
    # - mongo_db
 configurations:
  gsheet_feeder:
    sheet: "your sheet name"
    header: 1
    service_account: "secrets/service_account.json"
    # allow_worksheets: "only parse this worksheet"
    # block_worksheets: "blocked sheet 1,blocked sheet 2"
    use_sheet_names_in_stored_paths: false
    columns:
      url: link
      status: archive status
      folder: destination folder
      archive: archive location
      date: archive date
      thumbnail: thumbnail
      timestamp: upload timestamp
      title: upload title
      text: textual content
      screenshot: screenshot
      hash: hash
      pdq_hash: perceptual hashes
      wacz: wacz
      replaywebpage: replaywebpage
  instagram_tbot_archiver:
    api_id: "TELEGRAM_BOT_API_ID"
    api_hash: "TELEGRAM_BOT_API_HASH"
    # session_file: "secrets/anon"
  telethon_archiver:
    api_id: "TELEGRAM_BOT_API_ID"
    api_hash: "TELEGRAM_BOT_API_HASH"
    # session_file: "secrets/anon"
    join_channels: false
    channel_invites: # if you want to archive from private channels
      - invite: https://t.me/+123456789
        id: 0000000001
      - invite: https://t.me/+123456788
        id: 0000000002
  twitter_api_archiver:
    # either bearer_token only
    bearer_token: "TWITTER_BEARER_TOKEN"
    # OR all of the below
    # consumer_key: ""
    # consumer_secret: ""
    # access_token: ""
    # access_secret: ""
  instagram_archiver:
    username: "INSTAGRAM_USERNAME"
    password: "INSTAGRAM_PASSWORD"
    # session_file: "secrets/instaloader.session"
  vk_archiver:
    username: "or phone number"
    password: "vk pass"
    session_file: "secrets/vk_config.v2.json"
  youtubedl_archiver:
    subtitles: true
    # use one of the following two methods to authenticate in youtube - either provide a cookies file or use the cookies of the given browser
    # for more information, see https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp
    # cookie_file: "secrets/youtube_cookies.txt"
    # cookies_from_browser: firefox
    # proxy: socks5://proxy-user:password@proxy-ip:port
  screenshot_enricher:
    width: 1280
    height: 2300
    # to save as pdf, uncomment the following lines and adjust the print options
    # save_to_pdf: true
    # print_options:
      # for all options see https://www.selenium.dev/selenium/docs/api/py/webdriver/selenium.webdriver.common.print_page_options.html
      # background: true
      # orientation: "portrait"
      # scale: 1
      # page_width: 8.5in
      # page_height: 11in
      # margin_top: 0.4in
      # margin_bottom: 0.4in
      # margin_left: 0.4in
      # margin_right: 0.4in
      # page_ranges: ""
      # shrink_to_fit: true
  wayback_archiver_enricher:
    timeout: 10
    key: "wayback key"
    secret: "wayback secret"
  hash_enricher:
    algorithm: "SHA3-512" # can also be SHA-256
  wacz_archiver_enricher:
    profile: secrets/profile.tar.gz
  local_storage:
    save_to: "./local_archive"
    save_absolute: true
    filename_generator: static
    path_generator: flat
  s3_storage:
    bucket: your-bucket-name
    region: reg1
    key: S3_KEY
    secret: S3_SECRET
    endpoint_url: "https://{region}.digitaloceanspaces.com"
    cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
    # if private:true S3 urls will not be readable online
    private: false
    # with 'random' you can generate a random UUID for the URL instead of a predictable path, useful to still have public but unlisted files, alternative is 'default' or not omitted from config
    key_path: random
  gdrive_storage:
    path_generator: url
    filename_generator: random
    root_folder_id: folder_id_from_url
    oauth_token: secrets/gd-token.json # needs to be generated with scripts/create_update_gdrive_oauth_token.py
    service_account: "secrets/service_account.json"
  csv_db:
    csv_file: "./local_archive/db.csv"
--- a/src/auto_archiver/core/config.py
+++ b/src/auto_archiver/core/config.py
@@ -48,6 +48,7 @@ authentication: {}
 logging:
  level: INFO
 """)
 # note: 'logging' is explicitly added above in order to better format the config file
--- a/tests/data/test_modules/example_module/manifest.py
+++ b/tests/data/test_modules/example_module/manifest.py
@@ -1,11 +1,29 @@
 {
    # Display Name of your module
    "name": "Example Module",
    # The author of your module (optional)
    "author": "John Doe",
    # Optional version number, for your own versioning purposes
    "version": 2.0,
    # The type of the module, must be one (or more) of the built in module types
    "type": ["extractor", "feeder", "formatter", "storage", "enricher", "database"],
    # a boolean indicating whether or not a module requires additional user setup before it can be used
    # for example: adding API keys, installing additional software etc.
    "requires_setup": False,
-    "dependencies": {"python": ["loguru"]
+    # a dictionary of dependencies for this module, that must be installed before the module is loaded.
-                              },
+    # Can be python dependencies (external packages, or other auto-archiver modules), or you can
    # provide external bin dependencies (e.g. ffmpeg, docker etc.)
    "dependencies": {
        "python": ["loguru"],
        "bin": ["bash"],
        },
    # configurations that this module takes. These are argparse-compliant dictionaries that are
    # used to create command line arguments when the programme is run.
    # The full name of the config option will become: `module_name.config_name`
    "configs": {
            "csv_file": {"default": "db.csv", "help": "CSV file name"},
            "required_field": {"required": True, "help": "required field in the CSV file"},
        },
    # A description of the module, used for documentation
    "description": "This is an example module",
 }