diff --git a/docs/source/conf.py b/docs/source/conf.py
index 54988ed..a749dfd 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -21,13 +21,12 @@ language = 'en'
# -- General configuration ---------------------------------------------------
extensions = [
+ "myst_parser", # Markdown support
"autoapi.extension", # Generate API documentation from docstrings
"sphinxcontrib.mermaid", # Mermaid diagrams
- "myst_parser", # Markdown support
"sphinx.ext.viewcode", # Source code links
"sphinx.ext.napoleon", # Google-style and NumPy-style docstrings
"sphinx.ext.autosectionlabel",
- # "sphinx.ext.autodoc", # Include custom docstrings
# 'sphinx.ext.autosummary', # Summarize module/class/function docs
]
@@ -55,7 +54,6 @@ autoapi_options = [
# -- Markdown Support --------------------------------------------------------
myst_enable_extensions = [
- "colon_fence", # ::: fences
"deflist", # Definition lists
"html_admonition", # HTML-style admonitions
"html_image", # Inline HTML images
@@ -63,7 +61,6 @@ myst_enable_extensions = [
"smartquotes", # Smart quotes
"linkify", # Auto-detect links
"substitution", # Text substitutions
- "attrs_block",
]
myst_heading_anchors = 2
myst_fence_as_directive = ["mermaid"]
diff --git a/docs/source/development/creating_modules.md b/docs/source/development/creating_modules.md
new file mode 100644
index 0000000..7ee65ef
--- /dev/null
+++ b/docs/source/development/creating_modules.md
@@ -0,0 +1,59 @@
+# Creating Your Own Modules
+
+Modules are what's used to extend `auto-archiver` to process different websites or media, and/or transform the data in a way that suits your needs. In most cases, the [](../core_modules.md) should be sufficient for everyday use, but the most common use-cases for making your own Modules include:
+
+1. Extracting data from a website which doesn't work with the current core extractors.
+2. Enriching or altering the data before saving with additional information that the core enrichers do not offer.
+3. Storing your data in a different format/location from what the core storage providers offer.
+
+## Setting up the folder structure
+
+1. First, decide what type of module you wish to create. Check the types of modules on the [](../core_modules.md) page to decide what type you need. (Note: a module can be more than one type, more on that below)
+2. Create a new python package (a folder) with the name of your module (in this tutorial, we'll call it `awesome_extractor`).
+3. Create the `__manifest__.py` and the `awesome_extractor.py` files in this folder.
+
+When done, you should have a module structure as follows:
+
+```
+.
+├── awesome_extractor
+│ ├── __manifest__.py
+│ └── awesome_extractor.py
+```
+
+Check out the [core modules](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules) in the `auto-archiver` repository for examples of the folder structure for real-world modules.
+
+## Populating the Manifest File
+
+The manifest file is where you define the core information of your module. It is a Python dict containing important information. Here's an example file:
+
+```{code} python
+:filename: myfile.py
+
+def setup():
+ pass
+```
+
+```{include} ../../../tests/data/test_modules/example_module/__manifest__.py
+:name: __manifest__.py
+:literal:
+:parser: python
+```
+
+## Creating the Python Code
+
+The next step is to create your module code. First, create a class that subclasses one of the base module types from `auto_archiver.core`. Here's an example class for the `awesome_extractor` module, which is an `extractor`:
+
+```{code-block} python
+:filename: awesome_extractor.py
+
+from auto_archiver.core import Extractor, Metadata
+
+class AwesomeExtractor(Extractor):
+
+ def download(self, item: Metadata) -> Metadata | False:
+ url = item.get_url()
+ # download the content and create the metadata object
+ metadata = ...
+ return metadata
+```
diff --git a/docs/source/development/developer_guidelines.md b/docs/source/development/developer_guidelines.md
index 26cf6f4..e72193a 100644
--- a/docs/source/development/developer_guidelines.md
+++ b/docs/source/development/developer_guidelines.md
@@ -26,7 +26,7 @@ Install development packages (used for unit tests etc.) using:
```{toctree}
:hidden:
-
+creating_modules
docker_development
testing
docs
diff --git a/docs/source/example.orchestration.yaml b/docs/source/example.orchestration.yaml
new file mode 100644
index 0000000..48d354d
--- /dev/null
+++ b/docs/source/example.orchestration.yaml
@@ -0,0 +1,79 @@
+# Auto Archiver Configuration
+# Steps are the modules that will be run in the order they are defined
+
+steps:
+ feeders:
+ - cli_feeder
+ extractors:
+ - generic_extractor
+ - telegram_extractor
+ enrichers:
+ - thumbnail_enricher
+ - meta_enricher
+ - pdq_hash_enricher
+ - ssl_enricher
+ - hash_enricher
+ databases:
+ - console_db
+ - csv_db
+ storages:
+ - local_storage
+ formatters:
+ - html_formatter
+
+# Global configuration
+
+# Authentication
+# a dictionary of authentication information that can be used by extractors to log in to websites.
+# you can use a comma separated list for multiple domains on the same line (common use case: x.com,twitter.com)
+# Common login 'types' are username/password, cookie, api key/token.
+# There are two special keys for using cookies, they are: cookies_file and cookies_from_browser.
+# Some Examples:
+# facebook.com:
+# username: "my_username"
+# password: "my_password"
+# or for a site that uses an API key:
+# twitter.com,x.com:
+# api_key
+# api_secret
+# youtube.com:
+# cookie: "login_cookie=value ; other_cookie=123" # multiple 'key=value' pairs should be separated by ;
+
+authentication: {}
+
+# Logging settings for your project. See the logging settings with --help
+
+logging:
+ level: INFO
+
+# These are the global configurations that are used by the modules
+
+ file:
+ rotation:
+local_storage:
+ path_generator: flat
+ filename_generator: static
+ save_to: ./local_archive
+ save_absolute: false
+html_formatter:
+ detect_thumbnails: true
+thumbnail_enricher:
+ thumbnails_per_minute: 60
+ max_thumbnails: 16
+generic_extractor:
+ subtitles: true
+ comments: false
+ livestreams: false
+ live_from_start: false
+ proxy: ''
+ end_means_success: true
+ allow_playlist: false
+ max_downloads: inf
+csv_db:
+ csv_file: db.csv
+ssl_enricher:
+ skip_when_nothing_archived: true
+hash_enricher:
+ algorithm: SHA-256
+ chunksize: 16000000
+
diff --git a/docs/source/installation/configurations.md b/docs/source/installation/configurations.md
new file mode 100644
index 0000000..ef82e2a
--- /dev/null
+++ b/docs/source/installation/configurations.md
@@ -0,0 +1,100 @@
+
+# Configuring
+
+
+```{toctree}
+:hidden:
+
+configurations
+```
+
+This section of the documentation provides guidelines for configuring the tool.
+
+## Configuring from the Command Line
+
+You can run auto-archiver directly from the command line, without the need for a configuration file. Command line arguments are parsed using the format `module_name.config_value`. For example, a config value of `api_key` in the `instagram_extractor` module would be passed on the command line with the flag `--instagram_extractor.api_key=API_KEY`.
+
+The command line arguments are useful for testing or editing config values and enabling/disabling modules on the fly. When you are happy with your settings, you can store them back in your configuration file by passing the `-s/--store` flag on the command line.
+
+```bash
+auto-archiver --instagram_extractor.api_key=123 --other_module.setting --store
+# will store the new settings into the configuration file (default: orchestration.yaml)
+```
+
+## Configuring using a file
+
+The recommended way to configure auto-archiver for long-term and deployed projects is a configuration file, typically called `orchestration.yaml`. This is a YAML file containing all the settings for your entire workflow.
+
+A default `orchestration.yaml` will be created for you the first time you run auto-archiver (without any arguments). Here's what it looks like:
+
+
+View example.orchestration.yaml
+
+```{literalinclude} ../example.orchestration.yaml
+ :language: yaml
+ :caption: example.orchestration.yaml
+```
+
+
+
+## Core Module Configuration
+
+View the configurable settings for the core modules on the individual doc pages for each [](../core_modules.md).
+You can also view all settings available for the modules you have on your system using the `--help` flag in auto-archiver.
+
+```{code-block} console
+:caption: Example output when using the --help flag with auto-archiver
+$ auto-archiver --help
+...
+Positional Arguments:
+ urls URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml
+
+Options:
+ --help, -h show a full help message and exit
+ --version show program's version number and exit
+ --config CONFIG_FILE the filename of the YAML configuration file (defaults to 'config.yaml')
+ --mode {simple,full} the mode to run the archiver in
+ -s, --store, --no-store
+ Store the created config in the config file
+ --module_paths MODULE_PATHS [MODULE_PATHS ...]
+ additional paths to search for modules
+ --feeders STEPS.FEEDERS [STEPS.FEEDERS ...]
+ the feeders to use
+ --enrichers STEPS.ENRICHERS [STEPS.ENRICHERS ...]
+ the enrichers to use
+ --extractors STEPS.EXTRACTORS [STEPS.EXTRACTORS ...]
+ the extractors to use
+ --databases STEPS.DATABASES [STEPS.DATABASES ...]
+ the databases to use
+ --storages STEPS.STORAGES [STEPS.STORAGES ...]
+ the storages to use
+ --formatters STEPS.FORMATTERS [STEPS.FORMATTERS ...]
+ the formatter to use
+ --authentication AUTHENTICATION
+ A dictionary of sites and their authentication methods (token, username etc.) that extractors can use to log into a website. If passing this on the command line, use a JSON string. You may
+ also pass a path to a valid JSON/YAML file which will be parsed.
+ --logging.level {INFO,DEBUG,ERROR,WARNING}
+ the logging level to use
+ --logging.file LOGGING.FILE
+ the logging file to write to
+ --logging.rotation LOGGING.ROTATION
+ the logging rotation to use
+
+Wayback Machine Enricher:
+ Submits the current URL to the Wayback Machine for archiving and returns either a job ID or the...
+
+ --wayback_extractor_enricher.timeout TIMEOUT
+ seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually.
+ --wayback_extractor_enricher.if_not_archived_within IF_NOT_ARCHIVED_WITHIN
+ only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information:
+ https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA
+ --wayback_extractor_enricher.key KEY
+ wayback API key. to get credentials visit https://archive.org/account/s3.php
+ --wayback_extractor_enricher.secret SECRET
+ wayback API secret. to get credentials visit https://archive.org/account/s3.php
+ --wayback_extractor_enricher.proxy_http PROXY_HTTP
+ http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port
+ --wayback_extractor_enricher.proxy_https PROXY_HTTPS
+ https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port
+```
+
diff --git a/docs/source/installation/configurations.rst b/docs/source/installation/configurations.rst
deleted file mode 100644
index e00dd94..0000000
--- a/docs/source/installation/configurations.rst
+++ /dev/null
@@ -1,41 +0,0 @@
-
-Configurations
-==============
-
-
-```{toctree}
-:hidden:
-
-configurations
-```
-
-This section of the documentation provides guidelines for configuring the tool.
-
-File Reference
---------------
-
-
-Below is the content of the `example.orchestration.yaml` file:
-
-.. raw:: html
-
-
- View example.orchestration.yaml
-
-.. literalinclude:: ../../example.orchestration.yaml
- :language: yaml
- :caption: example.orchestration.yaml
-
-.. raw:: html
-
-
-
-
-Configs
--------
-
-This section of the documentation will show the custom configurations for the individual steps of the tool.
-
-.. include:: _auto/configs.rst
-
-
diff --git a/docs/source/installation/installation.md b/docs/source/installation/installation.md
index 99c6bf6..3744d0a 100644
--- a/docs/source/installation/installation.md
+++ b/docs/source/installation/installation.md
@@ -1,5 +1,10 @@
# Installing Auto Archiver
+```{toctree}
+:maxdepth: 1
+
+configurations.md
+```
There are 3 main ways to use the auto-archiver:
1. Easiest: [via docker](#installing-with-docker)
diff --git a/example.orchestration.yaml b/example.orchestration.yaml
deleted file mode 100644
index f1eed2a..0000000
--- a/example.orchestration.yaml
+++ /dev/null
@@ -1,156 +0,0 @@
-steps:
- # only 1 feeder allowed
- feeder: gsheet_feeder # defaults to cli_feeder
- archivers: # order matters, uncomment to activate
- - bluesky_archiver
- # - vk_archiver
- # - telethon_archiver
- # - telegram_archiver
- # - twitter_archiver
- # - twitter_api_archiver
- # - instagram_api_archiver
- # - instagram_tbot_archiver
- # - instagram_archiver
- # - tiktok_archiver
- - youtubedl_archiver
- # - wayback_archiver_enricher
- # - wacz_archiver_enricher
- enrichers:
- - hash_enricher
- # - meta_enricher
- # - metadata_enricher
- # - screenshot_enricher
- # - pdq_hash_enricher
- # - ssl_enricher
- # - timestamping_enricher
- # - whisper_enricher
- # - thumbnail_enricher
- # - wayback_archiver_enricher
- # - wacz_archiver_enricher
- # - pdq_hash_enricher # if you want to calculate hashes for thumbnails, include this after thumbnail_enricher
- formatter: html_formatter # defaults to mute_formatter
- storages:
- - local_storage
- # - s3_storage
- # - gdrive_storage
- databases:
- - console_db
- # - csv_db
- # - gsheet_db
- # - mongo_db
-
-configurations:
- gsheet_feeder:
- sheet: "your sheet name"
- header: 1
- service_account: "secrets/service_account.json"
- # allow_worksheets: "only parse this worksheet"
- # block_worksheets: "blocked sheet 1,blocked sheet 2"
- use_sheet_names_in_stored_paths: false
- columns:
- url: link
- status: archive status
- folder: destination folder
- archive: archive location
- date: archive date
- thumbnail: thumbnail
- timestamp: upload timestamp
- title: upload title
- text: textual content
- screenshot: screenshot
- hash: hash
- pdq_hash: perceptual hashes
- wacz: wacz
- replaywebpage: replaywebpage
- instagram_tbot_archiver:
- api_id: "TELEGRAM_BOT_API_ID"
- api_hash: "TELEGRAM_BOT_API_HASH"
- # session_file: "secrets/anon"
- telethon_archiver:
- api_id: "TELEGRAM_BOT_API_ID"
- api_hash: "TELEGRAM_BOT_API_HASH"
- # session_file: "secrets/anon"
- join_channels: false
- channel_invites: # if you want to archive from private channels
- - invite: https://t.me/+123456789
- id: 0000000001
- - invite: https://t.me/+123456788
- id: 0000000002
-
- twitter_api_archiver:
- # either bearer_token only
- bearer_token: "TWITTER_BEARER_TOKEN"
- # OR all of the below
- # consumer_key: ""
- # consumer_secret: ""
- # access_token: ""
- # access_secret: ""
- instagram_archiver:
- username: "INSTAGRAM_USERNAME"
- password: "INSTAGRAM_PASSWORD"
- # session_file: "secrets/instaloader.session"
-
- vk_archiver:
- username: "or phone number"
- password: "vk pass"
- session_file: "secrets/vk_config.v2.json"
-
- youtubedl_archiver:
- subtitles: true
- # use one of the following two methods to authenticate in youtube - either provide a cookies file or use the cookies of the given browser
- # for more information, see https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp
- # cookie_file: "secrets/youtube_cookies.txt"
- # cookies_from_browser: firefox
- # proxy: socks5://proxy-user:password@proxy-ip:port
-
- screenshot_enricher:
- width: 1280
- height: 2300
- # to save as pdf, uncomment the following lines and adjust the print options
- # save_to_pdf: true
- # print_options:
- # for all options see https://www.selenium.dev/selenium/docs/api/py/webdriver/selenium.webdriver.common.print_page_options.html
- # background: true
- # orientation: "portrait"
- # scale: 1
- # page_width: 8.5in
- # page_height: 11in
- # margin_top: 0.4in
- # margin_bottom: 0.4in
- # margin_left: 0.4in
- # margin_right: 0.4in
- # page_ranges: ""
- # shrink_to_fit: true
-
- wayback_archiver_enricher:
- timeout: 10
- key: "wayback key"
- secret: "wayback secret"
- hash_enricher:
- algorithm: "SHA3-512" # can also be SHA-256
- wacz_archiver_enricher:
- profile: secrets/profile.tar.gz
- local_storage:
- save_to: "./local_archive"
- save_absolute: true
- filename_generator: static
- path_generator: flat
- s3_storage:
- bucket: your-bucket-name
- region: reg1
- key: S3_KEY
- secret: S3_SECRET
- endpoint_url: "https://{region}.digitaloceanspaces.com"
- cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}"
- # if private:true S3 urls will not be readable online
- private: false
- # with 'random' you can generate a random UUID for the URL instead of a predictable path, useful to still have public but unlisted files, alternative is 'default' or not omitted from config
- key_path: random
- gdrive_storage:
- path_generator: url
- filename_generator: random
- root_folder_id: folder_id_from_url
- oauth_token: secrets/gd-token.json # needs to be generated with scripts/create_update_gdrive_oauth_token.py
- service_account: "secrets/service_account.json"
- csv_db:
- csv_file: "./local_archive/db.csv"
diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py
index 9bb080f..c2d38ee 100644
--- a/src/auto_archiver/core/config.py
+++ b/src/auto_archiver/core/config.py
@@ -48,6 +48,7 @@ authentication: {}
logging:
level: INFO
+
""")
# note: 'logging' is explicitly added above in order to better format the config file
diff --git a/tests/data/test_modules/example_module/__manifest__.py b/tests/data/test_modules/example_module/__manifest__.py
index f2ebdbf..e3a26bb 100644
--- a/tests/data/test_modules/example_module/__manifest__.py
+++ b/tests/data/test_modules/example_module/__manifest__.py
@@ -1,11 +1,29 @@
{
+ # Display Name of your module
"name": "Example Module",
+ # The author of your module (optional)
+ "author": "John Doe",
+ # Optional version number, for your own versioning purposes
+ "version": 2.0,
+ # The type of the module, must be one (or more) of the built in module types
"type": ["extractor", "feeder", "formatter", "storage", "enricher", "database"],
+ # a boolean indicating whether or not a module requires additional user setup before it can be used
+ # for example: adding API keys, installing additional software etc.
"requires_setup": False,
- "dependencies": {"python": ["loguru"]
- },
+ # a dictionary of dependencies for this module, that must be installed before the module is loaded.
+ # Can be python dependencies (external packages, or other auto-archiver modules), or you can
+ # provide external bin dependencies (e.g. ffmpeg, docker etc.)
+ "dependencies": {
+ "python": ["loguru"],
+ "bin": ["bash"],
+ },
+ # configurations that this module takes. These are argparse-compliant dictionaries that are
+ # used to create command line arguments when the programme is run.
+ # The full name of the config option will become: `module_name.config_name`
"configs": {
"csv_file": {"default": "db.csv", "help": "CSV file name"},
"required_field": {"required": True, "help": "required field in the CSV file"},
},
+ # A description of the module, used for documentation
+ "description": "This is an example module",
}
\ No newline at end of file