diff --git a/docs/source/conf.py b/docs/source/conf.py index 54988ed..a749dfd 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -21,13 +21,12 @@ language = 'en' # -- General configuration --------------------------------------------------- extensions = [ + "myst_parser", # Markdown support "autoapi.extension", # Generate API documentation from docstrings "sphinxcontrib.mermaid", # Mermaid diagrams - "myst_parser", # Markdown support "sphinx.ext.viewcode", # Source code links "sphinx.ext.napoleon", # Google-style and NumPy-style docstrings "sphinx.ext.autosectionlabel", - # "sphinx.ext.autodoc", # Include custom docstrings # 'sphinx.ext.autosummary', # Summarize module/class/function docs ] @@ -55,7 +54,6 @@ autoapi_options = [ # -- Markdown Support -------------------------------------------------------- myst_enable_extensions = [ - "colon_fence", # ::: fences "deflist", # Definition lists "html_admonition", # HTML-style admonitions "html_image", # Inline HTML images @@ -63,7 +61,6 @@ myst_enable_extensions = [ "smartquotes", # Smart quotes "linkify", # Auto-detect links "substitution", # Text substitutions - "attrs_block", ] myst_heading_anchors = 2 myst_fence_as_directive = ["mermaid"] diff --git a/docs/source/development/creating_modules.md b/docs/source/development/creating_modules.md new file mode 100644 index 0000000..7ee65ef --- /dev/null +++ b/docs/source/development/creating_modules.md @@ -0,0 +1,59 @@ +# Creating Your Own Modules + +Modules are what's used to extend `auto-archiver` to process different websites or media, and/or transform the data in a way that suits your needs. In most cases, the [](../core_modules.md) should be sufficient for every day use, but the most common use-cases for making your own Modules include: + +1. Extracting data from a website which doesn't work with the current core extractors. +2. Enriching or altering the data before saving with additional information that the core enrichers do not offer. +3. 
Storing your data in a different format/location from what the core storage providers offer. + +## Setting up the folder structure + +1. First, decide what type of module you wish to create. Check the types of modules on the [](../core_modules.md) page to decide what type you need. (Note: a module can be more than one type, more on that below) +2. Create a new python package (a folder) with the name of your module (in this tutorial, we'll call it `awesome_extractor`). +3. Create the `__manifest__.py` and the `awesome_extractor.py` files in this folder. + +When done, you should have a module structure as follows: + +``` +. +├── awesome_extractor +│ ├── __manifest__.py +│ └── awesome_extractor.py +``` + +Check out the [core modules](https://github.com/bellingcat/auto-archiver/tree/main/src/auto_archiver/modules) in the `auto-archiver` repository for examples of the folder structure for real-world modules. + +## Populating the Manifest File + +The manifest file is where you define the core information of your module. It is a python dict containing important information; here's an example file: + +```{code} python +:filename: myfile.py + +def setup(): + pass +``` + +```{include} ../../../tests/data/test_modules/example_module/__manifest__.py +:name: __manifest__.py +:literal: +:parser: python +``` + +## Creating the Python Code + +The next step is to create your module code. First, create a class which should subclass the base module types from `auto_archiver.core`. Here's an example class for the `awesome_extractor` module which is an `extractor`: + +```{code-block} python +:filename: awesome_extractor.py + +from auto_archiver.core import Extractor, Metadata + +class AwesomeExtractor(Extractor): + + def download(self, item: Metadata) -> Metadata | False: + url = item.get_url() + # download the content and create the metadata object + metadata = ... 
+ return metadata +``` diff --git a/docs/source/development/developer_guidelines.md b/docs/source/development/developer_guidelines.md index 26cf6f4..e72193a 100644 --- a/docs/source/development/developer_guidelines.md +++ b/docs/source/development/developer_guidelines.md @@ -26,7 +26,7 @@ Install development packages (used for unit tests etc.) using: ```{toctree} :hidden: - +creating_modules docker_development testing docs diff --git a/docs/source/example.orchestration.yaml b/docs/source/example.orchestration.yaml new file mode 100644 index 0000000..48d354d --- /dev/null +++ b/docs/source/example.orchestration.yaml @@ -0,0 +1,79 @@ +# Auto Archiver Configuration +# Steps are the modules that will be run in the order they are defined + +steps: + feeders: + - cli_feeder + extractors: + - generic_extractor + - telegram_extractor + enrichers: + - thumbnail_enricher + - meta_enricher + - pdq_hash_enricher + - ssl_enricher + - hash_enricher + databases: + - console_db + - csv_db + storages: + - local_storage + formatters: + - html_formatter + +# Global configuration + +# Authentication +# a dictionary of authentication information that can be used by extractors to login to website. +# you can use a comma separated list for multiple domains on the same line (common usecase: x.com,twitter.com) +# Common login 'types' are username/password, cookie, api key/token. +# There are two special keys for using cookies, they are: cookies_file and cookies_from_browser. +# Some Examples: +# facebook.com: +# username: "my_username" +# password: "my_password" +# or for a site that uses an API key: +# twitter.com,x.com: +# api_key +# api_secret +# youtube.com: +# cookie: "login_cookie=value ; other_cookie=123" # multiple 'key=value' pairs should be separated by ; + +authentication: {} + +# Logging settings for your project. 
See the logging settings with --help + +logging: + level: INFO + +# These are the global configurations that are used by the modules + + file: + rotation: +local_storage: + path_generator: flat + filename_generator: static + save_to: ./local_archive + save_absolute: false +html_formatter: + detect_thumbnails: true +thumbnail_enricher: + thumbnails_per_minute: 60 + max_thumbnails: 16 +generic_extractor: + subtitles: true + comments: false + livestreams: false + live_from_start: false + proxy: '' + end_means_success: true + allow_playlist: false + max_downloads: inf +csv_db: + csv_file: db.csv +ssl_enricher: + skip_when_nothing_archived: true +hash_enricher: + algorithm: SHA-256 + chunksize: 16000000 + diff --git a/docs/source/installation/configurations.md b/docs/source/installation/configurations.md new file mode 100644 index 0000000..ef82e2a --- /dev/null +++ b/docs/source/installation/configurations.md @@ -0,0 +1,100 @@ + +# Configuring + + +```{toctree} +:hidden: + +configurations +``` + +This section of the documentation provides guidelines for configuring the tool. + +## Configuring from the Command Line + +You can run auto-archiver directly from the command line, without the need for a configuration file. Command line arguments are parsed using the format `module_name.config_value`. For example, a config value of `api_key` in the `instagram_extractor` module would be passed on the command line with the flag `--instagram_extractor.api_key=API_KEY`. + +The command line arguments are useful for testing or editing config values and enabling/disabling modules on the fly. When you are happy with your settings, you can store them back in your configuration file by passing the `-s/--store` flag on the command line. 
+ +```bash +auto-archiver --instagram_extractor.api_key=123 --other_module.setting --store +# will store the new settings into the configuration file (default: orchestration.yaml) +``` + +## Configuring using a file + +The recommended way to configure auto-archiver for long-term and deployed projects is a configuration file, typically called `orchestration.yaml`. This is a YAML file containing all the settings for your entire workflow. + +A default `orchestration.yaml` will be created for you the first time you run auto-archiver (without any arguments). Here's what it looks like: + +
+View example.orchestration.yaml + +```{literalinclude} ../example.orchestration.yaml + :language: yaml + :caption: example.orchestration.yaml +``` + +
+ +## Core Module Configuration + +View the configurable settings for the core modules on the individual doc pages for each [](../core_modules.md). +You can also view all settings available for the modules you have on your system using the `--help` flag in auto-archiver. + +```{code-block} console +:caption: Example output when using the --help flag with auto-archiver +$ auto-archiver --help +... +Positional Arguments: + urls URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml + +Options: + --help, -h show a full help message and exit + --version show program's version number and exit + --config CONFIG_FILE the filename of the YAML configuration file (defaults to 'config.yaml') + --mode {simple,full} the mode to run the archiver in + -s, --store, --no-store + Store the created config in the config file + --module_paths MODULE_PATHS [MODULE_PATHS ...] + additional paths to search for modules + --feeders STEPS.FEEDERS [STEPS.FEEDERS ...] + the feeders to use + --enrichers STEPS.ENRICHERS [STEPS.ENRICHERS ...] + the enrichers to use + --extractors STEPS.EXTRACTORS [STEPS.EXTRACTORS ...] + the extractors to use + --databases STEPS.DATABASES [STEPS.DATABASES ...] + the databases to use + --storages STEPS.STORAGES [STEPS.STORAGES ...] + the storages to use + --formatters STEPS.FORMATTERS [STEPS.FORMATTERS ...] + the formatter to use + --authentication AUTHENTICATION + A dictionary of sites and their authentication methods (token, username etc.) that extractors can use to log into a website. If passing this on the command line, use a JSON string. You may + also pass a path to a valid JSON/YAML file which will be parsed. 
+ --logging.level {INFO,DEBUG,ERROR,WARNING} + the logging level to use + --logging.file LOGGING.FILE + the logging file to write to + --logging.rotation LOGGING.ROTATION + the logging rotation to use + +Wayback Machine Enricher: + Submits the current URL to the Wayback Machine for archiving and returns either a job ID or the... + + --wayback_extractor_enricher.timeout TIMEOUT + seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually. + --wayback_extractor_enricher.if_not_archived_within IF_NOT_ARCHIVED_WITHIN + only tell wayback to archive if no archive is available before the number of seconds specified, use None to ignore this option. For more information: + https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA + --wayback_extractor_enricher.key KEY + wayback API key. to get credentials visit https://archive.org/account/s3.php + --wayback_extractor_enricher.secret SECRET + wayback API secret. to get credentials visit https://archive.org/account/s3.php + --wayback_extractor_enricher.proxy_http PROXY_HTTP + http proxy to use for wayback requests, eg http://proxy-user:password@proxy-ip:port + --wayback_extractor_enricher.proxy_https PROXY_HTTPS + https proxy to use for wayback requests, eg https://proxy-user:password@proxy-ip:port +``` + diff --git a/docs/source/installation/configurations.rst b/docs/source/installation/configurations.rst deleted file mode 100644 index e00dd94..0000000 --- a/docs/source/installation/configurations.rst +++ /dev/null @@ -1,41 +0,0 @@ - -Configurations -============== - - -```{toctree} -:hidden: - -configurations -``` - -This section of the documentation provides guidelines for configuring the tool. - -File Reference --------------- - - -Below is the content of the `example.orchestration.yaml` file: - -.. raw:: html - -
- View example.orchestration.yaml - -.. literalinclude:: ../../example.orchestration.yaml - :language: yaml - :caption: example.orchestration.yaml - -.. raw:: html - -
- - -Configs -------- - -This section of the documentation will show the custom configurations for the individual steps of the tool. - -.. include:: _auto/configs.rst - - diff --git a/docs/source/installation/installation.md b/docs/source/installation/installation.md index 99c6bf6..3744d0a 100644 --- a/docs/source/installation/installation.md +++ b/docs/source/installation/installation.md @@ -1,5 +1,10 @@ # Installing Auto Archiver +```{toctree} +:depth: 1 + +configurations.md +``` There are 3 main ways to use the auto-archiver: 1. Easiest: [via docker](#installing-with-docker) diff --git a/example.orchestration.yaml b/example.orchestration.yaml deleted file mode 100644 index f1eed2a..0000000 --- a/example.orchestration.yaml +++ /dev/null @@ -1,156 +0,0 @@ -steps: - # only 1 feeder allowed - feeder: gsheet_feeder # defaults to cli_feeder - archivers: # order matters, uncomment to activate - - bluesky_archiver - # - vk_archiver - # - telethon_archiver - # - telegram_archiver - # - twitter_archiver - # - twitter_api_archiver - # - instagram_api_archiver - # - instagram_tbot_archiver - # - instagram_archiver - # - tiktok_archiver - - youtubedl_archiver - # - wayback_archiver_enricher - # - wacz_archiver_enricher - enrichers: - - hash_enricher - # - meta_enricher - # - metadata_enricher - # - screenshot_enricher - # - pdq_hash_enricher - # - ssl_enricher - # - timestamping_enricher - # - whisper_enricher - # - thumbnail_enricher - # - wayback_archiver_enricher - # - wacz_archiver_enricher - # - pdq_hash_enricher # if you want to calculate hashes for thumbnails, include this after thumbnail_enricher - formatter: html_formatter # defaults to mute_formatter - storages: - - local_storage - # - s3_storage - # - gdrive_storage - databases: - - console_db - # - csv_db - # - gsheet_db - # - mongo_db - -configurations: - gsheet_feeder: - sheet: "your sheet name" - header: 1 - service_account: "secrets/service_account.json" - # allow_worksheets: "only parse this worksheet" - # 
block_worksheets: "blocked sheet 1,blocked sheet 2" - use_sheet_names_in_stored_paths: false - columns: - url: link - status: archive status - folder: destination folder - archive: archive location - date: archive date - thumbnail: thumbnail - timestamp: upload timestamp - title: upload title - text: textual content - screenshot: screenshot - hash: hash - pdq_hash: perceptual hashes - wacz: wacz - replaywebpage: replaywebpage - instagram_tbot_archiver: - api_id: "TELEGRAM_BOT_API_ID" - api_hash: "TELEGRAM_BOT_API_HASH" - # session_file: "secrets/anon" - telethon_archiver: - api_id: "TELEGRAM_BOT_API_ID" - api_hash: "TELEGRAM_BOT_API_HASH" - # session_file: "secrets/anon" - join_channels: false - channel_invites: # if you want to archive from private channels - - invite: https://t.me/+123456789 - id: 0000000001 - - invite: https://t.me/+123456788 - id: 0000000002 - - twitter_api_archiver: - # either bearer_token only - bearer_token: "TWITTER_BEARER_TOKEN" - # OR all of the below - # consumer_key: "" - # consumer_secret: "" - # access_token: "" - # access_secret: "" - instagram_archiver: - username: "INSTAGRAM_USERNAME" - password: "INSTAGRAM_PASSWORD" - # session_file: "secrets/instaloader.session" - - vk_archiver: - username: "or phone number" - password: "vk pass" - session_file: "secrets/vk_config.v2.json" - - youtubedl_archiver: - subtitles: true - # use one of the following two methods to authenticate in youtube - either provide a cookies file or use the cookies of the given browser - # for more information, see https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp - # cookie_file: "secrets/youtube_cookies.txt" - # cookies_from_browser: firefox - # proxy: socks5://proxy-user:password@proxy-ip:port - - screenshot_enricher: - width: 1280 - height: 2300 - # to save as pdf, uncomment the following lines and adjust the print options - # save_to_pdf: true - # print_options: - # for all options see 
https://www.selenium.dev/selenium/docs/api/py/webdriver/selenium.webdriver.common.print_page_options.html - # background: true - # orientation: "portrait" - # scale: 1 - # page_width: 8.5in - # page_height: 11in - # margin_top: 0.4in - # margin_bottom: 0.4in - # margin_left: 0.4in - # margin_right: 0.4in - # page_ranges: "" - # shrink_to_fit: true - - wayback_archiver_enricher: - timeout: 10 - key: "wayback key" - secret: "wayback secret" - hash_enricher: - algorithm: "SHA3-512" # can also be SHA-256 - wacz_archiver_enricher: - profile: secrets/profile.tar.gz - local_storage: - save_to: "./local_archive" - save_absolute: true - filename_generator: static - path_generator: flat - s3_storage: - bucket: your-bucket-name - region: reg1 - key: S3_KEY - secret: S3_SECRET - endpoint_url: "https://{region}.digitaloceanspaces.com" - cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}" - # if private:true S3 urls will not be readable online - private: false - # with 'random' you can generate a random UUID for the URL instead of a predictable path, useful to still have public but unlisted files, alternative is 'default' or not omitted from config - key_path: random - gdrive_storage: - path_generator: url - filename_generator: random - root_folder_id: folder_id_from_url - oauth_token: secrets/gd-token.json # needs to be generated with scripts/create_update_gdrive_oauth_token.py - service_account: "secrets/service_account.json" - csv_db: - csv_file: "./local_archive/db.csv" diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 9bb080f..c2d38ee 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -48,6 +48,7 @@ authentication: {} logging: level: INFO + """) # note: 'logging' is explicitly added above in order to better format the config file diff --git a/tests/data/test_modules/example_module/__manifest__.py b/tests/data/test_modules/example_module/__manifest__.py index f2ebdbf..e3a26bb 100644 
--- a/tests/data/test_modules/example_module/__manifest__.py +++ b/tests/data/test_modules/example_module/__manifest__.py @@ -1,11 +1,29 @@ { + # Display Name of your module "name": "Example Module", + # The author of your module (optional) + "author": "John Doe", + # Optional version number, for your own versioning purposes + "version": 2.0, + # The type of the module, must be one (or more) of the built-in module types "type": ["extractor", "feeder", "formatter", "storage", "enricher", "database"], + # a boolean indicating whether or not a module requires additional user setup before it can be used + # for example: adding API keys, installing additional software etc. "requires_setup": False, - "dependencies": {"python": ["loguru"] - }, + # a dictionary of dependencies for this module, that must be installed before the module is loaded. + # Can be python dependencies (external packages, or other auto-archiver modules), or you can + # provide external bin dependencies (e.g. ffmpeg, docker etc.) + "dependencies": { + "python": ["loguru"], + "bin": ["bash"], + }, + # configurations that this module takes. These are argparse-compliant dictionaries, that are + # used to create command line arguments when the programme is run. + # The full name of the config option will become: `module_name.config_name` "configs": { "csv_file": {"default": "db.csv", "help": "CSV file name"}, "required_field": {"required": True, "help": "required field in the CSV file"}, }, + # A description of the module, used for documentation + "description": "This is an example module", } \ No newline at end of file