diff --git a/docs/source/development/release.md b/docs/source/development/release.md index 6939e97..403dcb9 100644 --- a/docs/source/development/release.md +++ b/docs/source/development/release.md @@ -6,7 +6,7 @@ 1. Update the version number in [version.py](src/auto_archiver/version.py) 2. Go to github releases > new release > use `vx.y.z` for matching version notation 1. package is automatically updated in pypi - 2. docker image is automatically pushed to dockerhup + 2. docker image is automatically pushed to dockerhub diff --git a/docs/source/how_to.md b/docs/source/how_to.md index 25e1e1d..d8fe2e1 100644 --- a/docs/source/how_to.md +++ b/docs/source/how_to.md @@ -1,49 +1,6 @@ # How-To Guides -## How to use Google Sheets to load and store archive information -The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs. -This sheet must have been shared with the Google Service account used by `gspread`. -This sheet must also have specific columns (case-insensitive) in the `header` - see the [Gsheet Feeder Docs](modules/autogen/feeder/gsheet_feeder.md) for more info. 
The default names of these columns and their purpose is: - -Inputs: - -* **Link** *(required)*: the URL of the post to archive -* **Destination folder**: custom folder for archived file (regardless of storage) - -Outputs: -* **Archive status** *(required)*: Status of archive operation -* **Archive location**: URL of archived post -* **Archive date**: Date archived -* **Thumbnail**: Embeds a thumbnail for the post in the spreadsheet -* **Timestamp**: Timestamp of original post -* **Title**: Post title -* **Text**: Post text -* **Screenshot**: Link to screenshot of post -* **Hash**: Hash of archived HTML file (which contains hashes of post media) - for checksums/verification -* **Perceptual Hash**: Perceptual hashes of found images - these can be used for de-duplication of content -* **WACZ**: Link to a WACZ web archive of post -* **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive - -For example, this is a spreadsheet configured with all of the columns for the auto archiver and a few URLs to archive. (Note that the column names are not case sensitive.) - -![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column](../demo-before.png) - -Now the auto archiver can be invoked, with this command in this example: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver:dockerize --config secrets/orchestration-global.yaml --gsheet_feeder.sheet "Auto archive test 2023-2"`. Note that the sheet name has been overridden/specified in the command line invocation. - -When the auto archiver starts running, it updates the "Archive status" column. - -![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column. 
The auto archiver has added "archive in progress" to one of the status columns.](../demo-progress.png) - -The links are downloaded and archived, and the spreadsheet is updated to the following: - -![A screenshot of a Google Spreadsheet with videos archived and metadata added per the description of the columns above.](../demo-after.png) - -Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked. - -The "archive location" link contains the path of the archived file, in local storage, S3, or in Google Drive. - -![The archive result for a link in the demo sheet.](../demo-archive.png) - +The following pages contain helpful how-to guides for common use cases of the Auto-Archiver. --- ```{toctree} @@ -51,4 +8,5 @@ The "archive location" link contains the path of the archived file, in local sto :glob: how_to/* + ``` \ No newline at end of file diff --git a/docs/source/how_to/authentication_how_to.md b/docs/source/how_to/authentication_how_to.md new file mode 100644 index 0000000..ebf0f0c --- /dev/null +++ b/docs/source/how_to/authentication_how_to.md @@ -0,0 +1,6 @@ +# How to login (authenticate) to websites + +This how-to guide shows you how you can add authentication to Auto Archiver for a site you are trying to archive. In this example, we will authenticate on Twitter/X.com using cookies, and on XXXX using username/password. + +```{note} This page is still under construction 🚧 +``` \ No newline at end of file diff --git a/docs/source/how_to/gsheets_setup.md b/docs/source/how_to/gsheets_setup.md new file mode 100644 index 0000000..f013534 --- /dev/null +++ b/docs/source/how_to/gsheets_setup.md @@ -0,0 +1,44 @@ +# Using Google Sheets + +The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs. 
+This sheet must have been shared with the Google Service account used by `gspread`. +This sheet must also have specific columns (case-insensitive) in the `header` - see the [Gsheet Feeder Docs](modules/autogen/feeder/gsheet_feeder.md) for more info. The default names of these columns and their purpose is: + +Inputs: + +* **Link** *(required)*: the URL of the post to archive +* **Destination folder**: custom folder for archived file (regardless of storage) + +Outputs: +* **Archive status** *(required)*: Status of archive operation +* **Archive location**: URL of archived post +* **Archive date**: Date archived +* **Thumbnail**: Embeds a thumbnail for the post in the spreadsheet +* **Timestamp**: Timestamp of original post +* **Title**: Post title +* **Text**: Post text +* **Screenshot**: Link to screenshot of post +* **Hash**: Hash of archived HTML file (which contains hashes of post media) - for checksums/verification +* **Perceptual Hash**: Perceptual hashes of found images - these can be used for de-duplication of content +* **WACZ**: Link to a WACZ web archive of post +* **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive + +For example, this is a spreadsheet configured with all of the columns for the auto archiver and a few URLs to archive. (Note that the column names are not case sensitive.) + +![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column](../demo-before.png) + +Now the auto archiver can be invoked, with this command in this example: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver:dockerize --config secrets/orchestration-global.yaml --gsheet_feeder.sheet "Auto archive test 2023-2"`. Note that the sheet name has been overridden/specified in the command line invocation. + +When the auto archiver starts running, it updates the "Archive status" column. 
+ +![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column. The auto archiver has added "archive in progress" to one of the status columns.](../demo-progress.png) + +The links are downloaded and archived, and the spreadsheet is updated to the following: + +![A screenshot of a Google Spreadsheet with videos archived and metadata added per the description of the columns above.](../demo-after.png) + +Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked. + +The "archive location" link contains the path of the archived file, in local storage, S3, or in Google Drive. + +![The archive result for a link in the demo sheet.](../demo-archive.png) diff --git a/docs/source/how_to/logging.md b/docs/source/how_to/logging.md new file mode 100644 index 0000000..7ae3b18 --- /dev/null +++ b/docs/source/how_to/logging.md @@ -0,0 +1,55 @@ +# Logging + +Auto Archiver's logs can be helpful for debugging problematic archiving processes. This guide shows you how to configure and use the logs. + +## Setting up logging + +Logging settings can be set on the command line or using the orchestration config file ([learn more](../installation/configurations)). A special `logging` section defines the logging options. + +### Logging Level + +There are 7 logging levels in total, with 4 commonly used levels. They are: `DEBUG`, `INFO`, `WARNING` and `ERROR`. + +Change the logging level by setting the value in your orchestration config file: + +```{code} yaml +:caption: orchestration.yaml + +... +logging: + level: DEBUG # or INFO / WARNING / ERROR +... +``` + +For normal usage, it is recommended to use the `INFO` level, or if you prefer quieter logs with less information, you can use the `WARNING` level. 
If you encounter issues with the archiving, then it's recommended to enable the `DEBUG` level. + +```{note} To learn about all logging levels, see the [loguru documentation](https://loguru.readthedocs.io/en/stable/api/logger.html) +``` + +### Logging to a file + +By default, auto-archiver will log to the console. But if you wish to store your logs for future reference, or you are running the auto-archiver from within a code implementation, then you may wish to enable file logging. This can be done by setting the `file:` config value in the logging settings. + +**Rotation:** For file logging, you can choose to 'rotate' your log files (creating new log files) so they do not get too large. Change this by setting the 'rotation' option in your logging settings. For a full list of rotation options, see the [loguru docs](https://loguru.readthedocs.io/en/stable/overview.html#easier-file-logging-with-rotation-retention-compression). + +```{code} yaml +:caption: orchestration.yaml + +logging: + ... 
+ file: /my/log/file.log + rotation: 1 day +``` + +### Full logging example + +The below example logs only `WARNING` logs to the console and to the file `/my/file.log`, rotating that file once per week: + +```{code} yaml +:caption: orchestration.yaml + +logging: + level: WARNING + file: /my/file.log + rotation: 1 week +``` \ No newline at end of file diff --git a/docs/source/index.md b/docs/source/index.md index 6a7f769..53185ee 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -9,7 +9,7 @@ Overview contributing -installation/installation.rst +installation/installation core_modules.md how_to development/developer_guidelines diff --git a/docs/source/how_to/authentication.md b/docs/source/installation/authentication.md similarity index 69% rename from docs/source/how_to/authentication.md rename to docs/source/installation/authentication.md index 5f3bc48..be30425 100644 --- a/docs/source/how_to/authentication.md +++ b/docs/source/installation/authentication.md @@ -4,22 +4,42 @@ The Authentication framework for auto-archiver allows you to add login details f There are two main use cases for authentication: * Some websites require some kind of authentication in order to view the content. Examples include Facebook, Telegram etc. -* Some websites use anti-bot systems to block bot-like tools from accessig the website. Adding real login information to auto-archiver can sometimes bypass this. +* Some websites use anti-bot systems to block bot-like tools from accessing the website. Adding real login information to auto-archiver can sometimes bypass this. ## The Authentication Config -You can save your authentication information directly inside your orchestration config file, or as a separate file (for security/multi-deploy purposes). Whether storing your settings inside the orchestration file, or as a separate file, the configuration format is the same. 
+You can save your authentication information directly inside your orchestration config file, or as a separate file (for security/multi-deploy purposes). Whether storing your settings inside the orchestration file, or as a separate file, the configuration format is the same. Currently, auto-archiver supports the following authentication types: + +**Username & Password:** +- `username`: str - the username to use for login +- `password`: str - the password to use for login + +**API** +- `api_key`: str - the API key to use for login +- `api_secret`: str - the API secret to use for login + +**Cookies** +- `cookie`: str - a cookie string to use for login (specific to this site) +- `cookies_from_browser`: str - load cookies from this browser, for this site only. +- `cookies_file`: str - load cookies from this file, for this site only. + +```{note} + +The Username & Password, and API settings only work with the Generic Extractor. Other modules (like the screenshot enricher) can only use the `cookies` options. Furthermore, many sites can still detect bots and block username/password logins. Twitter/X and YouTube are two prominent ones that block username/password logins. + +One of the 'Cookies' options is recommended for the most robust archiving. +``` ```{code} yaml authentication: # optional file to load authentication information from, for security or multi-system deploy purposes load_from_file: path/to/authentication/file.txt - # optional setting to load cookies from the named browser on the system. + # optional setting to load cookies from the named browser on the system, for **ALL** websites cookies_from_browser: firefox - # optional setting to load cookies from a cookies.txt/cookies.jar file. See note below on extracting these + # optional setting to load cookies from a cookies.txt/cookies.jar file, for **ALL** websites. 
See note below on extracting these cookies_file: path/to/cookies.jar - twitter.com,x.com: + mysite.com: username: myusername password: 123 @@ -29,15 +49,10 @@ authentication: othersite.com: api_key: 123 api_secret: 1234 - -# All available options: - # - username: str - the username to use for login - # - password: str - the password to use for login - # - api_key: str - the API key to use for login - # - api_secret: str - the API secret to use for login - # - cookie: str - a cookie string to use for login (specific to this site) + ``` + ### Recommendations for authentication 1. **Store authentication information separately:** diff --git a/docs/source/installation/configurations.md b/docs/source/installation/configurations.md index 705b6c5..3e9cd08 100644 --- a/docs/source/installation/configurations.md +++ b/docs/source/installation/configurations.md @@ -23,7 +23,7 @@ A default `orchestration.yaml` will be created for you the first time you run au ## Configuring from the Command Line -You can run auto-archiver directy from the command line, without the need for a configuration file, command line arguments are parsed using the format `module_name.config_value`. For example, a config value of `api_key` in the `instagram_extractor` module would be passed on the command line with the flag `--instagram_extractor.api_key=API_KEY`. +You can run auto-archiver directly from the command line, without the need for a configuration file, command line arguments are parsed using the format `module_name.config_value`. For example, a config value of `api_key` in the `instagram_extractor` module would be passed on the command line with the flag `--instagram_extractor.api_key=API_KEY`. The command line arguments are useful for testing or editing config values and enabling/disabling modules on the fly. When you are happy with your settings, you can store them back in your configuration file by passing the `-s/--store` flag on the command line. 
diff --git a/docs/source/installation/installation.md b/docs/source/installation/installation.md index fdd3184..6f2e9d4 100644 --- a/docs/source/installation/installation.md +++ b/docs/source/installation/installation.md @@ -5,6 +5,7 @@ :hidden: configurations.md +authentication.md config_cheatsheet.md ``` diff --git a/docs/source/modules/extractor.md b/docs/source/modules/extractor.md index 7f218fb..2fb0fef 100644 --- a/docs/source/modules/extractor.md +++ b/docs/source/modules/extractor.md @@ -4,7 +4,7 @@ Extractor modules are used to extract the content of a given URL. Typically, one Extractors that are able to extract content from a wide range of websites include: 1. Generic Extractor: parses videos and images on sites using the powerful yt-dlp library. -2. Wayback Machine Extractor: sends pages to the Waygback machine for archiving, and stores the link. +2. Wayback Machine Extractor: sends pages to the Wayback machine for archiving, and stores the link. 3. WACZ Extractor: runs a web browser to 'browse' the URL and save a copy of the page in WACZ format. ```{include} autogen/extractor.md diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index dfdd5ad..1586585 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -63,12 +63,18 @@ class BaseModule(ABC): def config_setup(self, config: dict): authentication = config.get('authentication', {}) - # this is important. 
Each instance is given its own deepcopied config, so modules cannot # change values to affect other modules config = deepcopy(config) authentication = deepcopy(config.pop('authentication', {})) + # extract out concatenated sites + for key, val in copy(authentication).items(): + if "," in key: + for site in key.split(","): + authentication[site] = val + del authentication[key] + self.authentication = authentication self.config = config for key, val in config.get(self.name, {}).items(): @@ -102,7 +108,7 @@ class BaseModule(ABC): # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com) # for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code? - site = UrlUtil.domain_for_url(site) + site = UrlUtil.domain_for_url(site).lstrip("www.") # add the 'www' version of the site to the list of sites to check authdict = {} @@ -128,17 +134,30 @@ class BaseModule(ABC): # collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts')) ytdlp_opts = getattr(parse_options(args), 'ydl_opts') return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar + + get_cookiejar_options = None - # get the cookies jar, prefer the browser cookies than the file - if 'cookies_from_browser' in self.authentication: + # order of priority: + # 1. cookies_from_browser setting in site config + # 2. cookies_file setting in site config + # 3. cookies_from_browser setting in global config + # 4. 
cookies_file setting in global config + + if 'cookies_from_browser' in authdict: + get_cookiejar_options = ['--cookies-from-browser', authdict['cookies_from_browser']] + elif 'cookies_file' in authdict: + get_cookiejar_options = ['--cookies', authdict['cookies_file']] + elif 'cookies_from_browser' in self.authentication: authdict['cookies_from_browser'] = self.authentication['cookies_from_browser'] - if extract_cookies: - authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies-from-browser', self.authentication['cookies_from_browser']]) + get_cookiejar_options = ['--cookies-from-browser', self.authentication['cookies_from_browser']] elif 'cookies_file' in self.authentication: authdict['cookies_file'] = self.authentication['cookies_file'] - if extract_cookies: - authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies', self.authentication['cookies_file']]) + get_cookiejar_options = ['--cookies', self.authentication['cookies_file']] + + if get_cookiejar_options: + authdict['cookies_jar'] = get_ytdlp_cookiejar(get_cookiejar_options) + return authdict def repr(self): diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 322ef6e..d335959 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -10,7 +10,7 @@ from ruamel.yaml import YAML, CommentedMap, add_representer from loguru import logger -from copy import deepcopy +from copy import deepcopy, copy from .module import BaseModule from typing import Any, List, Type, Tuple @@ -154,7 +154,7 @@ def read_yaml(yaml_filename: str) -> CommentedMap: if not config: config = EMPTY_CONFIG - + return config # TODO: make this tidier/find a way to notify of which keys should not be stored diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 208512a..aa405dd 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -8,7 +8,6 @@ from __future__ import annotations from typing import 
Generator, Union, List, Type from urllib.parse import urlparse from ipaddress import ip_address -from copy import copy import argparse import os import sys @@ -75,13 +74,6 @@ class AuthenticationJsonParseAction(JsonParseAction): continue if not isinstance(key, str) or not isinstance(auth, dict): raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}") - - # extract out concatenated sites - for key, val in copy(auth_dict).items(): - if "," in key: - for site in key.split(","): - auth_dict[site] = val - del auth_dict[key] setattr(namespace, self.dest, auth_dict) diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 6bcb249..7a627ac 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -280,6 +280,7 @@ class GenericExtractor(Extractor): # set up auth auth = self.auth_for_site(url, extract_cookies=False) + # order of importance: username/pasword -> api_key -> cookie -> cookie_from_browser -> cookies_file if auth: if 'username' in auth and 'password' in auth: @@ -290,11 +291,11 @@ class GenericExtractor(Extractor): logger.debug(f'Using provided auth cookie for {url}') yt_dlp.utils.std_headers['cookie'] = auth['cookie'] elif 'cookie_from_browser' in auth: - logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}') + logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}') ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser'] elif 'cookies_file' in auth: - logger.debug(f'Using cookies from file {self.cookie_file} for {url}') - ydl_options['cookiesfile'] = auth['cookies_file'] + logger.debug(f'Using cookies from file {auth["cookies_file"]} for {url}') + ydl_options['cookiefile'] = 
auth['cookies_file'] ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en" diff --git a/tests/extractors/test_generic_extractor.py b/tests/extractors/test_generic_extractor.py index 54f4d9c..33f35b7 100644 --- a/tests/extractors/test_generic_extractor.py +++ b/tests/extractors/test_generic_extractor.py @@ -68,7 +68,7 @@ class TestGenericExtractor(TestExtractorBase): "twitter.com/bellingcat/status/123", "https://www.youtube.com/watch?v=1" ]) - def test_download_nonexistend_media(self, make_item, url): + def test_download_nonexistent_media(self, make_item, url): """ Test to make sure that the extractor doesn't break on non-existend posts/media