mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
Further tidyups + refactoring for new structure
* Add implementation tests for orchestrator + logging tests * Standardise method/class vars for extractors to see if they are suitable * Fix bugs with removing default loguru logger (allows further customisation) * Fix bug loading required fields from file *
This commit is contained in:
@@ -1,8 +1,9 @@
|
||||
""" Entry point for the auto_archiver package. """
|
||||
from auto_archiver.core.orchestrator import ArchivingOrchestrator
|
||||
import sys
|
||||
|
||||
def main():
|
||||
ArchivingOrchestrator().run()
|
||||
ArchivingOrchestrator().run(sys.argv)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
0
src/auto_archiver/core/authentication.py
Normal file
0
src/auto_archiver/core/authentication.py
Normal file
@@ -48,6 +48,10 @@ class DefaultValidatingParser(argparse.ArgumentParser):
|
||||
"""
|
||||
for action in self._actions:
|
||||
if not namespace or action.dest not in namespace:
|
||||
# for actions that are required and already have a default value, remove the 'required' check
|
||||
if action.required and action.default is not None:
|
||||
action.required = False
|
||||
|
||||
if action.default is not None:
|
||||
try:
|
||||
self._check_value(action, action.default)
|
||||
|
||||
@@ -11,9 +11,12 @@ from abc import abstractmethod
|
||||
from dataclasses import dataclass
|
||||
import mimetypes
|
||||
import os
|
||||
import mimetypes, requests
|
||||
import mimetypes
|
||||
|
||||
import requests
|
||||
from loguru import logger
|
||||
from retrying import retry
|
||||
import re
|
||||
|
||||
from ..core import Metadata, ArchivingContext, BaseModule
|
||||
|
||||
@@ -25,6 +28,8 @@ class Extractor(BaseModule):
|
||||
Subclasses must implement the `download` method to define platform-specific behavior.
|
||||
"""
|
||||
|
||||
valid_url: re.Pattern = None
|
||||
|
||||
def cleanup(self) -> None:
|
||||
# called when extractors are done, or upon errors, cleanup any resources
|
||||
pass
|
||||
@@ -32,13 +37,20 @@ class Extractor(BaseModule):
|
||||
def sanitize_url(self, url: str) -> str:
|
||||
# used to clean unnecessary URL parameters OR unfurl redirect links
|
||||
return url
|
||||
|
||||
def match_link(self, url: str) -> re.Match:
|
||||
return self.valid_url.match(url)
|
||||
|
||||
def suitable(self, url: str) -> bool:
|
||||
"""
|
||||
Returns True if this extractor can handle the given URL
|
||||
|
||||
Should be overridden by subclasses
|
||||
|
||||
"""
|
||||
if self.valid_url:
|
||||
return self.match_link(url) is not None
|
||||
|
||||
return True
|
||||
|
||||
def _guess_file_type(self, path: str) -> str:
|
||||
|
||||
@@ -83,6 +83,11 @@ def setup_paths(paths: list[str]) -> None:
|
||||
|
||||
"""
|
||||
for path in paths:
|
||||
# check path exists, if it doesn't, log a warning
|
||||
if not os.path.exists(path):
|
||||
logger.warning(f"Path '{path}' does not exist. Skipping...")
|
||||
continue
|
||||
|
||||
# see odoo/module/module.py -> initialize_sys_path
|
||||
if path not in auto_archiver.modules.__path__:
|
||||
auto_archiver.modules.__path__.append(path)
|
||||
|
||||
@@ -43,6 +43,7 @@ class ArchivingOrchestrator:
|
||||
|
||||
def setup_basic_parser(self):
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="auto-archiver",
|
||||
add_help=False,
|
||||
description="""
|
||||
Auto Archiver is a CLI tool to archive media/metadata from online URLs;
|
||||
@@ -51,15 +52,16 @@ class ArchivingOrchestrator:
|
||||
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
|
||||
formatter_class=RichHelpFormatter,
|
||||
)
|
||||
parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
|
||||
parser.add_argument('--help', '-h', action='store_true', dest='help', help='show this help message and exit')
|
||||
parser.add_argument('--version', action='version', version=__version__)
|
||||
parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
|
||||
parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
|
||||
# override the default 'help' so we can inject all the configs and show those
|
||||
parser.add_argument('-h', '--help', action='store_true', dest='help', help='show this help message and exit')
|
||||
parser.add_argument('-s', '--store', dest='store', default=False, help='Store the created config in the config file', action=argparse.BooleanOptionalAction)
|
||||
parser.add_argument('--module_paths', dest='module_paths', nargs='+', default=[], help='additional paths to search for modules', action=UniqueAppendAction)
|
||||
|
||||
self.basic_parser = parser
|
||||
return parser
|
||||
|
||||
def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
|
||||
parser = DefaultValidatingParser(
|
||||
@@ -78,15 +80,15 @@ class ArchivingOrchestrator:
|
||||
# only load the modules enabled in config
|
||||
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
|
||||
enabled_modules = []
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
|
||||
|
||||
# add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'archivers', 'databases', 'storages', 'formatter'
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
if modules := getattr(basic_config, f"{module_type}s", []):
|
||||
enabled_modules.extend(modules)
|
||||
# first loads the modules from the config file, then from the command line
|
||||
for config in [yaml_config['steps'], basic_config.__dict__]:
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
enabled_modules.extend(config.get(f"{module_type}s", []))
|
||||
|
||||
avail_modules = available_modules(with_manifest=True, limit_to_modules=list(dict.fromkeys(enabled_modules)), suppress_warnings=True)
|
||||
# clear out duplicates, but keep the order
|
||||
enabled_modules = list(dict.fromkeys(enabled_modules))
|
||||
avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True)
|
||||
self.add_module_args(avail_modules, parser)
|
||||
elif basic_config.mode == 'simple':
|
||||
simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
|
||||
@@ -163,6 +165,10 @@ class ArchivingOrchestrator:
|
||||
# make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR]
|
||||
kwargs['metavar'] = name.upper()
|
||||
|
||||
if kwargs.get('required', False):
|
||||
# required args shouldn't have a 'default' value, remove it
|
||||
kwargs.pop('default', None)
|
||||
|
||||
kwargs.pop('cli_set', None)
|
||||
should_store = kwargs.pop('should_store', False)
|
||||
kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
|
||||
@@ -179,13 +185,12 @@ class ArchivingOrchestrator:
|
||||
|
||||
self.add_additional_args(self.basic_parser)
|
||||
self.add_module_args(parser=self.basic_parser)
|
||||
|
||||
self.basic_parser.print_help()
|
||||
exit()
|
||||
self.basic_parser.exit()
|
||||
|
||||
def setup_logging(self):
|
||||
# setup loguru logging
|
||||
logger.remove() # remove the default logger
|
||||
logger.remove(0) # remove the default logger
|
||||
logging_config = self.config['logging']
|
||||
logger.add(sys.stderr, level=logging_config['level'])
|
||||
if log_file := logging_config['file']:
|
||||
@@ -194,14 +199,18 @@ class ArchivingOrchestrator:
|
||||
|
||||
def install_modules(self):
|
||||
"""
|
||||
Swaps out the previous 'strings' in the config with the actual modules
|
||||
Swaps out the previous 'strings' in the config with the actual modules and loads them
|
||||
"""
|
||||
|
||||
invalid_modules = []
|
||||
for module_type in BaseModule.MODULE_TYPES:
|
||||
|
||||
step_items = []
|
||||
modules_to_load = self.config['steps'][f"{module_type}s"]
|
||||
|
||||
assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} \
|
||||
in your configuration file or on the command line (using --{module_type}s)"
|
||||
|
||||
def check_steps_ok():
|
||||
if not len(step_items):
|
||||
logger.error(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
|
||||
@@ -239,30 +248,29 @@ class ArchivingOrchestrator:
|
||||
|
||||
assert len(step_items) > 0, f"No {module_type}s were loaded. Please check your configuration file and try again."
|
||||
self.config['steps'][f"{module_type}s"] = step_items
|
||||
|
||||
def load_config(self, config_file: str) -> dict:
|
||||
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
|
||||
logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
|
||||
exit()
|
||||
|
||||
def run(self) -> None:
|
||||
return read_yaml(config_file)
|
||||
|
||||
def run(self, args: list) -> None:
|
||||
|
||||
self.setup_basic_parser()
|
||||
|
||||
# parse the known arguments for now (basically, we want the config file)
|
||||
basic_config, unused_args = self.basic_parser.parse_known_args(args)
|
||||
|
||||
# load the config file to get the list of enabled items
|
||||
basic_config, unused_args = self.basic_parser.parse_known_args()
|
||||
|
||||
# setup any custom module paths, so they'll show in the help and for arg parsing
|
||||
setup_paths(basic_config.module_paths)
|
||||
|
||||
# if help flag was called, then show the help
|
||||
if basic_config.help:
|
||||
self.show_help(basic_config)
|
||||
|
||||
# load the config file
|
||||
yaml_config = {}
|
||||
|
||||
if not os.path.exists(basic_config.config_file) and basic_config.config_file != DEFAULT_CONFIG_FILE:
|
||||
logger.error(f"The configuration file {basic_config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
|
||||
exit()
|
||||
|
||||
|
||||
yaml_config = read_yaml(basic_config.config_file)
|
||||
yaml_config = self.load_config(basic_config.config_file)
|
||||
self.setup_complete_parser(basic_config, yaml_config, unused_args)
|
||||
|
||||
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
|
||||
|
||||
@@ -28,7 +28,7 @@ class InstagramAPIExtractor(Extractor):
|
||||
# TODO: improvement collect aggregates of locations[0].location and mentions for all posts
|
||||
"""
|
||||
|
||||
global_pattern = re.compile(
|
||||
valid_url = re.compile(
|
||||
r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
|
||||
)
|
||||
|
||||
@@ -44,7 +44,7 @@ class InstagramAPIExtractor(Extractor):
|
||||
url.replace("instagr.com", "instagram.com").replace(
|
||||
"instagr.am", "instagram.com"
|
||||
)
|
||||
insta_matches = self.global_pattern.findall(url)
|
||||
insta_matches = self.valid_url.findall(url)
|
||||
logger.info(f"{insta_matches=}")
|
||||
if not len(insta_matches) or len(insta_matches[0]) != 3:
|
||||
return
|
||||
|
||||
@@ -16,10 +16,13 @@ class InstagramExtractor(Extractor):
|
||||
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
|
||||
"""
|
||||
# NB: post regex should be tested before profile
|
||||
|
||||
valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/")
|
||||
|
||||
# https://regex101.com/r/MGPquX/1
|
||||
post_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(?:p|reel)\/(\w+)")
|
||||
post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url))
|
||||
# https://regex101.com/r/6Wbsxa/1
|
||||
profile_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(\w+)")
|
||||
profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url))
|
||||
# TODO: links to stories
|
||||
|
||||
def setup(self, config: dict) -> None:
|
||||
|
||||
@@ -14,7 +14,7 @@ from auto_archiver.utils import random_str
|
||||
|
||||
|
||||
class TelethonArchiver(Extractor):
|
||||
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
|
||||
valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
|
||||
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
|
||||
|
||||
|
||||
@@ -92,7 +92,7 @@ class TelethonArchiver(Extractor):
|
||||
"""
|
||||
url = item.get_url()
|
||||
# detect URLs that we definitely cannot handle
|
||||
match = self.link_pattern.search(url)
|
||||
match = self.valid_url.search(url)
|
||||
logger.debug(f"TELETHON: {match=}")
|
||||
if not match: return False
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ from auto_archiver.core import Extractor
|
||||
from auto_archiver.core import Metadata,Media
|
||||
|
||||
class TwitterApiExtractor(Extractor):
|
||||
link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
|
||||
valid_url = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
|
||||
|
||||
def setup(self, config: dict) -> None:
|
||||
super().setup(config)
|
||||
@@ -54,7 +54,7 @@ class TwitterApiExtractor(Extractor):
|
||||
|
||||
def get_username_tweet_id(self, url):
|
||||
# detect URLs that we definitely cannot handle
|
||||
matches = self.link_pattern.findall(url)
|
||||
matches = self.valid_url.findall(url)
|
||||
if not len(matches): return False, False
|
||||
|
||||
username, tweet_id = matches[0] # only one URL supported
|
||||
|
||||
@@ -2,8 +2,11 @@ import re
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
class UrlUtil:
|
||||
telegram_private = re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)")
|
||||
is_istagram = re.compile(r"https:\/\/www\.instagram\.com")
|
||||
|
||||
AUTHWALL_URLS = [
|
||||
re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"), # telegram private channels
|
||||
re.compile(r"https:\/\/www\.instagram\.com"), # instagram
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def clean(url: str) -> str: return url
|
||||
@@ -13,8 +16,9 @@ class UrlUtil:
|
||||
"""
|
||||
checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
|
||||
"""
|
||||
if UrlUtil.telegram_private.match(url): return True
|
||||
if UrlUtil.is_istagram.match(url): return True
|
||||
for regex in UrlUtil.AUTHWALL_URLS:
|
||||
if regex.match(url):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
Reference in New Issue
Block a user