mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 20:58:29 +03:00
Merge branch 'main' into small_issues
# Conflicts: # src/auto_archiver/core/base_module.py # src/auto_archiver/utils/misc.py
This commit is contained in:
@@ -84,11 +84,13 @@ class BaseModule(ABC):
|
||||
* api_key: str - the API key to use for login\n
|
||||
* api_secret: str - the API secret to use for login\n
|
||||
* cookie: str - a cookie string to use for login (specific to this site)\n
|
||||
* cookies_file: str - the path to a cookies file to use for login (specific to this site)\n
|
||||
* cookies_from_browser: str - the name of the browser to extract cookies from (specific to this site)\n
|
||||
"""
|
||||
# TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
|
||||
# for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
|
||||
|
||||
site = UrlUtil.domain_for_url(site)
|
||||
site = UrlUtil.domain_for_url(site).lstrip("www.")
|
||||
# add the 'www' version of the site to the list of sites to check
|
||||
authdict = {}
|
||||
|
||||
@@ -115,16 +117,29 @@ class BaseModule(ABC):
|
||||
ytdlp_opts = getattr(parse_options(args), 'ydl_opts')
|
||||
return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar
|
||||
|
||||
# get the cookies jar, prefer the browser cookies than the file
|
||||
if 'cookies_from_browser' in self.authentication:
|
||||
get_cookiejar_options = None
|
||||
|
||||
# order of priority:
|
||||
# 1. cookies_from_browser setting in site config
|
||||
# 2. cookies_file setting in site config
|
||||
# 3. cookies_from_browser setting in global config
|
||||
# 4. cookies_file setting in global config
|
||||
|
||||
if 'cookies_from_browser' in authdict:
|
||||
get_cookiejar_options = ['--cookies-from-browser', authdict['cookies_from_browser']]
|
||||
elif 'cookies_file' in authdict:
|
||||
get_cookiejar_options = ['--cookies', authdict['cookies_file']]
|
||||
elif 'cookies_from_browser' in self.authentication:
|
||||
authdict['cookies_from_browser'] = self.authentication['cookies_from_browser']
|
||||
if extract_cookies:
|
||||
authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies-from-browser', self.authentication['cookies_from_browser']])
|
||||
get_cookiejar_options = ['--cookies-from-browser', self.authentication['cookies_from_browser']]
|
||||
elif 'cookies_file' in self.authentication:
|
||||
authdict['cookies_file'] = self.authentication['cookies_file']
|
||||
if extract_cookies:
|
||||
authdict['cookies_jar'] = get_ytdlp_cookiejar(['--cookies', self.authentication['cookies_file']])
|
||||
get_cookiejar_options = ['--cookies', self.authentication['cookies_file']]
|
||||
|
||||
|
||||
if get_cookiejar_options:
|
||||
authdict['cookies_jar'] = get_ytdlp_cookiejar(get_cookiejar_options)
|
||||
|
||||
return authdict
|
||||
|
||||
def repr(self):
|
||||
|
||||
@@ -7,6 +7,7 @@ flexible setup in various environments.
|
||||
|
||||
import argparse
|
||||
from ruamel.yaml import YAML, CommentedMap, add_representer
|
||||
import json
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -17,10 +18,12 @@ from typing import Any, List, Type, Tuple
|
||||
|
||||
_yaml: YAML = YAML()
|
||||
|
||||
DEFAULT_CONFIG_FILE = "secrets/orchestration.yaml"
|
||||
|
||||
EMPTY_CONFIG = _yaml.load("""
|
||||
# Auto Archiver Configuration
|
||||
# Steps are the modules that will be run in the order they are defined
|
||||
|
||||
# Steps are the modules that will be run in the order they are defined
|
||||
steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \
|
||||
"""
|
||||
|
||||
@@ -52,6 +55,57 @@ logging:
|
||||
""")
|
||||
# note: 'logging' is explicitly added above in order to better format the config file
|
||||
|
||||
|
||||
# Arg Parse Actions/Classes
|
||||
class AuthenticationJsonParseAction(argparse.Action):
    """
    argparse action that parses and validates the authentication setting.

    The command line value must be a JSON document. It can be:

    * a dictionary mapping site names to dictionaries of authentication
      settings (optionally mixed with the global options listed below),
    * a dictionary with a 'from_file' key pointing at a JSON/YAML file that
      holds the authentication dictionary, or
    * a JSON string holding the path of such a file.

    The validated dictionary is stored on the namespace under `self.dest`.
    """

    # top-level keys that are settings themselves rather than site names,
    # so their values do not have to be dictionaries
    GLOBAL_OPTIONS = ['cookies_from_browser', 'cookies_file', 'load_from_file']

    @staticmethod
    def _load_from_file(path):
        """
        Load an authentication dictionary from a JSON file, falling back to YAML.

        If the loaded document has a truthy 'authentication' key, that
        sub-dictionary is used. The source path is recorded in the result
        under 'load_from_file'.

        Returns None when the file cannot be read, cannot be parsed or does
        not contain a dictionary; the caller turns that into a user-facing
        error. Only `Exception` is caught, so Ctrl-C and SystemExit still
        propagate (the previous bare `except:` swallowed them).
        """
        try:
            with open(path, 'r', encoding="utf-8") as f:
                try:
                    auth_dict = json.load(f)
                except json.JSONDecodeError:
                    f.seek(0)
                    # maybe it's yaml, try that
                    auth_dict = _yaml.load(f)
            if auth_dict.get('authentication'):
                auth_dict = auth_dict['authentication']
            auth_dict['load_from_file'] = path
            return auth_dict
        except Exception:
            return None

    def __call__(self, parser, namespace, values, option_string=None):
        """
        Parse `values`, resolve any file reference, validate and store the result.

        Raises:
            argparse.ArgumentTypeError: if `values` is not valid JSON, or the
                resulting value is not a dictionary of site names mapped to
                dictionaries of authentication settings.
        """
        try:
            auth_dict = json.loads(values)
        except json.JSONDecodeError as e:
            raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}") from e

        if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
            auth_dict = self._load_from_file(auth_dict['from_file'])
        elif isinstance(auth_dict, str):
            # a plain string is treated as the path of an authentication file
            auth_dict = self._load_from_file(auth_dict)

        if not isinstance(auth_dict, dict):
            raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")

        global_options = self.GLOBAL_OPTIONS
        for key, auth in auth_dict.items():
            if key in global_options:
                continue
            if not isinstance(key, str) or not isinstance(auth, dict):
                raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")

        # only store the value once it has been fully validated
        setattr(namespace, self.dest, auth_dict)
|
||||
|
||||
|
||||
class UniqueAppendAction(argparse.Action):
    """
    argparse action that appends values to a list, skipping duplicates.

    Values already present in the destination list are ignored, so repeating
    an option (or repeating a value that is already in the default) never
    produces duplicate entries. An existing destination list is extended in
    place.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        # without `nargs` argparse passes a single string; treat it as one
        # value rather than iterating over its characters
        if isinstance(values, str):
            values = [values]

        current = getattr(namespace, self.dest, None)
        if current is None:
            # no default list was configured for this destination
            current = []
            setattr(namespace, self.dest, current)

        for value in values:
            if value not in current:
                current.append(value)
|
||||
|
||||
class DefaultValidatingParser(argparse.ArgumentParser):
|
||||
|
||||
def error(self, message):
|
||||
@@ -82,6 +136,7 @@ class DefaultValidatingParser(argparse.ArgumentParser):
|
||||
|
||||
return super().parse_known_args(args, namespace)
|
||||
|
||||
# Config Utils
|
||||
|
||||
def to_dot_notation(yaml_conf: CommentedMap | dict) -> dict:
|
||||
dotdict = {}
|
||||
@@ -153,8 +208,8 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
|
||||
pass
|
||||
|
||||
if not config:
|
||||
config = EMPTY_CONFIG
|
||||
|
||||
config = deepcopy(EMPTY_CONFIG)
|
||||
|
||||
return config
|
||||
|
||||
# TODO: make this tidier/find a way to notify of which keys should not be stored
|
||||
@@ -170,4 +225,7 @@ def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
|
||||
|
||||
config_to_save.pop('urls', None)
|
||||
with open(yaml_filename, "w", encoding="utf-8") as outf:
|
||||
_yaml.dump(config_to_save, outf)
|
||||
_yaml.dump(config_to_save, outf)
|
||||
|
||||
def is_valid_config(config: CommentedMap) -> bool:
    """
    Report whether `config` holds a real, user-provided configuration.

    A config is considered valid when it is non-empty and differs from
    EMPTY_CONFIG. Always returns a real bool: the previous `config and ...`
    form returned the falsy operand itself (e.g. None or {}).
    """
    return bool(config) and config != EMPTY_CONFIG
|
||||
@@ -13,7 +13,7 @@ from abc import abstractmethod
|
||||
from auto_archiver.core import Metadata, BaseModule
|
||||
|
||||
class Enricher(BaseModule):
|
||||
"""Base classes and utilities for enrichers in the Auto-Archiver system.
|
||||
"""Base classes and utilities for enrichers in the Auto Archiver system.
|
||||
|
||||
Enricher modules must implement the `enrich` method to define their behavior.
|
||||
"""
|
||||
|
||||
@@ -134,7 +134,6 @@ class LazyBaseModule:
|
||||
|
||||
"""
|
||||
name: str
|
||||
type: list
|
||||
description: str
|
||||
path: str
|
||||
module_factory: ModuleFactory
|
||||
@@ -148,6 +147,10 @@ class LazyBaseModule:
|
||||
self.path = path
|
||||
self.module_factory = factory
|
||||
|
||||
@property
|
||||
def type(self):
|
||||
return self.manifest['type']
|
||||
|
||||
@property
|
||||
def entry_point(self):
|
||||
if not self._entry_point and not self.manifest['entry_point']:
|
||||
@@ -183,10 +186,9 @@ class LazyBaseModule:
|
||||
try:
|
||||
manifest.update(ast.literal_eval(f.read()))
|
||||
except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
|
||||
logger.error(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}")
|
||||
raise ValueError(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}")
|
||||
|
||||
self._manifest = manifest
|
||||
self.type = manifest['type']
|
||||
self._entry_point = manifest['entry_point']
|
||||
self.description = manifest['description']
|
||||
self.version = manifest['version']
|
||||
@@ -254,7 +256,7 @@ class LazyBaseModule:
|
||||
instance.module_factory = self.module_factory
|
||||
|
||||
# merge the default config with the user config
|
||||
default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
|
||||
default_config = dict((k, v['default']) for k, v in self.configs.items() if 'default' in v)
|
||||
|
||||
config[self.name] = default_config | config.get(self.name, {})
|
||||
instance.config_setup(config)
|
||||
|
||||
@@ -6,95 +6,31 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Generator, Union, List, Type, TYPE_CHECKING
|
||||
from urllib.parse import urlparse
|
||||
from ipaddress import ip_address
|
||||
from copy import copy
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
from tempfile import TemporaryDirectory
|
||||
import traceback
|
||||
from copy import copy
|
||||
|
||||
from rich_argparse import RichHelpFormatter
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from .metadata import Metadata, Media
|
||||
from auto_archiver.version import __version__
|
||||
from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
|
||||
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_valid_config, \
|
||||
DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE
|
||||
from .module import ModuleFactory, LazyBaseModule
|
||||
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
|
||||
from .consts import MODULE_TYPES
|
||||
from loguru import logger
|
||||
from auto_archiver.utils.url import check_url_or_raise
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .base_module import BaseModule
|
||||
from .module import LazyBaseModule
|
||||
|
||||
DEFAULT_CONFIG_FILE = "orchestration.yaml"
|
||||
|
||||
|
||||
class JsonParseAction(argparse.Action):
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
try:
|
||||
setattr(namespace, self.dest, json.loads(values))
|
||||
except json.JSONDecodeError as e:
|
||||
raise argparse.ArgumentTypeError(f"Invalid JSON input for argument '{self.dest}': {e}")
|
||||
|
||||
|
||||
class AuthenticationJsonParseAction(JsonParseAction):
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
super().__call__(parser, namespace, values, option_string)
|
||||
auth_dict = getattr(namespace, self.dest)
|
||||
|
||||
def load_from_file(path):
|
||||
try:
|
||||
with open(path, 'r') as f:
|
||||
try:
|
||||
auth_dict = json.load(f)
|
||||
except json.JSONDecodeError:
|
||||
f.seek(0)
|
||||
# maybe it's yaml, try that
|
||||
auth_dict = _yaml.load(f)
|
||||
if auth_dict.get('authentication'):
|
||||
auth_dict = auth_dict['authentication']
|
||||
auth_dict['load_from_file'] = path
|
||||
return auth_dict
|
||||
except:
|
||||
return None
|
||||
|
||||
if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
|
||||
auth_dict = load_from_file(auth_dict['from_file'])
|
||||
elif isinstance(auth_dict, str):
|
||||
# if it's a string
|
||||
auth_dict = load_from_file(auth_dict)
|
||||
|
||||
if not isinstance(auth_dict, dict):
|
||||
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
|
||||
global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
|
||||
for key, auth in auth_dict.items():
|
||||
if key in global_options:
|
||||
continue
|
||||
if not isinstance(key, str) or not isinstance(auth, dict):
|
||||
raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
|
||||
|
||||
# extract out concatenated sites
|
||||
for key, val in copy(auth_dict).items():
|
||||
if "," in key:
|
||||
for site in key.split(","):
|
||||
auth_dict[site] = val
|
||||
del auth_dict[key]
|
||||
|
||||
setattr(namespace, self.dest, auth_dict)
|
||||
|
||||
|
||||
class UniqueAppendAction(argparse.Action):
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
for value in values:
|
||||
if value not in getattr(namespace, self.dest):
|
||||
getattr(namespace, self.dest).append(value)
|
||||
|
||||
|
||||
class SetupError(ValueError):
|
||||
pass
|
||||
class ArchivingOrchestrator:
|
||||
|
||||
# instance variables
|
||||
@@ -163,7 +99,7 @@ class ArchivingOrchestrator:
|
||||
# TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser'
|
||||
# but should we add them? Or should we just add them to the 'complete' parser?
|
||||
|
||||
if yaml_config != EMPTY_CONFIG:
|
||||
if is_valid_config(yaml_config):
|
||||
# only load the modules enabled in config
|
||||
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
|
||||
enabled_modules = []
|
||||
@@ -189,7 +125,13 @@ class ArchivingOrchestrator:
|
||||
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
|
||||
else:
|
||||
# load all modules, they're not using the 'simple' mode
|
||||
self.add_individual_module_args(self.module_factory.available_modules(), parser)
|
||||
all_modules = self.module_factory.available_modules()
|
||||
# add all the modules to the steps
|
||||
for module in all_modules:
|
||||
for module_type in module.type:
|
||||
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
|
||||
|
||||
self.add_individual_module_args(all_modules, parser)
|
||||
|
||||
parser.set_defaults(**to_dot_notation(yaml_config))
|
||||
|
||||
@@ -198,6 +140,9 @@ class ArchivingOrchestrator:
|
||||
# merge the new config with the old one
|
||||
config = merge_dicts(vars(parsed), yaml_config)
|
||||
|
||||
# set up the authentication dict as needed
|
||||
config = self.setup_authentication(config)
|
||||
|
||||
# clean out args from the base_parser that we don't want in the config
|
||||
for key in vars(basic_config):
|
||||
config.pop(key, None)
|
||||
@@ -286,14 +231,20 @@ class ArchivingOrchestrator:
|
||||
self.basic_parser.exit()
|
||||
|
||||
def setup_logging(self, config):
|
||||
|
||||
logging_config = config['logging']
|
||||
|
||||
if logging_config.get('enabled', True) is False:
|
||||
# disabled logging settings, they're set on a higher level
|
||||
logger.disable('auto_archiver')
|
||||
return
|
||||
|
||||
# setup loguru logging
|
||||
try:
|
||||
logger.remove(0) # remove the default logger
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
logging_config = config['logging']
|
||||
|
||||
# add other logging info
|
||||
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
|
||||
self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
|
||||
@@ -312,27 +263,25 @@ class ArchivingOrchestrator:
|
||||
|
||||
step_items = []
|
||||
modules_to_load = modules_by_type[f"{module_type}s"]
|
||||
assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
|
||||
if not modules_to_load:
|
||||
raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
|
||||
|
||||
def check_steps_ok():
|
||||
if not len(step_items):
|
||||
logger.error(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
|
||||
if len(modules_to_load):
|
||||
logger.error(f"Tried to load the following modules, but none were available: {modules_to_load}")
|
||||
exit()
|
||||
logger.error(f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}")
|
||||
raise SetupError(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
|
||||
|
||||
|
||||
if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1:
|
||||
logger.error(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
|
||||
exit()
|
||||
raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
|
||||
|
||||
for module in modules_to_load:
|
||||
if module == 'cli_feeder':
|
||||
# pseudo module, don't load it
|
||||
# cli_feeder is a pseudo module, it just takes the command line args for [URLS]
|
||||
urls = self.config['urls']
|
||||
if not urls:
|
||||
logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
|
||||
exit()
|
||||
# cli_feeder is a pseudo module, it just takes the command line args
|
||||
raise SetupError("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
|
||||
|
||||
def feed(self) -> Generator[Metadata]:
|
||||
for url in urls:
|
||||
@@ -352,13 +301,14 @@ class ArchivingOrchestrator:
|
||||
|
||||
if module in invalid_modules:
|
||||
continue
|
||||
|
||||
try:
|
||||
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
|
||||
except (KeyboardInterrupt, Exception) as e:
|
||||
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
|
||||
if module_type == 'extractor' and loaded_module.name == module:
|
||||
loaded_module.cleanup()
|
||||
exit()
|
||||
raise e
|
||||
|
||||
if not loaded_module:
|
||||
invalid_modules.append(module)
|
||||
@@ -372,7 +322,7 @@ class ArchivingOrchestrator:
|
||||
def load_config(self, config_file: str) -> dict:
|
||||
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
|
||||
logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
|
||||
exit()
|
||||
raise FileNotFoundError(f"Configuration file {config_file} not found")
|
||||
|
||||
return read_yaml(config_file)
|
||||
|
||||
@@ -437,8 +387,12 @@ class ArchivingOrchestrator:
|
||||
If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
|
||||
To test configurations without loading any modules, you can also first call 'setup_configs'
|
||||
"""
|
||||
self.setup(args)
|
||||
return self.feed()
|
||||
try:
|
||||
self.setup(args)
|
||||
return self.feed()
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
exit(1)
|
||||
|
||||
def cleanup(self) -> None:
|
||||
logger.info("Cleaning up")
|
||||
@@ -503,8 +457,8 @@ class ArchivingOrchestrator:
|
||||
|
||||
original_url = result.get_url().strip()
|
||||
try:
|
||||
self.assert_valid_url(original_url)
|
||||
except AssertionError as e:
|
||||
check_url_or_raise(original_url)
|
||||
except ValueError as e:
|
||||
logger.error(f"Error archiving URL {original_url}: {e}")
|
||||
raise e
|
||||
|
||||
@@ -564,26 +518,27 @@ class ArchivingOrchestrator:
|
||||
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def assert_valid_url(self, url: str) -> bool:
|
||||
def setup_authentication(self, config: dict) -> dict:
|
||||
"""
|
||||
Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.
|
||||
Setup authentication for all modules that require it
|
||||
|
||||
Split up strings into multiple sites if they are comma separated
|
||||
"""
|
||||
assert url.startswith("http://") or url.startswith("https://"), f"Invalid URL scheme"
|
||||
|
||||
parsed = urlparse(url)
|
||||
assert parsed.scheme in ["http", "https"], f"Invalid URL scheme"
|
||||
assert parsed.hostname, f"Invalid URL hostname"
|
||||
assert parsed.hostname != "localhost", f"Invalid URL"
|
||||
authentication = config.get('authentication', {})
|
||||
|
||||
try: # special rules for IP addresses
|
||||
ip = ip_address(parsed.hostname)
|
||||
except ValueError: pass
|
||||
else:
|
||||
assert ip.is_global, f"Invalid IP used"
|
||||
assert not ip.is_reserved, f"Invalid IP used"
|
||||
assert not ip.is_link_local, f"Invalid IP used"
|
||||
assert not ip.is_private, f"Invalid IP used"
|
||||
# extract out concatenated sites
|
||||
for key, val in copy(authentication).items():
|
||||
if "," in key:
|
||||
for site in key.split(","):
|
||||
site = site.strip()
|
||||
authentication[site] = val
|
||||
del authentication[key]
|
||||
|
||||
config['authentication'] = authentication
|
||||
return config
|
||||
|
||||
# Helper Properties
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# used as validators for config values. Should raise an exception if the value is invalid.
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
import json
|
||||
|
||||
def example_validator(value):
|
||||
if "example" not in value:
|
||||
@@ -16,4 +17,7 @@ def positive_number(value):
|
||||
def valid_file(value):
|
||||
if not Path(value).is_file():
|
||||
raise argparse.ArgumentTypeError(f"File '{value}' does not exist.")
|
||||
return value
|
||||
return value
|
||||
|
||||
def json_loader(cli_val):
|
||||
return json.loads(cli_val)
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"name": "Auto-Archiver API Database",
|
||||
"name": "Auto Archiver API Database",
|
||||
"type": ["database"],
|
||||
"entry_point": "api_db::AAApiDb",
|
||||
"requires_setup": True,
|
||||
@@ -39,7 +39,7 @@
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Provides integration with the Auto-Archiver API for querying and storing archival data.
|
||||
Provides integration with the Auto Archiver API for querying and storing archival data.
|
||||
|
||||
### Features
|
||||
- **API Integration**: Supports querying for existing archives and submitting results.
|
||||
@@ -49,6 +49,6 @@
|
||||
- **Optional Storage**: Archives results conditionally based on configuration.
|
||||
|
||||
### Setup
|
||||
Requires access to an Auto-Archiver API instance and a valid API token.
|
||||
Requires access to an Auto Archiver API instance and a valid API token.
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -280,6 +280,7 @@ class GenericExtractor(Extractor):
|
||||
|
||||
# set up auth
|
||||
auth = self.auth_for_site(url, extract_cookies=False)
|
||||
|
||||
# order of importance: username/password -> api_key -> cookie -> cookies_from_browser -> cookies_file
|
||||
if auth:
|
||||
if 'username' in auth and 'password' in auth:
|
||||
@@ -290,11 +291,11 @@ class GenericExtractor(Extractor):
|
||||
logger.debug(f'Using provided auth cookie for {url}')
|
||||
yt_dlp.utils.std_headers['cookie'] = auth['cookie']
|
||||
elif 'cookies_from_browser' in auth:
|
||||
logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}')
|
||||
logger.debug(f'Using extracted cookies from browser {auth["cookies_from_browser"]} for {url}')
|
||||
ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
|
||||
elif 'cookies_file' in auth:
|
||||
logger.debug(f'Using cookies from file {self.cookie_file} for {url}')
|
||||
ydl_options['cookiesfile'] = auth['cookies_file']
|
||||
logger.debug(f'Using cookies from file {auth["cookies_file"]} for {url}')
|
||||
ydl_options['cookiefile'] = auth['cookies_file']
|
||||
|
||||
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
||||
|
||||
|
||||
@@ -15,7 +15,8 @@
|
||||
"header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
|
||||
"service_account": {
|
||||
"default": "secrets/service_account.json",
|
||||
"help": "service account JSON file path",
|
||||
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
|
||||
"required": True,
|
||||
},
|
||||
"columns": {
|
||||
"default": {
|
||||
@@ -34,16 +35,16 @@
|
||||
"wacz": "wacz",
|
||||
"replaywebpage": "replaywebpage",
|
||||
},
|
||||
"help": "names of columns in the google sheet (stringified JSON object)",
|
||||
"type": "auto_archiver.utils.json_loader",
|
||||
"help": "Custom names for the columns in your Google sheet. If you don't want to use the default column names, change them with this setting",
|
||||
"type": "json_loader",
|
||||
},
|
||||
"allow_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
|
||||
"help": "A list of worksheet names that should be processed (overrides worksheet_block), leave empty so all are allowed",
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) explicitly block some worksheets from being processed",
|
||||
"help": "A list of worksheet names for worksheets that should be explicitly blocked from being processed",
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": True,
|
||||
@@ -64,8 +65,10 @@
|
||||
- Ensures only rows with valid URLs and unprocessed statuses are included for archival.
|
||||
- Supports organizing stored files into folder paths based on sheet and worksheet names.
|
||||
|
||||
### Notes
|
||||
- Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`.
|
||||
- Create the sheet using the template provided in the docs.
|
||||
### Setup
|
||||
- Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`.
|
||||
To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).
|
||||
- Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.
|
||||
- Customize the column names in your Google sheet using the `columns` configuration.
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -24,9 +24,8 @@ class GsheetsFeeder(Feeder):
|
||||
def setup(self) -> None:
|
||||
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
||||
# TODO mv to validators
|
||||
assert self.sheet or self.sheet_id, (
|
||||
"You need to define either a 'sheet' name or a 'sheet_id' in your manifest."
|
||||
)
|
||||
if not self.sheet and not self.sheet_id:
|
||||
raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.")
|
||||
|
||||
def open_sheet(self):
|
||||
if self.sheet:
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
"channel_invites": {
|
||||
"default": {},
|
||||
"help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
|
||||
"type": "auto_archiver.utils.json_loader",
|
||||
"type": "json_loader",
|
||||
}
|
||||
},
|
||||
"description": """
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
from .wacz_enricher import WaczExtractorEnricher
|
||||
@@ -0,0 +1 @@
|
||||
from .wacz_extractor_enricher import WaczExtractorEnricher
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "WACZ Enricher",
|
||||
"name": "WACZ Enricher (and Extractor)",
|
||||
"type": ["enricher", "extractor"],
|
||||
"entry_point": "wacz_enricher::WaczExtractorEnricher",
|
||||
"entry_point": "wacz_extractor_enricher::WaczExtractorEnricher",
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": [
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"name": "Wayback Machine Enricher",
|
||||
"name": "Wayback Machine Enricher (and Extractor)",
|
||||
"type": ["enricher", "extractor"],
|
||||
"entry_point": "wayback_extractor_enricher::WaybackExtractorEnricher",
|
||||
"requires_setup": True,
|
||||
|
||||
@@ -61,11 +61,7 @@ def random_str(length: int = 32) -> str:
|
||||
return str(uuid.uuid4()).replace("-", "")[:length]
|
||||
|
||||
|
||||
def json_loader(cli_val):
|
||||
return json.loads(cli_val)
|
||||
|
||||
|
||||
def calculate_file_hash(filename: str, hash_algo=hashlib.sha256, chunksize: int = 16000000) -> str:
|
||||
def calculate_file_hash(filename: str, hash_algo = hashlib.sha256, chunksize: int = 16000000) -> str:
|
||||
hash = hash_algo()
|
||||
with open(filename, "rb") as f:
|
||||
while True:
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import re
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
from ipaddress import ip_address
|
||||
|
||||
|
||||
AUTHWALL_URLS = [
|
||||
@@ -7,6 +8,43 @@ AUTHWALL_URLS = [
|
||||
re.compile(r"https:\/\/www\.instagram\.com"), # instagram
|
||||
]
|
||||
|
||||
|
||||
def check_url_or_raise(url: str) -> bool:
    """
    Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.

    :param url: the URL to validate
    :return: True when the URL passes every check
    :raises ValueError: if the URL does not start with http:// or https://,
        has no hostname, points at localhost, or uses an IP address that is
        not globally reachable, or is reserved, link-local or private
    """
    if not url.startswith(("http://", "https://")):
        raise ValueError(f"Invalid URL scheme for url {url}")

    parsed = urlparse(url)

    # A trailing dot only marks a name as fully qualified, so "localhost." and
    # "127.0.0.1." must be checked as "localhost" and "127.0.0.1"; without
    # this they would slip past both the localhost and the IP checks below.
    hostname = (parsed.hostname or "").rstrip(".")
    if not hostname:
        raise ValueError(f"Invalid URL hostname for url {url}")

    if hostname == "localhost":
        raise ValueError(f"Localhost URLs cannot be parsed for security reasons (for url {url})")

    if parsed.scheme not in ["http", "https"]:
        raise ValueError(f"Invalid URL scheme, only http and https supported (for url {url})")

    try:  # special rules for IP addresses
        ip = ip_address(hostname)
    except ValueError:
        # not an IP literal, so it is treated as a regular domain name
        return True

    if not ip.is_global:
        raise ValueError(f"IP address {ip} is not globally reachable")
    if ip.is_reserved:
        raise ValueError(f"Reserved IP address {ip} used")
    if ip.is_link_local:
        raise ValueError(f"Link-local IP address {ip} used")
    if ip.is_private:
        raise ValueError(f"Private IP address {ip} used")

    return True
|
||||
|
||||
def domain_for_url(url: str) -> str:
|
||||
"""
|
||||
SECURITY: parse the domain using urllib to avoid any potential security issues
|
||||
|
||||
Reference in New Issue
Block a user