mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9297697ef5 | ||
|
|
5614af3f63 |
3
.github/workflows/tests-core.yaml
vendored
3
.github/workflows/tests-core.yaml
vendored
@@ -16,7 +16,8 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
python-version: ["3.10", "3.11", "3.12"]
|
python-version: ["3.10", "3.11", "3.12"]
|
||||||
os: [ubuntu-22.04, ubuntu-latest]
|
os: [ubuntu-22.04]
|
||||||
|
#TODO: re-enable ubuntu-latest, this is disabled as oscrypto cannot be pinned to github commit and pushed to pypi
|
||||||
defaults:
|
defaults:
|
||||||
run:
|
run:
|
||||||
working-directory: ./
|
working-directory: ./
|
||||||
|
|||||||
14
poetry.lock
generated
14
poetry.lock
generated
@@ -1367,18 +1367,14 @@ description = "TLS (SSL) sockets, key generation, encryption, decryption, signin
|
|||||||
optional = false
|
optional = false
|
||||||
python-versions = "*"
|
python-versions = "*"
|
||||||
groups = ["main"]
|
groups = ["main"]
|
||||||
files = []
|
files = [
|
||||||
develop = false
|
{file = "oscrypto-1.3.0-py2.py3-none-any.whl", hash = "sha256:2b2f1d2d42ec152ca90ccb5682f3e051fb55986e1b170ebde472b133713e7085"},
|
||||||
|
{file = "oscrypto-1.3.0.tar.gz", hash = "sha256:6f5fef59cb5b3708321db7cca56aed8ad7e662853351e7991fcf60ec606d47a4"},
|
||||||
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
asn1crypto = ">=1.5.1"
|
asn1crypto = ">=1.5.1"
|
||||||
|
|
||||||
[package.source]
|
|
||||||
type = "git"
|
|
||||||
url = "https://github.com/wbond/oscrypto.git"
|
|
||||||
reference = "d5f3437ed24257895ae1edd9e503cfb352e635a8"
|
|
||||||
resolved_reference = "d5f3437ed24257895ae1edd9e503cfb352e635a8"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "outcome"
|
name = "outcome"
|
||||||
version = "1.3.0.post0"
|
version = "1.3.0.post0"
|
||||||
@@ -3168,4 +3164,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.1"
|
lock-version = "2.1"
|
||||||
python-versions = ">=3.10,<3.13"
|
python-versions = ">=3.10,<3.13"
|
||||||
content-hash = "c2503c982b9362c3757f39432cdaa8375b45e2d4a0497fa80c2b82a65d1eedf7"
|
content-hash = "b3a6142d6495bc4c8741e9411d29352af219851e4b84b263f991e1bb6db1614e"
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "auto-archiver"
|
name = "auto-archiver"
|
||||||
version = "0.13.0"
|
version = "0.13.2"
|
||||||
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
||||||
|
|
||||||
requires-python = ">=3.10,<3.13"
|
requires-python = ">=3.10,<3.13"
|
||||||
@@ -22,7 +22,6 @@ classifiers = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"oscrypto @ git+https://github.com/wbond/oscrypto.git@d5f3437ed24257895ae1edd9e503cfb352e635a8",
|
|
||||||
"gspread (>=0.0.0)",
|
"gspread (>=0.0.0)",
|
||||||
"beautifulsoup4 (>=0.0.0)",
|
"beautifulsoup4 (>=0.0.0)",
|
||||||
"bs4 (>=0.0.0)",
|
"bs4 (>=0.0.0)",
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ from loguru import logger
|
|||||||
|
|
||||||
DEFAULT_CONFIG_FILE = "orchestration.yaml"
|
DEFAULT_CONFIG_FILE = "orchestration.yaml"
|
||||||
|
|
||||||
|
|
||||||
class JsonParseAction(argparse.Action):
|
class JsonParseAction(argparse.Action):
|
||||||
def __call__(self, parser, namespace, values, option_string=None):
|
def __call__(self, parser, namespace, values, option_string=None):
|
||||||
try:
|
try:
|
||||||
@@ -60,6 +61,8 @@ class AuthenticationJsonParseAction(JsonParseAction):
|
|||||||
if not isinstance(site, str) or not isinstance(auth, dict):
|
if not isinstance(site, str) or not isinstance(auth, dict):
|
||||||
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
|
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
|
||||||
setattr(namespace, self.dest, auth_dict)
|
setattr(namespace, self.dest, auth_dict)
|
||||||
|
|
||||||
|
|
||||||
class UniqueAppendAction(argparse.Action):
|
class UniqueAppendAction(argparse.Action):
|
||||||
def __call__(self, parser, namespace, values, option_string=None):
|
def __call__(self, parser, namespace, values, option_string=None):
|
||||||
if not hasattr(namespace, self.dest):
|
if not hasattr(namespace, self.dest):
|
||||||
@@ -68,6 +71,7 @@ class UniqueAppendAction(argparse.Action):
|
|||||||
if value not in getattr(namespace, self.dest):
|
if value not in getattr(namespace, self.dest):
|
||||||
getattr(namespace, self.dest).append(value)
|
getattr(namespace, self.dest).append(value)
|
||||||
|
|
||||||
|
|
||||||
class ArchivingOrchestrator:
|
class ArchivingOrchestrator:
|
||||||
|
|
||||||
feeders: List[Type[Feeder]]
|
feeders: List[Type[Feeder]]
|
||||||
@@ -76,17 +80,17 @@ class ArchivingOrchestrator:
|
|||||||
databases: List[Type[Database]]
|
databases: List[Type[Database]]
|
||||||
storages: List[Type[Storage]]
|
storages: List[Type[Storage]]
|
||||||
formatters: List[Type[Formatter]]
|
formatters: List[Type[Formatter]]
|
||||||
|
|
||||||
def setup_basic_parser(self):
|
def setup_basic_parser(self):
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
prog="auto-archiver",
|
prog="auto-archiver",
|
||||||
add_help=False,
|
add_help=False,
|
||||||
description="""
|
description="""
|
||||||
Auto Archiver is a CLI tool to archive media/metadata from online URLs;
|
Auto Archiver is a CLI tool to archive media/metadata from online URLs;
|
||||||
it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!
|
it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!
|
||||||
""",
|
""",
|
||||||
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
|
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
|
||||||
formatter_class=RichHelpFormatter,
|
formatter_class=RichHelpFormatter,
|
||||||
)
|
)
|
||||||
parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
|
parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
|
||||||
parser.add_argument('--version', action='version', version=__version__)
|
parser.add_argument('--version', action='version', version=__version__)
|
||||||
@@ -130,7 +134,7 @@ class ArchivingOrchestrator:
|
|||||||
|
|
||||||
# for simple mode, we use the cli_feeder and any modules that don't require setup
|
# for simple mode, we use the cli_feeder and any modules that don't require setup
|
||||||
yaml_config['steps']['feeders'] = ['cli_feeder']
|
yaml_config['steps']['feeders'] = ['cli_feeder']
|
||||||
|
|
||||||
# add them to the config
|
# add them to the config
|
||||||
for module in simple_modules:
|
for module in simple_modules:
|
||||||
for module_type in module.type:
|
for module_type in module.type:
|
||||||
@@ -155,23 +159,22 @@ class ArchivingOrchestrator:
|
|||||||
|
|
||||||
if unknown:
|
if unknown:
|
||||||
logger.warning(f"Ignoring unknown/unused arguments: {unknown}\nPerhaps you don't have this module enabled?")
|
logger.warning(f"Ignoring unknown/unused arguments: {unknown}\nPerhaps you don't have this module enabled?")
|
||||||
|
|
||||||
if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
|
if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
|
||||||
logger.info(f"Storing configuration file to {basic_config.config_file}")
|
logger.info(f"Storing configuration file to {basic_config.config_file}")
|
||||||
store_yaml(self.config, basic_config.config_file)
|
store_yaml(self.config, basic_config.config_file)
|
||||||
|
|
||||||
return self.config
|
return self.config
|
||||||
|
|
||||||
def add_additional_args(self, parser: argparse.ArgumentParser = None):
|
def add_additional_args(self, parser: argparse.ArgumentParser = None):
|
||||||
if not parser:
|
if not parser:
|
||||||
parser = self.parser
|
parser = self.parser
|
||||||
|
|
||||||
|
|
||||||
# allow passing URLs directly on the command line
|
# allow passing URLs directly on the command line
|
||||||
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
|
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
|
||||||
|
|
||||||
parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction)
|
parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction)
|
||||||
parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction)
|
parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction)
|
||||||
parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction)
|
parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction)
|
||||||
parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction)
|
parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction)
|
||||||
parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction)
|
parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction)
|
||||||
@@ -180,7 +183,7 @@ class ArchivingOrchestrator:
|
|||||||
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
|
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
|
||||||
(token, username etc.) that extractors can use to log into \
|
(token, username etc.) that extractors can use to log into \
|
||||||
a website. If passing this on the command line, use a JSON string. \
|
a website. If passing this on the command line, use a JSON string. \
|
||||||
You may also pass a path to a valid JSON/YAML file which will be parsed.',\
|
You may also pass a path to a valid JSON/YAML file which will be parsed.',
|
||||||
default={},
|
default={},
|
||||||
action=AuthenticationJsonParseAction)
|
action=AuthenticationJsonParseAction)
|
||||||
# logging arguments
|
# logging arguments
|
||||||
@@ -188,7 +191,6 @@ class ArchivingOrchestrator:
|
|||||||
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
|
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
|
||||||
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
|
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
|
||||||
|
|
||||||
|
|
||||||
def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
|
def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
|
||||||
|
|
||||||
if not modules:
|
if not modules:
|
||||||
@@ -225,16 +227,16 @@ class ArchivingOrchestrator:
|
|||||||
|
|
||||||
def show_help(self, basic_config: dict):
|
def show_help(self, basic_config: dict):
|
||||||
# for the help message, we want to load *all* possible modules and show the help
|
# for the help message, we want to load *all* possible modules and show the help
|
||||||
# add configs as arg parser arguments
|
# add configs as arg parser arguments
|
||||||
|
|
||||||
self.add_additional_args(self.basic_parser)
|
self.add_additional_args(self.basic_parser)
|
||||||
self.add_module_args(parser=self.basic_parser)
|
self.add_module_args(parser=self.basic_parser)
|
||||||
self.basic_parser.print_help()
|
self.basic_parser.print_help()
|
||||||
self.basic_parser.exit()
|
self.basic_parser.exit()
|
||||||
|
|
||||||
def setup_logging(self):
|
def setup_logging(self):
|
||||||
# setup loguru logging
|
# setup loguru logging
|
||||||
logger.remove(0) # remove the default logger
|
logger.remove(0) # remove the default logger
|
||||||
logging_config = self.config['logging']
|
logging_config = self.config['logging']
|
||||||
logger.add(sys.stderr, level=logging_config['level'])
|
logger.add(sys.stderr, level=logging_config['level'])
|
||||||
if log_file := logging_config['file']:
|
if log_file := logging_config['file']:
|
||||||
@@ -246,7 +248,7 @@ class ArchivingOrchestrator:
|
|||||||
orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type
|
orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type
|
||||||
are loaded, the program will exit with an error message.
|
are loaded, the program will exit with an error message.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
invalid_modules = []
|
invalid_modules = []
|
||||||
for module_type in BaseModule.MODULE_TYPES:
|
for module_type in BaseModule.MODULE_TYPES:
|
||||||
|
|
||||||
@@ -273,6 +275,7 @@ class ArchivingOrchestrator:
|
|||||||
logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
|
logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
|
||||||
exit()
|
exit()
|
||||||
# cli_feeder is a pseudo module, it just takes the command line args
|
# cli_feeder is a pseudo module, it just takes the command line args
|
||||||
|
|
||||||
def feed(self) -> Generator[Metadata]:
|
def feed(self) -> Generator[Metadata]:
|
||||||
for url in urls:
|
for url in urls:
|
||||||
logger.debug(f"Processing URL: '{url}'")
|
logger.debug(f"Processing URL: '{url}'")
|
||||||
@@ -284,7 +287,6 @@ class ArchivingOrchestrator:
|
|||||||
'__iter__': feed
|
'__iter__': feed
|
||||||
|
|
||||||
})()
|
})()
|
||||||
|
|
||||||
|
|
||||||
pseudo_module.__iter__ = feed
|
pseudo_module.__iter__ = feed
|
||||||
step_items.append(pseudo_module)
|
step_items.append(pseudo_module)
|
||||||
@@ -308,7 +310,7 @@ class ArchivingOrchestrator:
|
|||||||
|
|
||||||
check_steps_ok()
|
check_steps_ok()
|
||||||
setattr(self, f"{module_type}s", step_items)
|
setattr(self, f"{module_type}s", step_items)
|
||||||
|
|
||||||
def load_config(self, config_file: str) -> dict:
|
def load_config(self, config_file: str) -> dict:
|
||||||
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
|
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
|
||||||
logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
|
logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
|
||||||
@@ -316,8 +318,8 @@ class ArchivingOrchestrator:
|
|||||||
|
|
||||||
return read_yaml(config_file)
|
return read_yaml(config_file)
|
||||||
|
|
||||||
def run(self, args: list) -> None:
|
def run(self, args: list) -> Generator[Metadata]:
|
||||||
|
|
||||||
self.setup_basic_parser()
|
self.setup_basic_parser()
|
||||||
|
|
||||||
# parse the known arguments for now (basically, we want the config file)
|
# parse the known arguments for now (basically, we want the config file)
|
||||||
@@ -340,10 +342,10 @@ class ArchivingOrchestrator:
|
|||||||
for module_type in BaseModule.MODULE_TYPES:
|
for module_type in BaseModule.MODULE_TYPES:
|
||||||
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
|
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
|
||||||
|
|
||||||
for _ in self.feed():
|
for result in self.feed():
|
||||||
pass
|
yield result
|
||||||
|
|
||||||
def cleanup(self)->None:
|
def cleanup(self) -> None:
|
||||||
logger.info("Cleaning up")
|
logger.info("Cleaning up")
|
||||||
for e in self.extractors:
|
for e in self.extractors:
|
||||||
e.cleanup()
|
e.cleanup()
|
||||||
@@ -393,7 +395,6 @@ class ArchivingOrchestrator:
|
|||||||
m.tmp_dir = None
|
m.tmp_dir = None
|
||||||
tmp_dir.cleanup()
|
tmp_dir.cleanup()
|
||||||
|
|
||||||
|
|
||||||
def archive(self, result: Metadata) -> Union[Metadata, None]:
|
def archive(self, result: Metadata) -> Union[Metadata, None]:
|
||||||
"""
|
"""
|
||||||
Runs the archiving process for a single URL
|
Runs the archiving process for a single URL
|
||||||
@@ -440,13 +441,13 @@ class ArchivingOrchestrator:
|
|||||||
try:
|
try:
|
||||||
result.merge(a.download(result))
|
result.merge(a.download(result))
|
||||||
if result.is_success(): break
|
if result.is_success(): break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
|
logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")
|
||||||
|
|
||||||
# 4 - call enrichers to work with archived content
|
# 4 - call enrichers to work with archived content
|
||||||
for e in self.enrichers:
|
for e in self.enrichers:
|
||||||
try: e.enrich(result)
|
try: e.enrich(result)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
|
logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
|
||||||
|
|
||||||
# 5 - store all downloaded/generated media
|
# 5 - store all downloaded/generated media
|
||||||
@@ -474,13 +475,13 @@ class ArchivingOrchestrator:
|
|||||||
Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.
|
Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.
|
||||||
"""
|
"""
|
||||||
assert url.startswith("http://") or url.startswith("https://"), f"Invalid URL scheme"
|
assert url.startswith("http://") or url.startswith("https://"), f"Invalid URL scheme"
|
||||||
|
|
||||||
parsed = urlparse(url)
|
parsed = urlparse(url)
|
||||||
assert parsed.scheme in ["http", "https"], f"Invalid URL scheme"
|
assert parsed.scheme in ["http", "https"], f"Invalid URL scheme"
|
||||||
assert parsed.hostname, f"Invalid URL hostname"
|
assert parsed.hostname, f"Invalid URL hostname"
|
||||||
assert parsed.hostname != "localhost", f"Invalid URL"
|
assert parsed.hostname != "localhost", f"Invalid URL"
|
||||||
|
|
||||||
try: # special rules for IP addresses
|
try: # special rules for IP addresses
|
||||||
ip = ip_address(parsed.hostname)
|
ip = ip_address(parsed.hostname)
|
||||||
except ValueError: pass
|
except ValueError: pass
|
||||||
else:
|
else:
|
||||||
@@ -489,9 +490,8 @@ class ArchivingOrchestrator:
|
|||||||
assert not ip.is_link_local, f"Invalid IP used"
|
assert not ip.is_link_local, f"Invalid IP used"
|
||||||
assert not ip.is_private, f"Invalid IP used"
|
assert not ip.is_private, f"Invalid IP used"
|
||||||
|
|
||||||
|
|
||||||
# Helper Properties
|
# Helper Properties
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def all_modules(self) -> List[Type[BaseModule]]:
|
def all_modules(self) -> List[Type[BaseModule]]:
|
||||||
return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters
|
return self.feeders + self.extractors + self.enrichers + self.databases + self.storages + self.formatters
|
||||||
|
|||||||
Reference in New Issue
Block a user