Compare commits

...

2 Commits

Author SHA1 Message Date
Miguel Sozinho Ramalho
9297697ef5 makes orchestrator.run return the results to allow for code integration (#196) 2025-02-15 12:41:26 +00:00
Miguel Sozinho Ramalho
5614af3f63 removes fixed oscrypto dependency, it blocked pypi publishing (#195)
* disables tests on ubuntu-latest

* drops fixed oscrypto version for git commit

* version bump
2025-02-14 10:51:56 +00:00
4 changed files with 42 additions and 46 deletions

View File

@@ -16,7 +16,8 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
python-version: ["3.10", "3.11", "3.12"] python-version: ["3.10", "3.11", "3.12"]
os: [ubuntu-22.04, ubuntu-latest] os: [ubuntu-22.04]
# TODO: re-enable ubuntu-latest; it is disabled because oscrypto can no longer be pinned to a GitHub commit, as that pin blocked publishing to PyPI
defaults: defaults:
run: run:
working-directory: ./ working-directory: ./

14
poetry.lock generated
View File

@@ -1367,18 +1367,14 @@ description = "TLS (SSL) sockets, key generation, encryption, decryption, signin
optional = false optional = false
python-versions = "*" python-versions = "*"
groups = ["main"] groups = ["main"]
files = [] files = [
develop = false {file = "oscrypto-1.3.0-py2.py3-none-any.whl", hash = "sha256:2b2f1d2d42ec152ca90ccb5682f3e051fb55986e1b170ebde472b133713e7085"},
{file = "oscrypto-1.3.0.tar.gz", hash = "sha256:6f5fef59cb5b3708321db7cca56aed8ad7e662853351e7991fcf60ec606d47a4"},
]
[package.dependencies] [package.dependencies]
asn1crypto = ">=1.5.1" asn1crypto = ">=1.5.1"
[package.source]
type = "git"
url = "https://github.com/wbond/oscrypto.git"
reference = "d5f3437ed24257895ae1edd9e503cfb352e635a8"
resolved_reference = "d5f3437ed24257895ae1edd9e503cfb352e635a8"
[[package]] [[package]]
name = "outcome" name = "outcome"
version = "1.3.0.post0" version = "1.3.0.post0"
@@ -3168,4 +3164,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
[metadata] [metadata]
lock-version = "2.1" lock-version = "2.1"
python-versions = ">=3.10,<3.13" python-versions = ">=3.10,<3.13"
content-hash = "c2503c982b9362c3757f39432cdaa8375b45e2d4a0497fa80c2b82a65d1eedf7" content-hash = "b3a6142d6495bc4c8741e9411d29352af219851e4b84b263f991e1bb6db1614e"

View File

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[project] [project]
name = "auto-archiver" name = "auto-archiver"
version = "0.13.0" version = "0.13.2"
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)." description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
requires-python = ">=3.10,<3.13" requires-python = ">=3.10,<3.13"
@@ -22,7 +22,6 @@ classifiers = [
] ]
dependencies = [ dependencies = [
"oscrypto @ git+https://github.com/wbond/oscrypto.git@d5f3437ed24257895ae1edd9e503cfb352e635a8",
"gspread (>=0.0.0)", "gspread (>=0.0.0)",
"beautifulsoup4 (>=0.0.0)", "beautifulsoup4 (>=0.0.0)",
"bs4 (>=0.0.0)", "bs4 (>=0.0.0)",

View File

@@ -30,6 +30,7 @@ from loguru import logger
DEFAULT_CONFIG_FILE = "orchestration.yaml" DEFAULT_CONFIG_FILE = "orchestration.yaml"
class JsonParseAction(argparse.Action): class JsonParseAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None): def __call__(self, parser, namespace, values, option_string=None):
try: try:
@@ -60,6 +61,8 @@ class AuthenticationJsonParseAction(JsonParseAction):
if not isinstance(site, str) or not isinstance(auth, dict): if not isinstance(site, str) or not isinstance(auth, dict):
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods") raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
setattr(namespace, self.dest, auth_dict) setattr(namespace, self.dest, auth_dict)
class UniqueAppendAction(argparse.Action): class UniqueAppendAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None): def __call__(self, parser, namespace, values, option_string=None):
if not hasattr(namespace, self.dest): if not hasattr(namespace, self.dest):
@@ -68,6 +71,7 @@ class UniqueAppendAction(argparse.Action):
if value not in getattr(namespace, self.dest): if value not in getattr(namespace, self.dest):
getattr(namespace, self.dest).append(value) getattr(namespace, self.dest).append(value)
class ArchivingOrchestrator: class ArchivingOrchestrator:
feeders: List[Type[Feeder]] feeders: List[Type[Feeder]]
@@ -79,14 +83,14 @@ class ArchivingOrchestrator:
def setup_basic_parser(self): def setup_basic_parser(self):
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
prog="auto-archiver", prog="auto-archiver",
add_help=False, add_help=False,
description=""" description="""
Auto Archiver is a CLI tool to archive media/metadata from online URLs; Auto Archiver is a CLI tool to archive media/metadata from online URLs;
it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)! it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!
""", """,
epilog="Check the code at https://github.com/bellingcat/auto-archiver", epilog="Check the code at https://github.com/bellingcat/auto-archiver",
formatter_class=RichHelpFormatter, formatter_class=RichHelpFormatter,
) )
parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit') parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
parser.add_argument('--version', action='version', version=__version__) parser.add_argument('--version', action='version', version=__version__)
@@ -166,12 +170,11 @@ class ArchivingOrchestrator:
if not parser: if not parser:
parser = self.parser parser = self.parser
# allow passing URLs directly on the command line # allow passing URLs directly on the command line
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml') parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction) parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction)
parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction) parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction)
parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction) parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction)
parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction) parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction)
parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction) parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction)
@@ -180,7 +183,7 @@ class ArchivingOrchestrator:
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \ parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
(token, username etc.) that extractors can use to log into \ (token, username etc.) that extractors can use to log into \
a website. If passing this on the command line, use a JSON string. \ a website. If passing this on the command line, use a JSON string. \
You may also pass a path to a valid JSON/YAML file which will be parsed.',\ You may also pass a path to a valid JSON/YAML file which will be parsed.',
default={}, default={},
action=AuthenticationJsonParseAction) action=AuthenticationJsonParseAction)
# logging arguments # logging arguments
@@ -188,7 +191,6 @@ class ArchivingOrchestrator:
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None) parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None) parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None: def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
if not modules: if not modules:
@@ -225,7 +227,7 @@ class ArchivingOrchestrator:
def show_help(self, basic_config: dict): def show_help(self, basic_config: dict):
# for the help message, we want to load *all* possible modules and show the help # for the help message, we want to load *all* possible modules and show the help
# add configs as arg parser arguments # add configs as arg parser arguments
self.add_additional_args(self.basic_parser) self.add_additional_args(self.basic_parser)
self.add_module_args(parser=self.basic_parser) self.add_module_args(parser=self.basic_parser)
@@ -234,7 +236,7 @@ class ArchivingOrchestrator:
def setup_logging(self): def setup_logging(self):
# setup loguru logging # setup loguru logging
logger.remove(0) # remove the default logger logger.remove(0) # remove the default logger
logging_config = self.config['logging'] logging_config = self.config['logging']
logger.add(sys.stderr, level=logging_config['level']) logger.add(sys.stderr, level=logging_config['level'])
if log_file := logging_config['file']: if log_file := logging_config['file']:
@@ -273,6 +275,7 @@ class ArchivingOrchestrator:
logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.") logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.")
exit() exit()
# cli_feeder is a pseudo module, it just takes the command line args # cli_feeder is a pseudo module, it just takes the command line args
def feed(self) -> Generator[Metadata]: def feed(self) -> Generator[Metadata]:
for url in urls: for url in urls:
logger.debug(f"Processing URL: '{url}'") logger.debug(f"Processing URL: '{url}'")
@@ -285,7 +288,6 @@ class ArchivingOrchestrator:
})() })()
pseudo_module.__iter__ = feed pseudo_module.__iter__ = feed
step_items.append(pseudo_module) step_items.append(pseudo_module)
continue continue
@@ -316,7 +318,7 @@ class ArchivingOrchestrator:
return read_yaml(config_file) return read_yaml(config_file)
def run(self, args: list) -> None: def run(self, args: list) -> Generator[Metadata]:
self.setup_basic_parser() self.setup_basic_parser()
@@ -340,10 +342,10 @@ class ArchivingOrchestrator:
for module_type in BaseModule.MODULE_TYPES: for module_type in BaseModule.MODULE_TYPES:
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s"))) logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
for _ in self.feed(): for result in self.feed():
pass yield result
def cleanup(self)->None: def cleanup(self) -> None:
logger.info("Cleaning up") logger.info("Cleaning up")
for e in self.extractors: for e in self.extractors:
e.cleanup() e.cleanup()
@@ -393,7 +395,6 @@ class ArchivingOrchestrator:
m.tmp_dir = None m.tmp_dir = None
tmp_dir.cleanup() tmp_dir.cleanup()
def archive(self, result: Metadata) -> Union[Metadata, None]: def archive(self, result: Metadata) -> Union[Metadata, None]:
""" """
Runs the archiving process for a single URL Runs the archiving process for a single URL
@@ -480,7 +481,7 @@ class ArchivingOrchestrator:
assert parsed.hostname, f"Invalid URL hostname" assert parsed.hostname, f"Invalid URL hostname"
assert parsed.hostname != "localhost", f"Invalid URL" assert parsed.hostname != "localhost", f"Invalid URL"
try: # special rules for IP addresses try: # special rules for IP addresses
ip = ip_address(parsed.hostname) ip = ip_address(parsed.hostname)
except ValueError: pass except ValueError: pass
else: else:
@@ -489,7 +490,6 @@ class ArchivingOrchestrator:
assert not ip.is_link_local, f"Invalid IP used" assert not ip.is_link_local, f"Invalid IP used"
assert not ip.is_private, f"Invalid IP used" assert not ip.is_private, f"Invalid IP used"
# Helper Properties # Helper Properties
@property @property