diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 2d462e4..8f36c54 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -36,6 +36,7 @@ steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES # a dictionary of authentication information that can be used by extractors to login to website. # you can use a comma separated list for multiple domains on the same line (common usecase: x.com,twitter.com) # Common login 'types' are username/password, cookie, api key/token. +# There are two special keys for using cookies, they are: cookies_file and cookies_from_browser. # Some Examples: # facebook.com: # username: "my_username" @@ -163,6 +164,6 @@ def read_yaml(yaml_filename: str) -> CommentedMap: def store_yaml(config: CommentedMap, yaml_filename: str) -> None: config_to_save = deepcopy(config) - config.pop('urls', None) + config_to_save.pop('urls', None) with open(yaml_filename, "w", encoding="utf-8") as outf: yaml.dump(config_to_save, outf) \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 641f099..20212ce 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -128,6 +128,10 @@ class ArchivingOrchestrator: elif basic_config.mode == 'simple': simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup] self.add_module_args(simple_modules, parser) + + # for simple mode, we use the cli_feeder and any modules that don't require setup + yaml_config['steps']['feeders'] = ['cli_feeder'] + # add them to the config for module in simple_modules: for module_type in module.type: @@ -237,18 +241,18 @@ class ArchivingOrchestrator: if log_file := logging_config['file']: logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation']) - - def install_modules(self): + def install_modules(self, modules_by_type): """ - Swaps out the previous 'strings' in the config with the actual modules and loads them + Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the + orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type + are loaded, the program will exit with an error message. """ invalid_modules = [] for module_type in BaseModule.MODULE_TYPES: step_items = [] - modules_to_load = self.config['steps'][f"{module_type}s"] - + modules_to_load = modules_by_type[f"{module_type}s"] assert modules_to_load, f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)" def check_steps_ok(): @@ -264,9 +268,11 @@ class ArchivingOrchestrator: for module in modules_to_load: if module == 'cli_feeder': + # pseudo module, don't load it + breakpoint() urls = self.config['urls'] if not urls: - logger.error("No URLs provided. Please provide at least one URL to archive, or set up a feeder. Use --help for more information.") + logger.error("No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information.") exit() # cli_feeder is a pseudo module, it just takes the command line args def feed(self) -> Generator[Metadata]: @@ -330,7 +336,7 @@ class ArchivingOrchestrator: self.setup_complete_parser(basic_config, yaml_config, unused_args) logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========") - self.install_modules() + self.install_modules(self.config['steps']) # log out the modules that were loaded for module_type in BaseModule.MODULE_TYPES: