From 2c3d1f591f4a721597e2cd9906c1cdc05db8a78e Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Mon, 10 Feb 2025 17:25:15 +0000 Subject: [PATCH] Separate setup() and module_setup(). --- src/auto_archiver/core/base_module.py | 4 ++++ src/auto_archiver/core/module.py | 1 + src/auto_archiver/modules/gdrive_storage/gdrive_storage.py | 4 +--- src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py | 3 +-- src/auto_archiver/modules/html_formatter/html_formatter.py | 3 +-- .../instagram_api_extractor/instagram_api_extractor.py | 3 +-- .../modules/instagram_extractor/instagram_extractor.py | 3 +-- .../instagram_tbot_extractor/instagram_tbot_extractor.py | 3 +-- src/auto_archiver/modules/s3_storage/s3_storage.py | 3 +-- .../modules/telethon_extractor/telethon_extractor.py | 3 +-- .../modules/twitter_api_extractor/twitter_api_extractor.py | 4 +--- src/auto_archiver/modules/vk_extractor/vk_extractor.py | 3 +-- src/auto_archiver/modules/wacz_enricher/wacz_enricher.py | 3 +-- .../modules/whisper_enricher/whisper_enricher.py | 3 +-- 14 files changed, 17 insertions(+), 26 deletions(-) diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index 5c6ecbb..95575e3 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -80,6 +80,10 @@ class BaseModule(ABC): for key, val in config.get(self.name, {}).items(): setattr(self, key, val) + def module_setup(self): + # For any additional setup required by modules, e.g. authentication + pass + def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]: """ Returns the authentication information for a given site. 
This is used to authenticate diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index f3fbec5..69f9fcc 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -242,6 +242,7 @@ class LazyBaseModule: default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default')) config[self.name] = default_config | config.get(self.name, {}) instance.setup(config) + instance.module_setup() return instance def __repr__(self): diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index 910f48b..51c13c2 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -19,9 +19,7 @@ from auto_archiver.core import Storage class GDriveStorage(Storage): - def setup(self, config: dict) -> None: - # Step 1: Call the BaseModule setup to dynamically assign configs - super().setup(config) + def module_setup(self) -> None: self.scopes = ['https://www.googleapis.com/auth/drive'] # Initialize Google Drive service self._setup_google_drive_service() diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index 50bf430..dd98032 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -21,8 +21,7 @@ from . 
import GWorksheet class GsheetsFeeder(Feeder): - def setup(self, config: dict): - super().setup(config) + def module_setup(self) -> None: self.gsheets_client = gspread.service_account(filename=self.service_account) # TODO mv to validators assert self.sheet or self.sheet_id, ( diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index 4da82c8..bbba097 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -17,9 +17,8 @@ class HtmlFormatter(Formatter): environment: Environment = None template: any = None - def setup(self, config: dict) -> None: + def module_setup(self) -> None: """Sets up the Jinja2 environment and loads the template.""" - super().setup(config) # Ensure the base class logic is executed template_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/") self.environment = Environment(loader=FileSystemLoader(template_dir), autoescape=True) diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py index 5dad0ba..367cc75 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py +++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py @@ -32,8 +32,7 @@ class InstagramAPIExtractor(Extractor): r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?" 
) - def setup(self, config: dict) -> None: - super().setup(config) + def module_setup(self) -> None: if self.api_endpoint[-1] == "/": self.api_endpoint = self.api_endpoint[:-1] diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py index 3cf0362..e4e210f 100644 --- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py +++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py @@ -25,8 +25,7 @@ class InstagramExtractor(Extractor): profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url)) # TODO: links to stories - def setup(self, config: dict) -> None: - super().setup(config) + def module_setup(self) -> None: self.insta = instaloader.Instaloader( download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}" diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py index 5660cd2..707dcc3 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py @@ -27,12 +27,11 @@ class InstagramTbotExtractor(Extractor): https://t.me/instagram_load_bot """ - def setup(self, configs) -> None: + def module_setup(self) -> None: """ 1. makes a copy of session_file that is removed in cleanup 2. 
checks if the session file is valid """ - super().setup(configs) logger.info(f"SETUP {self.name} checking login...") self._prepare_session_file() self._initialize_telegram_client() diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py index 2f85164..c77bbc3 100644 --- a/src/auto_archiver/modules/s3_storage/s3_storage.py +++ b/src/auto_archiver/modules/s3_storage/s3_storage.py @@ -13,8 +13,7 @@ NO_DUPLICATES_FOLDER = "no-dups/" class S3Storage(Storage): - def setup(self, config: dict) -> None: - super().setup(config) + def module_setup(self) -> None: self.s3 = boto3.client( 's3', region_name=self.region, diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 97d3e94..3762f01 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -18,14 +18,13 @@ class TelethonExtractor(Extractor): invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") - def setup(self, config: dict) -> None: + def module_setup(self) -> None: """ 1. makes a copy of session_file that is removed in cleanup 2. trigger login process for telegram or proceed if already saved in a session file 3. 
joins channel_invites where needed """ - super().setup(config) logger.info(f"SETUP {self.name} checking login...") # make a copy of the session that is used exclusively with this archiver instance diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py index 6573475..0b27e22 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py +++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py @@ -15,9 +15,7 @@ class TwitterApiExtractor(Extractor): valid_url: re.Pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") - def setup(self, config: dict) -> None: - super().setup(config) - + def module_setup(self) -> None: self.api_index = 0 self.apis = [] if len(self.bearer_tokens): diff --git a/src/auto_archiver/modules/vk_extractor/vk_extractor.py b/src/auto_archiver/modules/vk_extractor/vk_extractor.py index 2d09138..0d1fc04 100644 --- a/src/auto_archiver/modules/vk_extractor/vk_extractor.py +++ b/src/auto_archiver/modules/vk_extractor/vk_extractor.py @@ -12,8 +12,7 @@ class VkExtractor(Extractor): Currently only works for /wall posts """ - def setup(self, config: dict) -> None: - super().setup(config) + def module_setup(self) -> None: self.vks = VkScraper(self.username, self.password, session_file=self.session_file) def download(self, item: Metadata) -> Metadata: diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py index 1586b75..7d91f43 100644 --- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py +++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py @@ -18,8 +18,7 @@ class WaczExtractorEnricher(Enricher, Extractor): When used as an archiver it will extract the media from the .WACZ archive so it can be enriched. 
""" - def setup(self, configs) -> None: - super().setup(configs) + def module_setup(self) -> None: self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER') self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER') diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index a51ffc1..d83319e 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -13,8 +13,7 @@ class WhisperEnricher(Enricher): Only works if an S3 compatible storage is used """ - def setup(self, config: dict) -> None: - super().setup(config) + def module_setup(self) -> None: self.stores = self.config['steps']['storages'] self.s3 = get_module("s3_storage", self.config) if not "s3_storage" in self.stores: