diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py index 858bdfd..ae4c41c 100644 --- a/src/auto_archiver/core/__init__.py +++ b/src/auto_archiver/core/__init__.py @@ -4,7 +4,6 @@ from .metadata import Metadata from .media import Media from .module import BaseModule -from .context import ArchivingContext # cannot import ArchivingOrchestrator/Config to avoid circular dep # from .orchestrator import ArchivingOrchestrator diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index a9a904f..2c1e8a3 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -56,6 +56,10 @@ class BaseModule(ABC): # this is set by the orchestrator prior to archiving tmp_dir: TemporaryDirectory = None + @property + def storages(self) -> list: + return self.config.get('storages', []) + def setup(self, config: dict): authentication = config.get('authentication', {}) @@ -75,9 +79,6 @@ class BaseModule(ABC): self.config = config for key, val in config.get(self.name, {}).items(): setattr(self, key, val) - - def repr(self): - return f"Module<'{self.display_name}' (config: {self.config[self.name]})>" def auth_for_site(self, site: str) -> dict: # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com) @@ -97,4 +98,7 @@ class BaseModule(ABC): did find information for '{key}' which is close, is this what you meant? 
\ If so, edit your authentication settings to make sure it exactly matches.") - return {} \ No newline at end of file + return {} + + def repr(self): + return f"Module<'{self.display_name}' (config: {self.config[self.name]})>" \ No newline at end of file diff --git a/src/auto_archiver/core/context.py b/src/auto_archiver/core/context.py deleted file mode 100644 index 0db5359..0000000 --- a/src/auto_archiver/core/context.py +++ /dev/null @@ -1,56 +0,0 @@ -""" ArchivingContext provides a global context for managing configurations and temporary data during the archiving process. - -This singleton class allows for: -- Storing and retrieving key-value pairs that are accessible throughout the application lifecycle. -- Marking certain values to persist across resets using `keep_on_reset`. -- Managing temporary directories and other shared data used during the archiving process. - -### Key Features: -- Creates a single global instance. -- Reset functionality allows for clearing configurations, with options for partial or full resets. -- Custom getters and setters for commonly used context values like temporary directories. - -""" - -class ArchivingContext: - """ - Singleton context class for managing global configurations and temporary data. 
- - ArchivingContext._get_instance() to retrieve it if needed - otherwise just - ArchivingContext.set(key, value) - and - ArchivingContext.get(key, default) - - When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True) - reset(full_reset=True) will recreate everything including the keep_on_reset status - """ - _instance = None - - def __init__(self): - self.configs = {} - self.keep_on_reset = set() - - @staticmethod - def get_instance(): - if ArchivingContext._instance is None: - ArchivingContext._instance = ArchivingContext() - return ArchivingContext._instance - - @staticmethod - def set(key, value, keep_on_reset: bool = False): - ac = ArchivingContext.get_instance() - ac.configs[key] = value - if keep_on_reset: ac.keep_on_reset.add(key) - - @staticmethod - def get(key: str, default=None): - return ArchivingContext.get_instance().configs.get(key, default) - - @staticmethod - def reset(full_reset: bool = False): - ac = ArchivingContext.get_instance() - if full_reset: ac.keep_on_reset = set() - ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset} - - # ---- custom getters/setters for widely used context values \ No newline at end of file diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py index b0d80bc..98f1370 100644 --- a/src/auto_archiver/core/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -17,7 +17,7 @@ from loguru import logger from retrying import retry import re -from ..core import Metadata, ArchivingContext, BaseModule +from ..core import Metadata, BaseModule class Extractor(BaseModule): diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py index e5026af..2cb6fc9 100644 --- a/src/auto_archiver/core/media.py +++ b/src/auto_archiver/core/media.py @@ -11,8 +11,6 @@ from dataclasses import dataclass, field from dataclasses_json import dataclass_json, config import mimetypes -from .context import ArchivingContext - from loguru import 
logger @@ -36,12 +34,11 @@ class Media: _mimetype: str = None # eg: image/jpeg _stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True)) # always exclude - def store(self: Media, override_storages: List = None, url: str = "url-not-available", metadata: Any = None): + def store(self: Media, metadata: Any, url: str = "url-not-available", storages: List[Any] = None) -> None: # 'Any' typing for metadata to avoid circular imports. Stores the media # into the provided/available storages [Storage] repeats the process for # its properties, in case they have inner media themselves for now it # only goes down 1 level but it's easy to make it recursive if needed. - storages = override_storages or ArchivingContext.get("storages") - if not len(storages): + if not storages: # truthiness check: storages defaults to None and len(None) raises TypeError logger.warning(f"No storages found in local context or provided directly for {self.filename}.") return @@ -66,8 +63,9 @@ class Media: for inner_media in prop_media.all_inner_media(include_self=True): yield inner_media - def is_stored(self) -> bool: - return len(self.urls) > 0 and len(self.urls) == len(ArchivingContext.get("storages")) + def is_stored(self, in_storage) -> bool: + # checks if the media is already stored in the given storage; get_cdn_url takes the media itself, as in Storage.store + return len(self.urls) > 0 and any(in_storage.get_cdn_url(self) in u for u in self.urls) def set(self, key: str, value: Any) -> Media: self.properties[key] = value diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py index 04683dd..d20ea5e 100644 --- a/src/auto_archiver/core/metadata.py +++ b/src/auto_archiver/core/metadata.py @@ -20,8 +20,6 @@ from dateutil.parser import parse as parse_dt from loguru import logger from .media import Media -from .context import ArchivingContext - @dataclass_json # annotation order matters @dataclass @@ -32,6 +30,7 @@ class Metadata: def __post_init__(self): self.set("_processed_at", datetime.datetime.now(datetime.timezone.utc)) + self._context = {} def merge(self: Metadata, right: Metadata, 
overwrite_left=True) -> Metadata: """ @@ -57,12 +56,11 @@ class Metadata: return right.merge(self) return self - def store(self: Metadata, override_storages: List = None): + def store(self, storages: List = None): # None instead of [] so no default list object is shared between calls # calls .store for all contained media. storages [Storage] self.remove_duplicate_media_by_hash() - storages = override_storages or ArchivingContext.get("storages") for media in self.media: - media.store(override_storages=storages, url=self.get_url(), metadata=self) + media.store(url=self.get_url(), metadata=self, storages=storages or []) def set(self, key: str, val: Any) -> Metadata: self.metadata[key] = val @@ -206,3 +204,10 @@ class Metadata: if len(r.media) > len(most_complete.media): most_complete = r elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata): most_complete = r return most_complete + + def set_context(self, key: str, val: Any) -> Metadata: + self._context[key] = val + return self + + def get_context(self, key: str, default: Any = None) -> Any: + return self._context.get(key, default) \ No newline at end of file diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index 501f238..dec67e1 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -43,7 +43,6 @@ def setup_paths(paths: list[str]) -> None: # sort based on the length of the path, so that the longest path is last in the list auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True) - def get_module(module_name: str, config: dict) -> BaseModule: """ Gets and sets up a module using the provided config @@ -69,6 +68,7 @@ def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBa return module def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]: + # search through all valid 'modules' paths. 
Default is 'modules' in the current directory # see odoo/modules/module.py -> get_modules diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index ad11849..f046bfe 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -17,9 +17,8 @@ import traceback from rich_argparse import RichHelpFormatter -from .context import ArchivingContext -from .metadata import Metadata +from .metadata import Metadata, Media from ..version import __version__ from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser from .module import available_modules, LazyBaseModule, get_module, setup_paths @@ -268,7 +267,6 @@ class ArchivingOrchestrator: for url in urls: logger.debug(f"Processing URL: '{url}'") yield Metadata().set_url(url) - ArchivingContext.set("folder", "cli") pseudo_module = type('CLIFeeder', (Feeder,), { 'name': 'cli_feeder', @@ -297,9 +295,6 @@ class ArchivingOrchestrator: continue if loaded_module: step_items.append(loaded_module) - # TODO temp solution - if module_type == "storage": - ArchivingContext.set("storages", step_items, keep_on_reset=True) check_steps_ok() self.config['steps'][f"{module_type}s"] = step_items @@ -449,11 +444,12 @@ class ArchivingOrchestrator: logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}") # 5 - store all downloaded/generated media - result.store() + result.store(storages=self.storages) # 6 - format and store formatted if needed + final_media: Media if final_media := self.formatters[0].format(result): - final_media.store(url=url, metadata=result) + final_media.store(url=url, metadata=result, storages=self.storages) result.set_final_media(final_media) if result.is_empty(): diff --git a/src/auto_archiver/core/storage.py b/src/auto_archiver/core/storage.py index b40c5cc..9373ff9 100644 --- a/src/auto_archiver/core/storage.py +++ b/src/auto_archiver/core/storage.py @@ -8,16 +8,16 @@ from 
slugify import slugify from auto_archiver.utils.misc import random_str -from auto_archiver.core import Media, BaseModule, ArchivingContext, Metadata +from auto_archiver.core import Media, BaseModule, Metadata from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher - +from auto_archiver.core.module import get_module class Storage(BaseModule): - def store(self, media: Media, url: str, metadata: Optional[Metadata]=None) -> None: - if media.is_stored(): + def store(self, media: Media, url: str, metadata: Metadata=None) -> None: + if media.is_stored(in_storage=self): logger.debug(f"{media.key} already stored, skipping") return - self.set_key(media, url) + self.set_key(media, url, metadata) self.upload(media, metadata=metadata) media.add_url(self.get_cdn_url(media)) @@ -32,30 +32,31 @@ class Storage(BaseModule): with open(media.filename, 'rb') as f: return self.uploadf(f, media, **kwargs) - def set_key(self, media: Media, url) -> None: + def set_key(self, media: Media, url, metadata: Metadata = None) -> None: """takes the media and optionally item info and generates a key""" if media.key is not None and len(media.key) > 0: return - folder = ArchivingContext.get("folder", "") + folder = metadata.get_context("folder", "") if metadata else "" # feeders publish the folder via Metadata.set_context; metadata may be None filename, ext = os.path.splitext(media.filename) # Handle path_generator logic - path_generator = ArchivingContext.get("path_generator", "url") + path_generator = self.config.get("path_generator", "url") if path_generator == "flat": path = "" filename = slugify(filename) # Ensure filename is slugified elif path_generator == "url": path = slugify(url) elif path_generator == "random": - path = ArchivingContext.get("random_path", random_str(24), True) + path = self.config.get("random_path", random_str(24)) # dict.get accepts only (key, default) else: raise ValueError(f"Invalid path_generator: {path_generator}") # Handle filename_generator logic - filename_generator = ArchivingContext.get("filename_generator", "random") + filename_generator = self.config.get("filename_generator", "random") if 
filename_generator == "random": filename = random_str(24) elif filename_generator == "static": - he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}}) + # load the hash_enricher module (get_module looks modules up by name, not by class) + he = get_module("hash_enricher", self.config) hd = he.calculate_hash(media.filename) filename = hd[:24] else: diff --git a/src/auto_archiver/feeders/csv_feeder.py b/src/auto_archiver/feeders/csv_feeder.py index e9da518..b1aedb7 100644 --- a/src/auto_archiver/feeders/csv_feeder.py +++ b/src/auto_archiver/feeders/csv_feeder.py @@ -2,7 +2,7 @@ from loguru import logger import csv from . import Feeder -from ..core import Metadata, ArchivingContext +from ..core import Metadata from ..utils import url_or_none class CSVFeeder(Feeder): @@ -34,5 +34,4 @@ class CSVFeeder(Feeder): for row in reader: url = row[0] logger.debug(f"Processing {url}") - yield Metadata().set_url(url) - ArchivingContext.set("folder", "cli") \ No newline at end of file + yield Metadata().set_url(url) \ No newline at end of file diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py index 1cd9022..15dfa85 100644 --- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py +++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py @@ -2,7 +2,7 @@ from loguru import logger import csv from auto_archiver.core import Feeder -from auto_archiver.core import Metadata, ArchivingContext +from auto_archiver.core import Metadata from auto_archiver.utils import url_or_none class CSVFeeder(Feeder): @@ -19,5 +19,4 @@ class CSVFeeder(Feeder): for row in reader: url = row[0] logger.debug(f"Processing {url}") - yield Metadata().set_url(url) - ArchivingContext.set("folder", "cli") \ No newline at end of file + yield Metadata().set_url(url) \ No newline at end of file diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 
2879c05..4838489 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -6,7 +6,7 @@ from yt_dlp.extractor.common import InfoExtractor from loguru import logger from auto_archiver.core.extractor import Extractor -from ...core import Metadata, Media, ArchivingContext +from ...core import Metadata, Media class GenericExtractor(Extractor): _dropins = {} diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py index e7e8e5c..5e1ed1e 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -6,7 +6,7 @@ from urllib.parse import quote from loguru import logger from auto_archiver.core import Database -from auto_archiver.core import Metadata, Media, ArchivingContext +from auto_archiver.core import Metadata, Media from auto_archiver.modules.gsheet_feeder import GWorksheet @@ -93,8 +93,7 @@ class GsheetsDb(Database): logger.debug(f"Unable to update sheet: {e}") def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: - # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now - if gsheet := ArchivingContext.get("gsheet"): + if gsheet := item.get_context("gsheet"): gw: GWorksheet = gsheet.get("worksheet") row: int = gsheet.get("row") elif self.sheet_id: diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index 235dd63..d129182 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -15,7 +15,7 @@ from loguru import logger from slugify import slugify from auto_archiver.core import Feeder -from auto_archiver.core import Metadata, ArchivingContext +from 
auto_archiver.core import Metadata from . import GWorksheet @@ -60,17 +60,15 @@ class GsheetsFeeder(Feeder): # All checks done - archival process starts here m = Metadata().set_url(url) - ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True) if gw.get_cell_or_default(row, 'folder', "") is None: folder = '' else: folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip()) - if len(folder): - if self.use_sheet_names_in_stored_paths: - ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True) - else: - ArchivingContext.set("folder", folder, True) + if len(folder) and self.use_sheet_names_in_stored_paths: + folder = os.path.join(folder, slugify(self.sheet), slugify(wks.title)) + m.set_context('folder', folder) + m.set_context('gsheet', {"row": row, "worksheet": gw}) # key must be 'gsheet': GsheetsDb._retrieve_gsheet reads item.get_context("gsheet") yield m logger.success(f'Finished worksheet {wks.title}') diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py index 94b5dce..58c6abe 100644 --- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -11,7 +11,7 @@ import hashlib from loguru import logger from auto_archiver.core import Enricher -from auto_archiver.core import Metadata, ArchivingContext +from auto_archiver.core import Metadata class HashEnricher(Enricher): diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py index 791b9c0..5b49484 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py @@ -16,7 +16,7 @@ from loguru import logger from telethon.sync import TelegramClient from auto_archiver.core import Extractor -from auto_archiver.core import Metadata, Media, ArchivingContext +from 
auto_archiver.core import Metadata, Media from auto_archiver.utils import random_str @@ -61,7 +61,7 @@ class InstagramTbotExtractor(Extractor): if not "instagram.com" in url: return False result = Metadata() - tmp_dir = ArchivingContext.get_tmp_dir() + tmp_dir = self.tmp_dir with self.client.start(): chat = self.client.get_entity("instagram_load_bot") since_id = self.client.send_message(entity=chat, message=url).id diff --git a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py index 76784fa..b429163 100644 --- a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py +++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py @@ -4,7 +4,7 @@ from urllib.parse import urlparse from loguru import logger from auto_archiver.core import Enricher -from auto_archiver.core import Metadata, ArchivingContext, Media +from auto_archiver.core import Metadata, Media class SSLEnricher(Enricher): diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index b8fe634..8ca2131 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -3,7 +3,7 @@ import requests, time from loguru import logger from auto_archiver.core import Enricher -from auto_archiver.core import Metadata, Media, ArchivingContext +from auto_archiver.core import Metadata, Media from auto_archiver.modules.s3_storage import S3Storage from auto_archiver.core.module import get_module @@ -25,7 +25,7 @@ class WhisperEnricher(Enricher): job_results = {} for i, m in enumerate(to_enrich.media): if m.is_video() or m.is_audio(): - m.store(url=url, metadata=to_enrich) + m.store(url=url, metadata=to_enrich, storages=self.storages) try: job_id = self.submit_job(m) job_results[job_id] = False @@ -110,7 +110,7 @@ class WhisperEnricher(Enricher): def _get_s3_storage(self) -> S3Storage: try: - return 
next(s for s in ArchivingContext.get("storages") if s.__class__ == S3Storage) + return next(s for s in self.storages if s.__class__ == S3Storage) except: logger.warning("No S3Storage instance found in storages") return diff --git a/tests/__init__.py b/tests/__init__.py index 31f38cb..e69de29 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,5 +0,0 @@ -import tempfile - -from auto_archiver.core.context import ArchivingContext - -ArchivingContext.reset(full_reset=True) \ No newline at end of file