diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 8f36c54..9bb080f 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -15,15 +15,9 @@ from .module import BaseModule from typing import Any, List, Type, Tuple -yaml: YAML = YAML() +_yaml: YAML = YAML() -b = yaml.load(""" - # This is a comment - site.com,site2.com: - key: value - key2: value2 - """) -EMPTY_CONFIG = yaml.load(""" +EMPTY_CONFIG = _yaml.load(""" # Auto Archiver Configuration # Steps are the modules that will be run in the order they are defined @@ -149,7 +143,7 @@ def read_yaml(yaml_filename: str) -> CommentedMap: config = None try: with open(yaml_filename, "r", encoding="utf-8") as inf: - config = yaml.load(inf) + config = _yaml.load(inf) except FileNotFoundError: pass @@ -166,4 +160,4 @@ def store_yaml(config: CommentedMap, yaml_filename: str) -> None: config_to_save.pop('urls', None) with open(yaml_filename, "w", encoding="utf-8") as outf: - yaml.dump(config_to_save, outf) \ No newline at end of file + _yaml.dump(config_to_save, outf) \ No newline at end of file diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py index d20ea5e..a8d2ad4 100644 --- a/src/auto_archiver/core/metadata.py +++ b/src/auto_archiver/core/metadata.py @@ -44,6 +44,7 @@ class Metadata: if overwrite_left: if right.status and len(right.status): self.status = right.status + self._context.update(right._context) for k, v in right.metadata.items(): assert k not in self.metadata or type(v) == type(self.get(k)) if type(v) not in [dict, list, set] or k not in self.metadata: diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index a451443..bb5f9e3 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -20,7 +20,7 @@ from rich_argparse import RichHelpFormatter from .metadata import Metadata, Media from auto_archiver.version import __version__ -from 
.config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser +from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser from .module import available_modules, LazyBaseModule, get_module, setup_paths from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher from .module import BaseModule @@ -50,7 +50,7 @@ class AuthenticationJsonParseAction(JsonParseAction): auth_dict = json.load(f) except json.JSONDecodeError: # maybe it's yaml, try that - auth_dict = yaml.load(f) + auth_dict = _yaml.load(f) except: pass @@ -424,8 +424,8 @@ class ArchivingOrchestrator: cached_result = None for d in self.databases: d.started(result) - if (local_result := d.fetch(result)): - cached_result = (cached_result or Metadata()).merge(local_result) + if local_result := d.fetch(result): + cached_result = (cached_result or Metadata()).merge(local_result).merge(result) if cached_result: logger.debug("Found previously archived entry") for d in self.databases: diff --git a/src/auto_archiver/modules/api_db/__manifest__.py b/src/auto_archiver/modules/api_db/__manifest__.py index 698c2e4..8359174 100644 --- a/src/auto_archiver/modules/api_db/__manifest__.py +++ b/src/auto_archiver/modules/api_db/__manifest__.py @@ -1,7 +1,7 @@ { "name": "Auto-Archiver API Database", "type": ["database"], - "entry_point": "api_db:AAApiDb", + "entry_point": "api_db::AAApiDb", "requires_setup": True, "dependencies": { "python": ["requests", "loguru"], @@ -23,7 +23,7 @@ "default": None, "help": "which group of users have access to the archive in case public=false as author", }, - "allow_rearchive": { + "use_api_cache": { "default": True, "type": "bool", "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived", @@ -43,7 +43,7 @@ ### Features - **API Integration**: Supports querying for existing archives and 
submitting results. -- **Duplicate Prevention**: Avoids redundant archiving when `allow_rearchive` is disabled. +- **Duplicate Prevention**: Avoids redundant archiving when `use_api_cache` is enabled. - **Configurable**: Supports settings like API endpoint, authentication token, tags, and permissions. - **Tagging and Metadata**: Adds tags and manages metadata for archives. - **Optional Storage**: Archives results conditionally based on configuration. diff --git a/src/auto_archiver/modules/api_db/api_db.py b/src/auto_archiver/modules/api_db/api_db.py index e1f67ce..753ff3f 100644 --- a/src/auto_archiver/modules/api_db/api_db.py +++ b/src/auto_archiver/modules/api_db/api_db.py @@ -15,11 +15,11 @@ class AAApiDb(Database): """ query the database for the existence of this item. Helps avoid re-archiving the same URL multiple times. """ - if not self.allow_rearchive: return - + if not self.use_api_cache: return + params = {"url": item.get_url(), "limit": 15} headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"} - response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers) + response = requests.get(os.path.join(self.api_endpoint, "url/search"), params=params, headers=headers) if response.status_code == 200: if len(response.json()): @@ -30,21 +30,26 @@ class AAApiDb(Database): logger.error(f"AA API FAIL ({response.status_code}): {response.json()}") return False - - def done(self, item: Metadata, cached: bool=False) -> None: + def done(self, item: Metadata, cached: bool = False) -> None: """archival result ready - should be saved to DB""" if not self.store_results: return - if cached: + if cached: logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached") return logger.debug(f"saving archive of {item.get_url()} to the AA API.") - payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': 
list(self.tags)} + payload = { + 'author_id': self.author_id, + 'url': item.get_url(), + 'public': self.public, + 'group_id': self.group_id, + 'tags': list(self.tags), + 'result': item.to_json(), + } headers = {"Authorization": f"Bearer {self.api_token}"} - response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers) + response = requests.post(os.path.join(self.api_endpoint, "interop/submit-archive"), json=payload, headers=headers) - if response.status_code == 200: + if response.status_code == 201: logger.success(f"AA API: {response.json()}") else: logger.error(f"AA API FAIL ({response.status_code}): {response.json()}") - diff --git a/src/auto_archiver/modules/atlos_db/__manifest__.py b/src/auto_archiver/modules/atlos_db/__manifest__.py index 8f9473f..b9cabf2 100644 --- a/src/auto_archiver/modules/atlos_db/__manifest__.py +++ b/src/auto_archiver/modules/atlos_db/__manifest__.py @@ -1,7 +1,7 @@ { "name": "Atlos Database", "type": ["database"], - "entry_point": "atlos_db:AtlosDb", + "entry_point": "atlos_db::AtlosDb", "requires_setup": True, "dependencies": {"python": ["loguru", diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py index 682eb94..c19f2ae 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -109,6 +109,6 @@ class GsheetsDb(Database): gw: GWorksheet = gsheet.get("worksheet") row: int = gsheet.get("row") elif self.sheet_id: - print(self.sheet_id) + logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.") return gw, row diff --git a/src/auto_archiver/modules/telegram_extractor/__manifest__.py b/src/auto_archiver/modules/telegram_extractor/__manifest__.py index e1c49c2..cb0ee1e 100644 --- a/src/auto_archiver/modules/telegram_extractor/__manifest__.py +++ b/src/auto_archiver/modules/telegram_extractor/__manifest__.py @@ -13,7 +13,7 @@ 
The `TelegramExtractor` retrieves publicly available media content from Telegram message links without requiring login credentials. It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata` and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver` - is advised for more comprehensive functionality. + is advised for more comprehensive functionality and higher-quality media extraction. ### Features - Extracts images and videos from public Telegram message links (`t.me`).