From ed81dcdaf081613b44035fae9d2b9de9d6fbc5b1 Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Mon, 10 Feb 2025 23:07:03 +0000
Subject: [PATCH 1/9] Remove dangling 'b = ' from config.py

---
 src/auto_archiver/core/config.py       | 14 ++++----------
 src/auto_archiver/core/orchestrator.py |  4 ++--
 2 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py
index 8f36c54..9bb080f 100644
--- a/src/auto_archiver/core/config.py
+++ b/src/auto_archiver/core/config.py
@@ -15,15 +15,9 @@ from .module import BaseModule
 
 from typing import Any, List, Type, Tuple
 
-yaml: YAML = YAML()
+_yaml: YAML = YAML()
 
-b = yaml.load("""
-          # This is a comment
-          site.com,site2.com:
-            key: value
-            key2: value2
-          """)
-EMPTY_CONFIG = yaml.load("""
+EMPTY_CONFIG = _yaml.load("""
 # Auto Archiver Configuration
 # Steps are the modules that will be run in the order they are defined
 
@@ -149,7 +143,7 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
     config = None
     try:
         with open(yaml_filename, "r", encoding="utf-8") as inf:
-            config = yaml.load(inf)
+            config = _yaml.load(inf)
     except FileNotFoundError:
         pass
 
@@ -166,4 +160,4 @@ def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
 
     config_to_save.pop('urls', None)
     with open(yaml_filename, "w", encoding="utf-8") as outf:
-        yaml.dump(config_to_save, outf)
\ No newline at end of file
+        _yaml.dump(config_to_save, outf)
\ No newline at end of file
diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py
index a451443..473f882 100644
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -20,7 +20,7 @@ from rich_argparse import RichHelpFormatter
 
 from .metadata import Metadata, Media
 from auto_archiver.version import __version__
-from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
+from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
 from .module import available_modules, LazyBaseModule, get_module, setup_paths
 from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
 from .module import BaseModule
@@ -50,7 +50,7 @@ class AuthenticationJsonParseAction(JsonParseAction):
                         auth_dict = json.load(f)
                     except json.JSONDecodeError:
                         # maybe it's yaml, try that
-                        auth_dict = yaml.load(f)
+                        auth_dict = _yaml.load(f)
             except:
                 pass
 

From 7309cd32e7df6ebf21b32ed0cba288ba8ecea297 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 11 Feb 2025 12:51:17 +0000
Subject: [PATCH 2/9] fix: context to be updated on Metadata.merge

---
 src/auto_archiver/core/metadata.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py
index d20ea5e..a8d2ad4 100644
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@@ -44,6 +44,7 @@ class Metadata:
         if overwrite_left:
             if right.status and len(right.status):
                 self.status = right.status
+            self._context.update(right._context)
             for k, v in right.metadata.items():
                 assert k not in self.metadata or type(v) == type(self.get(k))
                 if type(v) not in [dict, list, set] or k not in self.metadata:

From e6594ad3dcb1f8e95919b1ef8a632ea321f7be7a Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 11 Feb 2025 12:52:42 +0000
Subject: [PATCH 3/9] merge result into cached results for context preservation

---
 src/auto_archiver/core/orchestrator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py
index 473f882..bb5f9e3 100644
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -424,8 +424,8 @@ class ArchivingOrchestrator:
         cached_result = None
         for d in self.databases:
             d.started(result)
-            if (local_result := d.fetch(result)):
-                cached_result = (cached_result or Metadata()).merge(local_result)
+            if local_result := d.fetch(result):
+                cached_result = (cached_result or Metadata()).merge(local_result).merge(result)
         if cached_result:
             logger.debug("Found previously archived entry")
             for d in self.databases:

From 6fdd5f0e662293731ffe435d41b1d5e93d094cec Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 11 Feb 2025 12:53:12 +0000
Subject: [PATCH 4/9] fix cases of single : vs :: in entrypoint

---
 src/auto_archiver/modules/api_db/__manifest__.py   | 2 +-
 src/auto_archiver/modules/atlos_db/__manifest__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/auto_archiver/modules/api_db/__manifest__.py b/src/auto_archiver/modules/api_db/__manifest__.py
index 698c2e4..19129a4 100644
--- a/src/auto_archiver/modules/api_db/__manifest__.py
+++ b/src/auto_archiver/modules/api_db/__manifest__.py
@@ -1,7 +1,7 @@
 {
     "name": "Auto-Archiver API Database",
     "type": ["database"],
-    "entry_point": "api_db:AAApiDb",
+    "entry_point": "api_db::AAApiDb",
     "requires_setup": True,
     "dependencies": {
         "python": ["requests", "loguru"],
diff --git a/src/auto_archiver/modules/atlos_db/__manifest__.py b/src/auto_archiver/modules/atlos_db/__manifest__.py
index 8f9473f..b9cabf2 100644
--- a/src/auto_archiver/modules/atlos_db/__manifest__.py
+++ b/src/auto_archiver/modules/atlos_db/__manifest__.py
@@ -1,7 +1,7 @@
 {
     "name": "Atlos Database",
     "type": ["database"],
-    "entry_point": "atlos_db:AtlosDb",
+    "entry_point": "atlos_db::AtlosDb",
     "requires_setup": True,
     "dependencies":
         {"python": ["loguru",

From 4eeb39477c4b3cf81be680fddbaa3ce91bfad8a1 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 11 Feb 2025 12:53:46 +0000
Subject: [PATCH 5/9] improves gsheetdb feedback on retrieve sheet failure

---
 src/auto_archiver/modules/gsheet_db/gsheet_db.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py
index 5e1ed1e..5b270bf 100644
--- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py
+++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py
@@ -97,6 +97,6 @@ class GsheetsDb(Database):
             gw: GWorksheet = gsheet.get("worksheet")
             row: int = gsheet.get("row")
         elif self.sheet_id:
-            print(self.sheet_id)
+            logger.error(f"Unable to retrieve Gsheet for {item.get_url()}, GsheetDB must be used alongside GsheetFeeder.")
 
         return gw, row

From 5c590292212ffb3aeec56b11b0d854ad993be8e7 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 11 Feb 2025 12:53:58 +0000
Subject: [PATCH 6/9] updates api_db for new API endpoint

---
 src/auto_archiver/modules/api_db/api_db.py | 23 +++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/auto_archiver/modules/api_db/api_db.py b/src/auto_archiver/modules/api_db/api_db.py
index e1f67ce..374e755 100644
--- a/src/auto_archiver/modules/api_db/api_db.py
+++ b/src/auto_archiver/modules/api_db/api_db.py
@@ -16,10 +16,10 @@ class AAApiDb(Database):
             Helps avoid re-archiving the same URL multiple times.
         """
         if not self.allow_rearchive: return
-        
+
         params = {"url": item.get_url(), "limit": 15}
         headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
-        response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers)
+        response = requests.get(os.path.join(self.api_endpoint, "url/search"), params=params, headers=headers)
 
         if response.status_code == 200:
             if len(response.json()):
@@ -30,21 +30,26 @@ class AAApiDb(Database):
             logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
         return False
 
-
-    def done(self, item: Metadata, cached: bool=False) -> None:
+    def done(self, item: Metadata, cached: bool = False) -> None:
         """archival result ready - should be saved to DB"""
         if not self.store_results: return
-        if cached: 
+        if cached:
             logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
             return
         logger.debug(f"saving archive of {item.get_url()} to the AA API.")
 
-        payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
+        payload = {
+            'author_id': self.author_id,
+            'url': item.get_url(),
+            'public': self.public,
+            'group_id': self.group_id,
+            'tags': list(self.tags),
+            'result': item.to_json(),
+        }
         headers = {"Authorization": f"Bearer {self.api_token}"}
-        response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers)
+        response = requests.post(os.path.join(self.api_endpoint, "interop/submit-archive"), json=payload, headers=headers)
 
-        if response.status_code == 200:
+        if response.status_code == 201:
             logger.success(f"AA API: {response.json()}")
         else:
             logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
-

From 977f06c37a159a9557170409f726530b0903f0e9 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 11 Feb 2025 12:56:33 +0000
Subject: [PATCH 7/9] renames api_db property for clarity

---
 src/auto_archiver/modules/api_db/__manifest__.py | 4 ++--
 src/auto_archiver/modules/api_db/api_db.py       | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/auto_archiver/modules/api_db/__manifest__.py b/src/auto_archiver/modules/api_db/__manifest__.py
index 19129a4..8359174 100644
--- a/src/auto_archiver/modules/api_db/__manifest__.py
+++ b/src/auto_archiver/modules/api_db/__manifest__.py
@@ -23,7 +23,7 @@
             "default": None,
             "help": "which group of users have access to the archive in case public=false as author",
         },
-        "allow_rearchive": {
+        "use_api_cache": {
             "default": True,
             "type": "bool",
             "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived",
@@ -43,7 +43,7 @@
 
 ### Features
 - **API Integration**: Supports querying for existing archives and submitting results.
-- **Duplicate Prevention**: Avoids redundant archiving when `allow_rearchive` is disabled.
+- **Duplicate Prevention**: Avoids redundant archiving when `use_api_cache` is enabled.
 - **Configurable**: Supports settings like API endpoint, authentication token, tags, and permissions.
 - **Tagging and Metadata**: Adds tags and manages metadata for archives.
 - **Optional Storage**: Archives results conditionally based on configuration.
diff --git a/src/auto_archiver/modules/api_db/api_db.py b/src/auto_archiver/modules/api_db/api_db.py
index 374e755..753ff3f 100644
--- a/src/auto_archiver/modules/api_db/api_db.py
+++ b/src/auto_archiver/modules/api_db/api_db.py
@@ -15,7 +15,7 @@ class AAApiDb(Database):
         """ query the database for the existence of this item.
             Helps avoid re-archiving the same URL multiple times.
         """
-        if not self.allow_rearchive: return
+        if not self.use_api_cache: return
 
         params = {"url": item.get_url(), "limit": 15}
         headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}

From d90d3cec28d2424a7370d232f6445965507b5d92 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 11 Feb 2025 13:03:18 +0000
Subject: [PATCH 8/9] fix telethon_extractor setup

---
 .../modules/telethon_extractor/telethon_extractor.py           | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
index 21fc4dc..8088364 100644
--- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
+++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
@@ -18,12 +18,13 @@ class TelethonExtractor(Extractor):
     invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
 
 
-    def setup(self) -> None:
+    def setup(self, config: dict) -> None:
         """
         1. makes a copy of session_file that is removed in cleanup
         2. trigger login process for telegram or proceed if already saved in a session file
         3. joins channel_invites where needed
         """
+        super().setup(config)
         logger.info(f"SETUP {self.name} checking login...")
 
         # make a copy of the session that is used exclusively with this archiver instance

From 977618b4ceeb8c02d5a561905cf37f3391c3db3e Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 11 Feb 2025 13:04:59 +0000
Subject: [PATCH 9/9] doc: adds note about telethon vs telegram extractors

---
 src/auto_archiver/modules/telegram_extractor/__manifest__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/auto_archiver/modules/telegram_extractor/__manifest__.py b/src/auto_archiver/modules/telegram_extractor/__manifest__.py
index e1c49c2..cb0ee1e 100644
--- a/src/auto_archiver/modules/telegram_extractor/__manifest__.py
+++ b/src/auto_archiver/modules/telegram_extractor/__manifest__.py
@@ -13,7 +13,7 @@
         The `TelegramExtractor` retrieves publicly available media content from Telegram message links without requiring login credentials. 
         It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata` 
         and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver` 
-        is advised for more comprehensive functionality.
+        is advised for more comprehensive functionality and higher-quality media extraction.
         
         ### Features
 - Extracts images and videos from public Telegram message links (`t.me`).