removes rearchiving logic

2026-06-12 05:08:28 +03:00 · 2023-07-27 20:14:50 +01:00
parent 1e66a2c905
commit 3dd3775cbd
8 changed files with 6 additions and 33 deletions
--- a/src/auto_archiver/archivers/archiver.py
+++ b/src/auto_archiver/archivers/archiver.py
@@ -27,11 +27,6 @@ class Archiver(Step):
        # used to clean unnecessary URL parameters OR unfurl redirect links
        return url
    def is_rearchivable(self, url: str) -> bool:
        """
        Tells the caller whether it makes sense to archive `url` again.

        Archivers can override this to signal that rearchiving a piece of
        content is pointless; this default always allows rearchiving.
        """
        return True
    def _guess_file_type(self, path: str) -> str:
        """
        Receives a URL or filename and returns global mimetype like 'image' or 'video'
--- a/src/auto_archiver/archivers/telegram_archiver.py
+++ b/src/auto_archiver/archivers/telegram_archiver.py
@@ -19,10 +19,6 @@ class TelegramArchiver(Archiver):
    def configs() -> dict:
        return {}
    def is_rearchivable(self, url: str) -> bool:
        """
        Reports that `url` should not be archived again: telegram posts are
        treated as static, so the answer is always False.
        """
        return False
    def download(self, item: Metadata) -> Metadata:
        url = item.get_url()
        # detect URLs that we definitely cannot handle
--- a/src/auto_archiver/archivers/telethon_archiver.py
+++ b/src/auto_archiver/archivers/telethon_archiver.py
@@ -38,10 +38,6 @@ class TelethonArchiver(Archiver):
            }
        }
    def is_rearchivable(self, url: str) -> bool:
        """
        Always answers False, whatever `url` is, because telegram posts are
        considered static and a second archiving pass adds nothing.
        """
        return False
    def setup(self) -> None:
        """
        1. trigger login process for telegram or proceed if already saved in a session file
--- a/src/auto_archiver/archivers/tiktok_archiver.py
+++ b/src/auto_archiver/archivers/tiktok_archiver.py
@@ -16,10 +16,6 @@ class TiktokArchiver(Archiver):
    def configs() -> dict:
        return {}
    def is_rearchivable(self, url: str) -> bool:
        """
        Signals that rearchiving is not worthwhile for this archiver: TikTok
        posts are treated as static, so False is returned for every `url`.
        """
        return False
    def download(self, item: Metadata) -> Metadata:
        url = item.get_url()
        if 'tiktok.com' not in url:
--- a/src/auto_archiver/archivers/twitter_archiver.py
+++ b/src/auto_archiver/archivers/twitter_archiver.py
@@ -37,9 +37,8 @@ class TwitterArchiver(Archiver):
        # https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w
        return self.link_clean_pattern.sub("\\1", url)
-    def is_rearchivable(self, url: str) -> bool:
+    def best_quality_url(self, url: str) -> str:
-        # Twitter posts are static (for now)
+        return re.sub(r"name=(\w+)", "name=orig", url, 1)
        return False
    def download(self, item: Metadata) -> Metadata:
        """
@@ -78,7 +77,7 @@ class TwitterArchiver(Archiver):
                media.set("src", variant.url)
                mimetype = variant.contentType
            elif type(tweet_media) == Photo:
-                media.set("src", tweet_media.fullUrl.replace('name=large', 'name=orig').replace('name=small', 'name=orig'))
+                media.set("src", self.best_quality_url(tweet_media.fullUrl))
                mimetype = "image/jpeg"
            else:
                logger.warning(f"Could not get media URL of {tweet_media}")
@@ -118,6 +117,7 @@ class TwitterArchiver(Archiver):
        for i, u in enumerate(urls):
            media = Media(filename="")
            u = self.best_quality_url(u)
            media.set("src", u)
            ext = ""
            if (mtype := mimetypes.guess_type(UrlUtil.remove_get_parameters(u))[0]):
--- a/src/auto_archiver/archivers/vk_archiver.py
+++ b/src/auto_archiver/archivers/vk_archiver.py
@@ -27,10 +27,6 @@ class VkArchiver(Archiver):
            "session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
        }
    def is_rearchivable(self, url: str) -> bool:
        """
        Returns False for any `url`: VK content is regarded as static, so it
        is flagged as not worth archiving a second time.
        """
        return False
    def download(self, item: Metadata) -> Metadata:
        url = item.get_url()
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@@ -16,7 +16,6 @@ class Metadata:
    status: str = "no archiver"
    metadata: Dict[str, Any] = field(default_factory=dict)
    media: List[Media] = field(default_factory=list)
    rearchivable: bool = True  # defaults to true, archivers can overwrite
    def __post_init__(self):
        self.set("_processed_at", datetime.datetime.utcnow())
@@ -29,7 +28,6 @@ class Metadata:
        if overwrite_left:
            if right.status and len(right.status):
                self.status = right.status
            self.rearchivable |= right.rearchivable
            for k, v in right.metadata.items():
                assert k not in self.metadata or type(v) == type(self.get(k))
                if type(v) not in [dict, list, set] or k not in self.metadata:
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -62,11 +62,7 @@ class ArchivingOrchestrator:
        result.set_url(url)
        if original_url != url: result.set("original_url", original_url)
-        # 2 - rearchiving logic + notify start to DB
+        # 2 - notify start to DB
        # archivers can signal whether the content is rearchivable: eg: tweet vs webpage
        for a in self.archivers: result.rearchivable |= a.is_rearchivable(url)
        logger.debug(f"{result.rearchivable=} for {url=}")
        # signal to DB that archiving has started
        # and propagate already archived if it exists
        cached_result = None
@@ -78,7 +74,7 @@ class ArchivingOrchestrator:
            d.started(result)
            if (local_result := d.fetch(result)):
                cached_result = (cached_result or Metadata()).merge(local_result)
-        if cached_result and not cached_result.rearchivable:
+        if cached_result:
            logger.debug("Found previously archived entry")
            for d in self.databases:
                d.done(cached_result)