From 7b3a1468cd28808fd34ed002b27116b2c1b24f5b Mon Sep 17 00:00:00 2001
From: erinhmclark <erinhannahmary.clark@gmail.com>
Date: Tue, 21 Jan 2025 22:29:50 +0000
Subject: [PATCH] Create manifest files for archiver modules.

---
 src/auto_archiver/archivers/__init__.py       |   7 -
 .../archivers/youtubedl_archiver.py           |   2 -
 .../instagram_api_archiver/__init__.py        |   0
 .../instagram_api_archiver/__manifest__.py    |  30 ++
 .../instagram_api_archiver.py                 | 289 ++++++++++++------
 .../modules/instagram_archiver/__init__.py    |   0
 .../instagram_archiver/__manifest__.py        |  33 ++
 .../instagram_archiver}/instagram_archiver.py |   6 +-
 .../instagram_tbot_archiver/__init__.py       |   0
 .../instagram_tbot_archiver/__manifest__.py   |  35 +++
 .../instagram_tbot_archiver.py                |  15 +-
 .../modules/telegram_archiver/__init__.py     |   0
 .../modules/telegram_archiver/__manifest__.py |  26 ++
 .../telegram_archiver}/telegram_archiver.py   |   7 +-
 .../modules/telethon_archiver/__init__.py     |   0
 .../modules/telethon_archiver/__manifest__.py |  48 +++
 .../telethon_archiver}/telethon_archiver.py   |   6 +-
 .../modules/twitter_api_archiver/__init__.py  |   0
 .../twitter_api_archiver/__manifest__.py      |  45 +++
 .../twitter_api_archiver.py                   |   4 +-
 .../modules/vk_archiver/__init__.py           |   0
 .../modules/vk_archiver/__manifest__.py       |  37 +++
 .../vk_archiver}/vk_archiver.py               |   6 +-
 23 files changed, 467 insertions(+), 129 deletions(-)
 delete mode 100644 src/auto_archiver/archivers/youtubedl_archiver.py
 create mode 100644 src/auto_archiver/modules/instagram_api_archiver/__init__.py
 create mode 100644 src/auto_archiver/modules/instagram_api_archiver/__manifest__.py
 rename src/auto_archiver/{archivers => modules/instagram_api_archiver}/instagram_api_archiver.py (59%)
 create mode 100644 src/auto_archiver/modules/instagram_archiver/__init__.py
 create mode 100644 src/auto_archiver/modules/instagram_archiver/__manifest__.py
 rename src/auto_archiver/{archivers => modules/instagram_archiver}/instagram_archiver.py (98%)
 create mode 100644 src/auto_archiver/modules/instagram_tbot_archiver/__init__.py
 create mode 100644 src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py
 rename src/auto_archiver/{archivers => modules/instagram_tbot_archiver}/instagram_tbot_archiver.py (96%)
 create mode 100644 src/auto_archiver/modules/telegram_archiver/__init__.py
 create mode 100644 src/auto_archiver/modules/telegram_archiver/__manifest__.py
 rename src/auto_archiver/{archivers => modules/telegram_archiver}/telegram_archiver.py (92%)
 create mode 100644 src/auto_archiver/modules/telethon_archiver/__init__.py
 create mode 100644 src/auto_archiver/modules/telethon_archiver/__manifest__.py
 rename src/auto_archiver/{archivers => modules/telethon_archiver}/telethon_archiver.py (98%)
 create mode 100644 src/auto_archiver/modules/twitter_api_archiver/__init__.py
 create mode 100644 src/auto_archiver/modules/twitter_api_archiver/__manifest__.py
 rename src/auto_archiver/{archivers => modules/twitter_api_archiver}/twitter_api_archiver.py (98%)
 create mode 100644 src/auto_archiver/modules/vk_archiver/__init__.py
 create mode 100644 src/auto_archiver/modules/vk_archiver/__manifest__.py
 rename src/auto_archiver/{archivers => modules/vk_archiver}/vk_archiver.py (91%)

diff --git a/src/auto_archiver/archivers/__init__.py b/src/auto_archiver/archivers/__init__.py
index 7519a8e..54515ec 100644
--- a/src/auto_archiver/archivers/__init__.py
+++ b/src/auto_archiver/archivers/__init__.py
@@ -6,10 +6,3 @@ collect and preserve a variety of content types, such as posts, images, videos a
 
 """
 from .archiver import Archiver
-from .telethon_archiver import TelethonArchiver
-from .twitter_api_archiver import TwitterApiArchiver
-from .instagram_archiver import InstagramArchiver
-from .instagram_tbot_archiver import InstagramTbotArchiver
-from .telegram_archiver import TelegramArchiver
-from .vk_archiver import VkArchiver
-from .instagram_api_archiver import InstagramAPIArchiver
diff --git a/src/auto_archiver/archivers/youtubedl_archiver.py b/src/auto_archiver/archivers/youtubedl_archiver.py
deleted file mode 100644
index 8b61974..0000000
--- a/src/auto_archiver/archivers/youtubedl_archiver.py
+++ /dev/null
@@ -1,2 +0,0 @@
-# temporary hack, as we implement module
-from .generic_archiver.generic_archiver import GenericArchiver as YoutubeDLArchiver
diff --git a/src/auto_archiver/modules/instagram_api_archiver/__init__.py b/src/auto_archiver/modules/instagram_api_archiver/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py
new file mode 100644
index 0000000..2bb3f67
--- /dev/null
+++ b/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py
@@ -0,0 +1,30 @@
+{
+    "name": "Instagram API Archiver",
+    "type": ["extractor"],
+    "entry_point": "instagram_api_archiver:InstagramAPIArchiver",  # must match the class name in instagram_api_archiver.py
+    "depends": ["core"],
+    "external_dependencies":
+        {"python": ["requests",
+                    "loguru",
+                    "retrying",
+                    "tqdm",],
+         },
+    "no_setup_required": False,
+    "configs": {
+        "access_token": {"default": None, "help": "a valid instagrapi-api token"},
+        "api_endpoint": {"default": None, "help": "API endpoint to use"},
+        "full_profile": {
+            "default": False,
+            "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.",
+        },
+        "full_profile_max_posts": {
+            "default": 0,
+            "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights",
+        },
+        "minimize_json_output": {
+            "default": True,
+            "help": "if true, will remove empty values from the json output",
+        },
+    },
+    "description": "",
+}
diff --git a/src/auto_archiver/archivers/instagram_api_archiver.py b/src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py
similarity index 59%
rename from src/auto_archiver/archivers/instagram_api_archiver.py
rename to src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py
index d0e7e87..cc6e074 100644
--- a/src/auto_archiver/archivers/instagram_api_archiver.py
+++ b/src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py
@@ -9,32 +9,38 @@ data, reducing JSON output size, and handling large profiles.
 """
 
 import re
-import requests
 from datetime import datetime
+
+import requests
 from loguru import logger
 from retrying import retry
 from tqdm import tqdm
 
-from . import Archiver
-from ..core import Metadata
-from ..core import Media
+from auto_archiver.archivers import Archiver
+from auto_archiver.core import Media
+from auto_archiver.core import Metadata
+
 
 class InstagramAPIArchiver(Archiver):
     """
     Uses an https://github.com/subzeroid/instagrapi API deployment to fetch instagram posts data
-    
+
     # TODO: improvement collect aggregates of locations[0].location and mentions for all posts
     """
+
     name = "instagram_api_archiver"
 
-    global_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?")
+    global_pattern = re.compile(
+        r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
+    )
 
     def __init__(self, config: dict) -> None:
         super().__init__(config)
         self.assert_valid_string("access_token")
         self.assert_valid_string("api_endpoint")
         self.full_profile_max_posts = int(self.full_profile_max_posts)
-        if self.api_endpoint[-1] == "/": self.api_endpoint = self.api_endpoint[:-1]
+        if self.api_endpoint[-1] == "/":
+            self.api_endpoint = self.api_endpoint[:-1]
 
         self.full_profile = bool(self.full_profile)
         self.minimize_json_output = bool(self.minimize_json_output)
@@ -44,52 +50,74 @@ class InstagramAPIArchiver(Archiver):
         return {
             "access_token": {"default": None, "help": "a valid instagrapi-api token"},
             "api_endpoint": {"default": None, "help": "API endpoint to use"},
-            "full_profile": {"default": False, "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information."},
-            "full_profile_max_posts": {"default": 0, "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights"},
-            "minimize_json_output": {"default": True, "help": "if true, will remove empty values from the json output"},
+            "full_profile": {
+                "default": False,
+                "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.",
+            },
+            "full_profile_max_posts": {
+                "default": 0,
+                "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights",
+            },
+            "minimize_json_output": {
+                "default": True,
+                "help": "if true, will remove empty values from the json output",
+            },
         }
-    
+
     def download(self, item: Metadata) -> Metadata:
         url = item.get_url()
 
-        url.replace("instagr.com", "instagram.com").replace("instagr.am", "instagram.com")
+        # str.replace returns a new string, so re-bind url to keep the result
+        url = url.replace("instagr.com", "instagram.com")
+        url = url.replace("instagr.am", "instagram.com")
         insta_matches = self.global_pattern.findall(url)
         logger.info(f"{insta_matches=}")
-        if not len(insta_matches) or len(insta_matches[0])!=3: return
-        if len(insta_matches) > 1: 
-            logger.warning(f"Multiple instagram matches found in {url=}, using the first one")
+        if not len(insta_matches) or len(insta_matches[0]) != 3:
+            return
+        if len(insta_matches) > 1:
+            logger.warning(
+                f"Multiple instagram matches found in {url=}, using the first one"
+            )
             return
         g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2]
-        if g1 == "": return self.download_profile(item, g2)
-        elif g1 == "p": return self.download_post(item, g2, context="post")
-        elif g1 == "reel": return self.download_post(item, g2, context="reel")
-        elif g1 == "stories/highlights": return self.download_highlights(item, g2)
-        elif g1 == "stories": 
-            if len(g3): return self.download_post(item, id=g3, context="story")
+        if g1 == "":
+            return self.download_profile(item, g2)
+        elif g1 == "p":
+            return self.download_post(item, g2, context="post")
+        elif g1 == "reel":
+            return self.download_post(item, g2, context="reel")
+        elif g1 == "stories/highlights":
+            return self.download_highlights(item, g2)
+        elif g1 == "stories":
+            if len(g3):
+                return self.download_post(item, id=g3, context="story")
             return self.download_stories(item, g2)
-        else: 
+        else:
             logger.warning(f"Unknown instagram regex group match {g1=} found in {url=}")
             return
-        
+
     @retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5)
     def call_api(self, path: str, params: dict) -> dict:
-        headers = {
-            "accept": "application/json",
-            "x-access-key": self.access_token
-        }
+        headers = {"accept": "application/json", "x-access-key": self.access_token}
         logger.debug(f"calling {self.api_endpoint}/{path} with {params=}")
-        return requests.get(f"{self.api_endpoint}/{path}", headers=headers, params=params).json()
+        return requests.get(
+            f"{self.api_endpoint}/{path}", headers=headers, params=params
+        ).json()
 
     def cleanup_dict(self, d: dict | list) -> dict:
         # repeats 3 times to remove nested empty values
-        if not self.minimize_json_output: return d
-        if type(d) == list: return [self.cleanup_dict(v) for v in d]
-        if type(d) != dict: return d
+        if not self.minimize_json_output:
+            return d
+        if type(d) == list:
+            return [self.cleanup_dict(v) for v in d]
+        if type(d) != dict:
+            return d
         return {
-                k: clean_v
-                for k, v in d.items() 
-                if (clean_v := self.cleanup_dict(v)) not in [0.0, 0, [], {}, "", None, "null"] and
-                k not in ["x", "y", "width", "height"]
+            k: clean_v
+            for k, v in d.items()
+            if (clean_v := self.cleanup_dict(v))
+            not in [0.0, 0, [], {}, "", None, "null"]
+            and k not in ["x", "y", "width", "height"]
         }
 
     def download_profile(self, result: Metadata, username: str) -> Metadata:
@@ -125,7 +153,9 @@ class InstagramAPIArchiver(Archiver):
             try:
                 self.download_all_tagged(result, user_id)
             except Exception as e:
-                result.append("errors", f"Error downloading tagged posts for {username}")
+                result.append(
+                    "errors", f"Error downloading tagged posts for {username}"
+                )
                 logger.error(f"Error downloading tagged posts for {username}: {e}")
 
             # download all highlights
@@ -135,26 +165,37 @@ class InstagramAPIArchiver(Archiver):
                 result.append("errors", f"Error downloading highlights for {username}")
                 logger.error(f"Error downloading highlights for {username}: {e}")
 
-
-        result.set_url(url) # reset as scrape_item modifies it
+        result.set_url(url)  # reset as scrape_item modifies it
         return result.success("insta profile")
 
     def download_all_highlights(self, result, username, user_id):
         count_highlights = 0
         highlights = self.call_api(f"v1/user/highlights", {"user_id": user_id})
         for h in highlights:
-            try: 
+            try:
                 h_info = self._download_highlights_reusable(result, h.get("pk"))
                 count_highlights += len(h_info.get("items", []))
             except Exception as e:
-                result.append("errors", f"Error downloading highlight id{h.get('pk')} for {username}")
-                logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}")
-            if self.full_profile_max_posts and count_highlights >= self.full_profile_max_posts:
-                logger.info(f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}")
+                result.append(
+                    "errors",
+                    f"Error downloading highlight id{h.get('pk')} for {username}",
+                )
+                logger.error(
+                    f"Error downloading highlight id{h.get('pk')} for {username}: {e}"
+                )
+            if (
+                self.full_profile_max_posts
+                and count_highlights >= self.full_profile_max_posts
+            ):
+                logger.info(
+                    f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}"
+                )
                 break
         result.set("#highlights", count_highlights)
 
-    def download_post(self, result: Metadata, code: str = None, id: str = None, context: str = None) -> Metadata:
+    def download_post(
+        self, result: Metadata, code: str = None, id: str = None, context: str = None
+    ) -> Metadata:
         if id:
             post = self.call_api(f"v1/media/by/id", {"id": id})
         else:
@@ -166,7 +207,8 @@ class InstagramAPIArchiver(Archiver):
 
         post = self.scrape_item(result, post, context)
 
-        if post.get("taken_at"): result.set_timestamp(post.get("taken_at"))
+        if post.get("taken_at"):
+            result.set_timestamp(post.get("taken_at"))
         return result.success(f"insta {context or 'post'}")
 
     def download_highlights(self, result: Metadata, id: str) -> Metadata:
@@ -175,96 +217,127 @@ class InstagramAPIArchiver(Archiver):
         del h_info["items"]
         result.set_title(h_info.get("title")).set("data", h_info).set("#reels", items)
         return result.success("insta highlights")
-    
-    def _download_highlights_reusable(self, result: Metadata, id: str) ->dict:
+
+    def _download_highlights_reusable(self, result: Metadata, id: str) -> dict:
         full_h = self.call_api(f"v2/highlight/by/id", {"id": id})
         h_info = full_h.get("response", {}).get("reels", {}).get(f"highlight:{id}")
         assert h_info, f"Highlight {id} not found: {full_h=}"
 
-        if cover_media := h_info.get("cover_media", {}).get("cropped_image_version", {}).get("url"):
+        if (
+            cover_media := h_info.get("cover_media", {})
+            .get("cropped_image_version", {})
+            .get("url")
+        ):
             filename = self.download_from_url(cover_media)
             result.add_media(Media(filename=filename), id=f"cover_media highlight {id}")
 
-        items = h_info.get("items", [])[::-1] # newest to oldest
+        items = h_info.get("items", [])[::-1]  # newest to oldest
         for h in tqdm(items, desc="downloading highlights", unit="highlight"):
-            try: self.scrape_item(result, h, "highlight")
+            try:
+                self.scrape_item(result, h, "highlight")
             except Exception as e:
                 result.append("errors", f"Error downloading highlight {h.get('id')}")
-                logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e}")
-        
+                logger.error(
+                    f"Error downloading highlight, skipping {h.get('id')}: {e}"
+                )
+
         return h_info
-  
+
     def download_stories(self, result: Metadata, username: str) -> Metadata:
         now = datetime.now().strftime("%Y-%m-%d_%H-%M")
         stories = self._download_stories_reusable(result, username)
-        if stories == []: return result.success("insta no story")
+        if stories == []:
+            return result.success("insta no story")
         result.set_title(f"stories {username} at {now}").set("#stories", len(stories))
         return result.success(f"insta stories {now}")
-    
+
     def _download_stories_reusable(self, result: Metadata, username: str) -> list[dict]:
         stories = self.call_api(f"v1/user/stories/by/username", {"username": username})
-        if not stories or not len(stories): return []
-        stories = stories[::-1] # newest to oldest
+        if not stories or not len(stories):
+            return []
+        stories = stories[::-1]  # newest to oldest
 
         for s in tqdm(stories, desc="downloading stories", unit="story"):
-            try: self.scrape_item(result, s, "story")
+            try:
+                self.scrape_item(result, s, "story")
             except Exception as e:
                 result.append("errors", f"Error downloading story {s.get('id')}")
                 logger.error(f"Error downloading story, skipping {s.get('id')}: {e}")
         return stories
-        
+
     def download_all_posts(self, result: Metadata, user_id: str):
         end_cursor = None
         pbar = tqdm(desc="downloading posts")
 
         post_count = 0
         while end_cursor != "":
-            posts = self.call_api(f"v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor})
-            if not len(posts) or not type(posts) == list or len(posts) != 2: break
+            posts = self.call_api(
+                f"v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor}
+            )
+            if not len(posts) or not type(posts) == list or len(posts) != 2:
+                break
             posts, end_cursor = posts[0], posts[1]
             logger.info(f"parsing {len(posts)} posts, next {end_cursor=}")
 
             for p in posts:
-                try: self.scrape_item(result, p, "post")
+                try:
+                    self.scrape_item(result, p, "post")
                 except Exception as e:
                     result.append("errors", f"Error downloading post {p.get('id')}")
                     logger.error(f"Error downloading post, skipping {p.get('id')}: {e}")
                 pbar.update(1)
-                post_count+=1
-            if self.full_profile_max_posts and post_count >= self.full_profile_max_posts:
-                logger.info(f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}")
+                post_count += 1
+            if (
+                self.full_profile_max_posts
+                and post_count >= self.full_profile_max_posts
+            ):
+                logger.info(
+                    f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}"
+                )
                 break
         result.set("#posts", post_count)
-        
+
     def download_all_tagged(self, result: Metadata, user_id: str):
         next_page_id = ""
         pbar = tqdm(desc="downloading tagged posts")
 
         tagged_count = 0
         while next_page_id != None:
-            resp = self.call_api(f"v2/user/tag/medias", {"user_id": user_id, "page_id": next_page_id})
+            resp = self.call_api(
+                f"v2/user/tag/medias", {"user_id": user_id, "page_id": next_page_id}
+            )
             posts = resp.get("response", {}).get("items", [])
-            if not len(posts): break
+            if not len(posts):
+                break
             next_page_id = resp.get("next_page_id")
-            
+
             logger.info(f"parsing {len(posts)} tagged posts, next {next_page_id=}")
 
             for p in posts:
-                try: self.scrape_item(result, p, "tagged")
+                try:
+                    self.scrape_item(result, p, "tagged")
                 except Exception as e:
-                    result.append("errors", f"Error downloading tagged post {p.get('id')}")
-                    logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e}")
+                    result.append(
+                        "errors", f"Error downloading tagged post {p.get('id')}"
+                    )
+                    logger.error(
+                        f"Error downloading tagged post, skipping {p.get('id')}: {e}"
+                    )
                 pbar.update(1)
-                tagged_count+=1
-            if self.full_profile_max_posts and tagged_count >= self.full_profile_max_posts:
-                logger.info(f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}")
+                tagged_count += 1
+            if (
+                self.full_profile_max_posts
+                and tagged_count >= self.full_profile_max_posts
+            ):
+                logger.info(
+                    f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}"
+                )
                 break
         result.set("#tagged", tagged_count)
 
+    ### reusable parsing utils below
 
-### reusable parsing utils below
-
-    def scrape_item(self, result:Metadata, item:dict, context:str=None) -> dict:
+    def scrape_item(self, result: Metadata, item: dict, context: str = None) -> dict:
         """
         receives a Metadata and an API dict response
         fetches the media and adds it to the Metadata
@@ -272,23 +345,25 @@ class InstagramAPIArchiver(Archiver):
         context can be used to give specific id prefixes to media
         """
         if "clips_metadata" in item:
-            if reusable_text := item.get("clips_metadata", {}).get("reusable_text_attribute_string"):
+            if reusable_text := item.get("clips_metadata", {}).get(
+                "reusable_text_attribute_string"
+            ):
                 item["clips_metadata_text"] = reusable_text
-            if self.minimize_json_output: 
+            if self.minimize_json_output:
                 del item["clips_metadata"]
 
-        if code := item.get("code") and not result.get("url"): 
+        if (code := item.get("code")) and not result.get("url"):
             result.set_url(f"https://www.instagram.com/p/{code}/")
-            
+
         resources = item.get("resources", item.get("carousel_media", []))
         item, media, media_id = self.scrape_media(item, context)
         # if resources are present take the main media from the first resource
         if not media and len(resources):
             _, media, media_id = self.scrape_media(resources[0], context)
             resources = resources[1:]
-        
+
         assert media, f"Image/video not found in {item=}"
-            
+
         # posts with multiple items contain a resources list
         resources_metadata = Metadata()
         for r in resources:
@@ -298,40 +373,54 @@ class InstagramAPIArchiver(Archiver):
 
         result.add_media(media, id=media_id)
         return item
-    
-    def scrape_media(self, item: dict, context:str) -> tuple[dict, Media, str]:
+
+    def scrape_media(self, item: dict, context: str) -> tuple[dict, Media, str]:
         # remove unnecessary info
-        if self.minimize_json_output: 
-            for k in ["image_versions", "video_versions", "video_dash_manifest", "image_versions2", "video_versions2"]:
-                if k in item: del item[k]
+        if self.minimize_json_output:
+            for k in [
+                "image_versions",
+                "video_versions",
+                "video_dash_manifest",
+                "image_versions2",
+                "video_versions2",
+            ]:
+                if k in item:
+                    del item[k]
         item = self.cleanup_dict(item)
 
         image_media = None
         if image_url := item.get("thumbnail_url"):
             filename = self.download_from_url(image_url, verbose=False)
             image_media = Media(filename=filename)
-            
+
         # retrieve video info
-        best_id = item.get('id', item.get('pk'))
+        best_id = item.get("id", item.get("pk"))
         taken_at = item.get("taken_at", item.get("taken_at_ts"))
         code = item.get("code")
         caption_text = item.get("caption_text")
-        if "carousel_media" in item: del item["carousel_media"]
+        if "carousel_media" in item:
+            del item["carousel_media"]
 
         if video_url := item.get("video_url"):
             filename = self.download_from_url(video_url, verbose=False)
             video_media = Media(filename=filename)
-            if taken_at: video_media.set("date", taken_at)
-            if code: video_media.set("url", f"https://www.instagram.com/p/{code}")
-            if caption_text: video_media.set("text", caption_text)
+            if taken_at:
+                video_media.set("date", taken_at)
+            if code:
+                video_media.set("url", f"https://www.instagram.com/p/{code}")
+            if caption_text:
+                video_media.set("text", caption_text)
             video_media.set("preview", [image_media])
             video_media.set("data", [item])
             return item, video_media, f"{context or 'video'} {best_id}"
         elif image_media:
-            if taken_at: image_media.set("date", taken_at)
-            if code: image_media.set("url", f"https://www.instagram.com/p/{code}")
-            if caption_text: image_media.set("text", caption_text)
+            if taken_at:
+                image_media.set("date", taken_at)
+            if code:
+                image_media.set("url", f"https://www.instagram.com/p/{code}")
+            if caption_text:
+                image_media.set("text", caption_text)
             image_media.set("data", [item])
             return item, image_media, f"{context or 'image'} {best_id}"
-        
-        return item, None, None
\ No newline at end of file
+
+        return item, None, None
diff --git a/src/auto_archiver/modules/instagram_archiver/__init__.py b/src/auto_archiver/modules/instagram_archiver/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/auto_archiver/modules/instagram_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_archiver/__manifest__.py
new file mode 100644
index 0000000..bd63ab4
--- /dev/null
+++ b/src/auto_archiver/modules/instagram_archiver/__manifest__.py
@@ -0,0 +1,33 @@
+{
+    "name": "Instagram Archiver",
+    "type": ["extractor"],
+    "entry_point": "instagram_archiver:InstagramArchiver",
+    "depends": ["core"],
+    "external_dependencies": {
+        "python": ["instaloader",
+                   "loguru",],
+    },
+    "no_setup_required": False,
+    "configs": {
+        "username": {"default": None, "help": "a valid Instagram username"},
+        "password": {
+            "default": None,
+            "help": "the corresponding Instagram account password",
+        },
+        "download_folder": {
+            "default": "instaloader",
+            "help": "name of a folder to temporarily download content to",
+        },
+        "session_file": {
+            "default": "secrets/instaloader.session",
+            "help": "path to the instagram session which saves session credentials",
+        },
+        # TODO: fine-grain
+        # "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
+    },
+    "description": """Uses the Instaloader library to download content from Instagram. This class handles both individual posts
+                    and user profiles, downloading as much information as possible, including images, videos, text, stories,
+                    highlights, and tagged posts. Authentication is required via username/password or a session file.
+                    
+                    """,
+}
diff --git a/src/auto_archiver/archivers/instagram_archiver.py b/src/auto_archiver/modules/instagram_archiver/instagram_archiver.py
similarity index 98%
rename from src/auto_archiver/archivers/instagram_archiver.py
rename to src/auto_archiver/modules/instagram_archiver/instagram_archiver.py
index 94a8fc0..4cf001d 100644
--- a/src/auto_archiver/archivers/instagram_archiver.py
+++ b/src/auto_archiver/modules/instagram_archiver/instagram_archiver.py
@@ -7,9 +7,9 @@ import re, os, shutil, traceback
 import instaloader  # https://instaloader.github.io/as-module.html
 from loguru import logger
 
-from . import Archiver
-from ..core import Metadata
-from ..core import Media
+from auto_archiver.archivers import Archiver
+from auto_archiver.core import Metadata
+from auto_archiver.core import Media
 
 class InstagramArchiver(Archiver):
     """
diff --git a/src/auto_archiver/modules/instagram_tbot_archiver/__init__.py b/src/auto_archiver/modules/instagram_tbot_archiver/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py
new file mode 100644
index 0000000..cadb729
--- /dev/null
+++ b/src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py
@@ -0,0 +1,35 @@
+{
+    "name": "Instagram Telegram Bot Archiver",
+    "type": ["extractor"],
+    "entry_point": "instagram_tbot_archiver:InstagramTbotArchiver",
+    "depends": ["core", "utils"],
+    "external_dependencies": {
+        "python": ["loguru", "telethon"],
+    },
+    "requires_setup": True,  # api_id / api_hash below default to None and must be supplied
+    "configs": {
+        "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
+        "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
+        "session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
+        "timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
+    },
+    "description": """
+The `InstagramTbotArchiver` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content,
+such as posts and stories. It leverages the Telethon library to interact with the Telegram API, sending Instagram URLs
+to the bot and downloading the resulting media and metadata. The downloaded content is stored as `Media` objects and
+returned as part of a `Metadata` object.
+
+### Features
+- Supports archiving Instagram posts and stories through the Telegram bot.
+- Downloads and saves media files (e.g., images, videos) in a temporary directory.
+- Captures and returns metadata, including titles and descriptions, as a `Metadata` object.
+- Automatically manages Telegram session files for secure access.
+
+### Setup
+
+To use the `InstagramTbotArchiver`, you need to provide the following configuration settings:
+- **API ID and Hash**: Telegram API credentials obtained from [my.telegram.org/apps](https://my.telegram.org/apps).
+- **Session File**: Optional path to store the Telegram session file for future use.
+
+    """,
+}
diff --git a/src/auto_archiver/archivers/instagram_tbot_archiver.py b/src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py
similarity index 96%
rename from src/auto_archiver/archivers/instagram_tbot_archiver.py
rename to src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py
index 01b1614..9fdc208 100644
--- a/src/auto_archiver/archivers/instagram_tbot_archiver.py
+++ b/src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py
@@ -7,14 +7,17 @@ relevant media and metadata. The fetched content is saved as `Media` objects in
 `Metadata` object.
 """
 
+import os
 import shutil
-from telethon.sync import TelegramClient
-from loguru import logger
-import time, os
+import time
 from sqlite3 import OperationalError
-from . import Archiver
-from ..core import Metadata, Media, ArchivingContext
-from ..utils import random_str
+
+from loguru import logger
+from telethon.sync import TelegramClient
+
+from auto_archiver.archivers import Archiver
+from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.utils import random_str
 
 
 class InstagramTbotArchiver(Archiver):
diff --git a/src/auto_archiver/modules/telegram_archiver/__init__.py b/src/auto_archiver/modules/telegram_archiver/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/auto_archiver/modules/telegram_archiver/__manifest__.py b/src/auto_archiver/modules/telegram_archiver/__manifest__.py
new file mode 100644
index 0000000..b56477a
--- /dev/null
+++ b/src/auto_archiver/modules/telegram_archiver/__manifest__.py
@@ -0,0 +1,26 @@
+{
+    "name": "Telegram Archiver",
+    "type": ["extractor"],
+    "entry_point": "telegram_archiver:TelegramArchiver",
+    "requires_setup": False,  # works on public t.me links, no credentials needed
+    "depends": ["core"],
+    "external_dependencies": {
+        "python": [
+            "requests",
+            "bs4",
+            "loguru",
+        ],
+    },
+    "description": """
+The `TelegramArchiver` retrieves publicly available media content from Telegram message links without requiring login credentials.
+It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata`
+and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver`
+is advised for more comprehensive functionality.
+
+### Features
+- Extracts images and videos from public Telegram message links (`t.me`).
+- Processes HTML content of messages to retrieve embedded media.
+- Sets structured metadata, including timestamps, content, and media details.
+- Does not require user authentication for Telegram.
+    """,
+}
diff --git a/src/auto_archiver/archivers/telegram_archiver.py b/src/auto_archiver/modules/telegram_archiver/telegram_archiver.py
similarity index 92%
rename from src/auto_archiver/archivers/telegram_archiver.py
rename to src/auto_archiver/modules/telegram_archiver/telegram_archiver.py
index ed57927..c793095 100644
--- a/src/auto_archiver/archivers/telegram_archiver.py
+++ b/src/auto_archiver/modules/telegram_archiver/telegram_archiver.py
@@ -2,13 +2,14 @@ import requests, re, html
 from bs4 import BeautifulSoup
 from loguru import logger
 
-from . import Archiver
-from ..core import Metadata, Media
+from auto_archiver.archivers import Archiver
+from auto_archiver.core import Metadata, Media
 
 
 class TelegramArchiver(Archiver):
     """
-    Archiver for telegram that does not require login, but the telethon_archiver is much more advised, will only return if at least one image or one video is found
+    Archiver for telegram that does not require login, but the telethon_archiver is much more advised,
+    will only return if at least one image or one video is found
     """
     name = "telegram_archiver"
 
diff --git a/src/auto_archiver/modules/telethon_archiver/__init__.py b/src/auto_archiver/modules/telethon_archiver/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/auto_archiver/modules/telethon_archiver/__manifest__.py b/src/auto_archiver/modules/telethon_archiver/__manifest__.py
new file mode 100644
index 0000000..82d56ba
--- /dev/null
+++ b/src/auto_archiver/modules/telethon_archiver/__manifest__.py
@@ -0,0 +1,48 @@
+# TODO: remove the dependency on `json` (only the disabled `cli_set` hook of channel_invites needs it).
+{
+    "name": "Telethon Archiver",
+    "type": ["extractor"],
+    "entry_point": "telethon_archiver:TelethonArchiver",
+    "requires_setup": True,
+    "depends": ["core", "utils"],  # telethon_archiver.py imports from auto_archiver.core and auto_archiver.utils
+    "external_dependencies": {
+        "python": ["telethon",
+                   "loguru",
+                   "tqdm",
+                   ],
+        "bin": [],  # no external binaries required
+    },
+    "configs": {
+        "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"},
+        "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"},
+        "bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"},
+        "session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."},
+        "join_channels": {"default": True, "help": "whether to join the channels from the channel_invites config during the initial setup, set to false to skip that step if you have a lot of invites and setup gets stuck"},
+        "channel_invites": {
+            "default": {},
+            "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup",
+            # TODO: restore this CLI setter (merges the JSON value given on the CLI into the current dict); it needs `json`.
+            # "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
+        },
+    },
+    "description": """
+The `TelethonArchiver` uses the Telethon library to archive posts and media from Telegram channels and groups.
+It supports private and public channels, downloading grouped posts with media, and can join channels using invite links
+if provided in the configuration.
+
+### Features
+- Fetches posts and metadata from Telegram channels and groups, including private channels.
+- Downloads media attachments (e.g., images, videos, audio) from individual posts or grouped posts.
+- Handles channel invites to join channels dynamically during setup.
+- Utilizes Telethon's capabilities for reliable Telegram interactions.
+- Outputs structured metadata and media using `Metadata` and `Media` objects.
+
+### Setup
+To use the `TelethonArchiver`, you must configure the following:
+- **API ID and API Hash**: Obtain these from [my.telegram.org](https://my.telegram.org/apps).
+- **Session File**: Optional, but records login sessions for future use (default: `secrets/anon.session`).
+- **Bot Token**: Optional, allows access to additional content (e.g., large videos) but limits private channel archiving.
+- **Channel Invites**: Optional, specify a JSON string of invite links to join channels during setup.
+
+""",
+}
diff --git a/src/auto_archiver/archivers/telethon_archiver.py b/src/auto_archiver/modules/telethon_archiver/telethon_archiver.py
similarity index 98%
rename from src/auto_archiver/archivers/telethon_archiver.py
rename to src/auto_archiver/modules/telethon_archiver/telethon_archiver.py
index 2e2305d..89668f3 100644
--- a/src/auto_archiver/archivers/telethon_archiver.py
+++ b/src/auto_archiver/modules/telethon_archiver/telethon_archiver.py
@@ -8,9 +8,9 @@ from loguru import logger
 from tqdm import tqdm
 import re, time, json, os
 
-from . import Archiver
-from ..core import Metadata, Media, ArchivingContext
-from ..utils import random_str
+from auto_archiver.archivers import Archiver
+from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.utils import random_str
 
 
 class TelethonArchiver(Archiver):
diff --git a/src/auto_archiver/modules/twitter_api_archiver/__init__.py b/src/auto_archiver/modules/twitter_api_archiver/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py b/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py
new file mode 100644
index 0000000..f4eb2b9
--- /dev/null
+++ b/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py
@@ -0,0 +1,45 @@
+{
+    "name": "Twitter API Archiver",
+    "type": ["extractor"],
+    "entry_point": "twitter_api_archiver:TwitterApiArchiver",
+    "requires_setup": True,
+    "depends": ["core"],
+    "external_dependencies": {
+        "python": ["requests",
+                   "loguru",
+                   "pytwitter",
+                   "slugify",],
+        "bin": [],  # no external binaries required
+    },
+    "configs": {
+        "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
+        # cli_set: a comma-separated CLI value is split into a de-duplicated list of tokens.
+        "bearer_tokens": {"default": [], "help": "a list of twitter API bearer tokens, which is enough for archiving; if not provided you will need consumer_key, consumer_secret, access_token, access_secret; if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line", "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))},
+        "consumer_key": {"default": None, "help": "twitter API consumer_key"},
+        "consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
+        "access_token": {"default": None, "help": "twitter API access_token"},
+        "access_secret": {"default": None, "help": "twitter API access_secret"},
+    },
+    "description": """
+        The `TwitterApiArchiver` fetches tweets and associated media using the Twitter API.
+        It supports multiple API configurations for extended rate limits and reliable access.
+        Features include URL expansion, media downloads (e.g., images, videos), and structured output
+        via `Metadata` and `Media` objects. Requires Twitter API credentials such as bearer tokens
+        or consumer key/secret and access token/secret.
+
+        ### Features
+        - Fetches tweets and their metadata, including text, creation timestamp, and author information.
+        - Downloads media attachments (e.g., images, videos) in high quality.
+        - Supports multiple API configurations for improved rate limiting.
+        - Expands shortened URLs (e.g., `t.co` links).
+        - Outputs structured metadata and media using `Metadata` and `Media` objects.
+
+        ### Setup
+        To use the `TwitterApiArchiver`, you must provide valid Twitter API credentials via configuration:
+        - **Bearer Token(s)**: A single token or a list for rate-limited API access.
+        - **Consumer Key and Secret**: Required for user-authenticated API access.
+        - **Access Token and Secret**: Complements the consumer key for enhanced API capabilities.
+
+        Credentials can be obtained by creating a Twitter developer account at [Twitter Developer Platform](https://developer.twitter.com/en).
+        """,
+}
diff --git a/src/auto_archiver/archivers/twitter_api_archiver.py b/src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py
similarity index 98%
rename from src/auto_archiver/archivers/twitter_api_archiver.py
rename to src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py
index d1e4dee..eb607cc 100644
--- a/src/auto_archiver/archivers/twitter_api_archiver.py
+++ b/src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py
@@ -8,8 +8,8 @@ from loguru import logger
 from pytwitter import Api
 from slugify import slugify
 
-from . import Archiver
-from ..core import Metadata,Media
+from auto_archiver.archivers import Archiver
+from auto_archiver.core import Metadata,Media
 
 class TwitterApiArchiver(Archiver):
     name = "twitter_api_archiver"
diff --git a/src/auto_archiver/modules/vk_archiver/__init__.py b/src/auto_archiver/modules/vk_archiver/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/auto_archiver/modules/vk_archiver/__manifest__.py b/src/auto_archiver/modules/vk_archiver/__manifest__.py
new file mode 100644
index 0000000..69bf162
--- /dev/null
+++ b/src/auto_archiver/modules/vk_archiver/__manifest__.py
@@ -0,0 +1,37 @@
+{
+    "name": "VKontakte Archiver",
+    "type": ["extractor"],
+    "entry_point": "vk_archiver:VkArchiver",  # must match the class name defined in vk_archiver.py
+    "requires_setup": True,
+    "depends": ["core", "utils"],
+    "external_dependencies": {
+        "python": ["loguru",
+                   "vk_url_scraper"],
+    },
+    "configs": {
+        "username": {"default": None, "help": "valid VKontakte username"},
+        "password": {"default": None, "help": "valid VKontakte password"},
+        "session_file": {"default": "secrets/vk_config.v2.json", "help": "path to the VKontakte session file (JSON) used to persist the login between runs"},
+    },
+    "description": """
+The `VkArchiver` fetches posts, text, and images from VK (VKontakte) social media pages.
+This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract
+and download content. Note that VK videos are handled separately by the `YTDownloader`.
+
+### Features
+- Extracts text, timestamps, and metadata from VK `/wall` posts.
+- Downloads associated images and attaches them to the resulting `Metadata` object.
+- Processes multiple segments of VK URLs that contain mixed content (e.g., wall, photo).
+- Outputs structured metadata and media using `Metadata` and `Media` objects.
+
+### Setup
+To use the `VkArchiver`, you must provide valid VKontakte login credentials and session information:
+- **Username**: A valid VKontakte account username.
+- **Password**: The corresponding password for the VKontakte account.
+- **Session File**: Optional. Path to a session configuration file (`.json`) for persistent VK login.
+
+Credentials can be set in the configuration file or directly via environment variables. Ensure you
+have access to the VKontakte API by creating an account at [VKontakte](https://vk.com/).
+"""
+,
+}
diff --git a/src/auto_archiver/archivers/vk_archiver.py b/src/auto_archiver/modules/vk_archiver/vk_archiver.py
similarity index 91%
rename from src/auto_archiver/archivers/vk_archiver.py
rename to src/auto_archiver/modules/vk_archiver/vk_archiver.py
index f8bb60a..3cfb446 100644
--- a/src/auto_archiver/archivers/vk_archiver.py
+++ b/src/auto_archiver/modules/vk_archiver/vk_archiver.py
@@ -1,9 +1,9 @@
 from loguru import logger
 from vk_url_scraper import VkScraper
 
-from ..utils.misc import dump_payload
-from . import Archiver
-from ..core import Metadata, Media, ArchivingContext
+from auto_archiver.utils.misc import dump_payload
+from auto_archiver.archivers import Archiver
+from auto_archiver.core import Metadata, Media, ArchivingContext
 
 
 class VkArchiver(Archiver):