From 7b3a1468cd28808fd34ed002b27116b2c1b24f5b Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Tue, 21 Jan 2025 22:29:50 +0000 Subject: [PATCH] Create manifest files for archiver modules. --- src/auto_archiver/archivers/__init__.py | 7 - .../archivers/youtubedl_archiver.py | 2 - .../instagram_api_archiver/__init__.py | 0 .../instagram_api_archiver/__manifest__.py | 30 ++ .../instagram_api_archiver.py | 289 ++++++++++++------ .../modules/instagram_archiver/__init__.py | 0 .../instagram_archiver/__manifest__.py | 33 ++ .../instagram_archiver}/instagram_archiver.py | 6 +- .../instagram_tbot_archiver/__init__.py | 0 .../instagram_tbot_archiver/__manifest__.py | 35 +++ .../instagram_tbot_archiver.py | 15 +- .../modules/telegram_archiver/__init__.py | 0 .../modules/telegram_archiver/__manifest__.py | 26 ++ .../telegram_archiver}/telegram_archiver.py | 7 +- .../modules/telethon_archiver/__init__.py | 0 .../modules/telethon_archiver/__manifest__.py | 48 +++ .../telethon_archiver}/telethon_archiver.py | 6 +- .../modules/twitter_api_archiver/__init__.py | 0 .../twitter_api_archiver/__manifest__.py | 45 +++ .../twitter_api_archiver.py | 4 +- .../modules/vk_archiver/__init__.py | 0 .../modules/vk_archiver/__manifest__.py | 37 +++ .../vk_archiver}/vk_archiver.py | 6 +- 23 files changed, 467 insertions(+), 129 deletions(-) delete mode 100644 src/auto_archiver/archivers/youtubedl_archiver.py create mode 100644 src/auto_archiver/modules/instagram_api_archiver/__init__.py create mode 100644 src/auto_archiver/modules/instagram_api_archiver/__manifest__.py rename src/auto_archiver/{archivers => modules/instagram_api_archiver}/instagram_api_archiver.py (59%) create mode 100644 src/auto_archiver/modules/instagram_archiver/__init__.py create mode 100644 src/auto_archiver/modules/instagram_archiver/__manifest__.py rename src/auto_archiver/{archivers => modules/instagram_archiver}/instagram_archiver.py (98%) create mode 100644 src/auto_archiver/modules/instagram_tbot_archiver/__init__.py 
create mode 100644 src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py rename src/auto_archiver/{archivers => modules/instagram_tbot_archiver}/instagram_tbot_archiver.py (96%) create mode 100644 src/auto_archiver/modules/telegram_archiver/__init__.py create mode 100644 src/auto_archiver/modules/telegram_archiver/__manifest__.py rename src/auto_archiver/{archivers => modules/telegram_archiver}/telegram_archiver.py (92%) create mode 100644 src/auto_archiver/modules/telethon_archiver/__init__.py create mode 100644 src/auto_archiver/modules/telethon_archiver/__manifest__.py rename src/auto_archiver/{archivers => modules/telethon_archiver}/telethon_archiver.py (98%) create mode 100644 src/auto_archiver/modules/twitter_api_archiver/__init__.py create mode 100644 src/auto_archiver/modules/twitter_api_archiver/__manifest__.py rename src/auto_archiver/{archivers => modules/twitter_api_archiver}/twitter_api_archiver.py (98%) create mode 100644 src/auto_archiver/modules/vk_archiver/__init__.py create mode 100644 src/auto_archiver/modules/vk_archiver/__manifest__.py rename src/auto_archiver/{archivers => modules/vk_archiver}/vk_archiver.py (91%) diff --git a/src/auto_archiver/archivers/__init__.py b/src/auto_archiver/archivers/__init__.py index 7519a8e..54515ec 100644 --- a/src/auto_archiver/archivers/__init__.py +++ b/src/auto_archiver/archivers/__init__.py @@ -6,10 +6,3 @@ collect and preserve a variety of content types, such as posts, images, videos a """ from .archiver import Archiver -from .telethon_archiver import TelethonArchiver -from .twitter_api_archiver import TwitterApiArchiver -from .instagram_archiver import InstagramArchiver -from .instagram_tbot_archiver import InstagramTbotArchiver -from .telegram_archiver import TelegramArchiver -from .vk_archiver import VkArchiver -from .instagram_api_archiver import InstagramAPIArchiver diff --git a/src/auto_archiver/archivers/youtubedl_archiver.py b/src/auto_archiver/archivers/youtubedl_archiver.py deleted 
file mode 100644 index 8b61974..0000000 --- a/src/auto_archiver/archivers/youtubedl_archiver.py +++ /dev/null @@ -1,2 +0,0 @@ -# temporary hack, as we implement module -from .generic_archiver.generic_archiver import GenericArchiver as YoutubeDLArchiver diff --git a/src/auto_archiver/modules/instagram_api_archiver/__init__.py b/src/auto_archiver/modules/instagram_api_archiver/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py new file mode 100644 index 0000000..2bb3f67 --- /dev/null +++ b/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py @@ -0,0 +1,30 @@ +{ + "name": "Instagram API Archiver", + "type": ["extractor"], + "entry_point": "instagram_api_archiver:InstagramApiArchiver", + "depends": ["core"], + "external_dependencies": + {"python": ["requests", + "loguru", + "retrying", + "tqdm",], + }, + "no_setup_required": False, + "configs": { + "access_token": {"default": None, "help": "a valid instagrapi-api token"}, + "api_endpoint": {"default": None, "help": "API endpoint to use"}, + "full_profile": { + "default": False, + "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.", + }, + "full_profile_max_posts": { + "default": 0, + "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. 
limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights", + }, + "minimize_json_output": { + "default": True, + "help": "if true, will remove empty values from the json output", + }, + }, + "description": "", +} diff --git a/src/auto_archiver/archivers/instagram_api_archiver.py b/src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py similarity index 59% rename from src/auto_archiver/archivers/instagram_api_archiver.py rename to src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py index d0e7e87..cc6e074 100644 --- a/src/auto_archiver/archivers/instagram_api_archiver.py +++ b/src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py @@ -9,32 +9,38 @@ data, reducing JSON output size, and handling large profiles. """ import re -import requests from datetime import datetime + +import requests from loguru import logger from retrying import retry from tqdm import tqdm -from . import Archiver -from ..core import Metadata -from ..core import Media +from auto_archiver.archivers import Archiver +from auto_archiver.core import Media +from auto_archiver.core import Metadata + class InstagramAPIArchiver(Archiver): """ Uses an https://github.com/subzeroid/instagrapi API deployment to fetch instagram posts data - + # TODO: improvement collect aggregates of locations[0].location and mentions for all posts """ + name = "instagram_api_archiver" - global_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?") + global_pattern = re.compile( + r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?" 
+ ) def __init__(self, config: dict) -> None: super().__init__(config) self.assert_valid_string("access_token") self.assert_valid_string("api_endpoint") self.full_profile_max_posts = int(self.full_profile_max_posts) - if self.api_endpoint[-1] == "/": self.api_endpoint = self.api_endpoint[:-1] + if self.api_endpoint[-1] == "/": + self.api_endpoint = self.api_endpoint[:-1] self.full_profile = bool(self.full_profile) self.minimize_json_output = bool(self.minimize_json_output) @@ -44,52 +50,74 @@ class InstagramAPIArchiver(Archiver): return { "access_token": {"default": None, "help": "a valid instagrapi-api token"}, "api_endpoint": {"default": None, "help": "API endpoint to use"}, - "full_profile": {"default": False, "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information."}, - "full_profile_max_posts": {"default": 0, "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights"}, - "minimize_json_output": {"default": True, "help": "if true, will remove empty values from the json output"}, + "full_profile": { + "default": False, + "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information.", + }, + "full_profile_max_posts": { + "default": 0, + "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. 
limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights", + }, + "minimize_json_output": { + "default": True, + "help": "if true, will remove empty values from the json output", + }, } - + def download(self, item: Metadata) -> Metadata: url = item.get_url() - url.replace("instagr.com", "instagram.com").replace("instagr.am", "instagram.com") + url.replace("instagr.com", "instagram.com").replace( + "instagr.am", "instagram.com" + ) insta_matches = self.global_pattern.findall(url) logger.info(f"{insta_matches=}") - if not len(insta_matches) or len(insta_matches[0])!=3: return - if len(insta_matches) > 1: - logger.warning(f"Multiple instagram matches found in {url=}, using the first one") + if not len(insta_matches) or len(insta_matches[0]) != 3: + return + if len(insta_matches) > 1: + logger.warning( + f"Multiple instagram matches found in {url=}, using the first one" + ) return g1, g2, g3 = insta_matches[0][0], insta_matches[0][1], insta_matches[0][2] - if g1 == "": return self.download_profile(item, g2) - elif g1 == "p": return self.download_post(item, g2, context="post") - elif g1 == "reel": return self.download_post(item, g2, context="reel") - elif g1 == "stories/highlights": return self.download_highlights(item, g2) - elif g1 == "stories": - if len(g3): return self.download_post(item, id=g3, context="story") + if g1 == "": + return self.download_profile(item, g2) + elif g1 == "p": + return self.download_post(item, g2, context="post") + elif g1 == "reel": + return self.download_post(item, g2, context="reel") + elif g1 == "stories/highlights": + return self.download_highlights(item, g2) + elif g1 == "stories": + if len(g3): + return self.download_post(item, id=g3, context="story") return self.download_stories(item, g2) - else: + else: logger.warning(f"Unknown instagram regex group match {g1=} found in {url=}") return - + @retry(wait_random_min=1000, wait_random_max=3000, stop_max_attempt_number=5) def call_api(self, 
path: str, params: dict) -> dict: - headers = { - "accept": "application/json", - "x-access-key": self.access_token - } + headers = {"accept": "application/json", "x-access-key": self.access_token} logger.debug(f"calling {self.api_endpoint}/{path} with {params=}") - return requests.get(f"{self.api_endpoint}/{path}", headers=headers, params=params).json() + return requests.get( + f"{self.api_endpoint}/{path}", headers=headers, params=params + ).json() def cleanup_dict(self, d: dict | list) -> dict: # repeats 3 times to remove nested empty values - if not self.minimize_json_output: return d - if type(d) == list: return [self.cleanup_dict(v) for v in d] - if type(d) != dict: return d + if not self.minimize_json_output: + return d + if type(d) == list: + return [self.cleanup_dict(v) for v in d] + if type(d) != dict: + return d return { - k: clean_v - for k, v in d.items() - if (clean_v := self.cleanup_dict(v)) not in [0.0, 0, [], {}, "", None, "null"] and - k not in ["x", "y", "width", "height"] + k: clean_v + for k, v in d.items() + if (clean_v := self.cleanup_dict(v)) + not in [0.0, 0, [], {}, "", None, "null"] + and k not in ["x", "y", "width", "height"] } def download_profile(self, result: Metadata, username: str) -> Metadata: @@ -125,7 +153,9 @@ class InstagramAPIArchiver(Archiver): try: self.download_all_tagged(result, user_id) except Exception as e: - result.append("errors", f"Error downloading tagged posts for {username}") + result.append( + "errors", f"Error downloading tagged posts for {username}" + ) logger.error(f"Error downloading tagged posts for {username}: {e}") # download all highlights @@ -135,26 +165,37 @@ class InstagramAPIArchiver(Archiver): result.append("errors", f"Error downloading highlights for {username}") logger.error(f"Error downloading highlights for {username}: {e}") - - result.set_url(url) # reset as scrape_item modifies it + result.set_url(url) # reset as scrape_item modifies it return result.success("insta profile") def 
download_all_highlights(self, result, username, user_id): count_highlights = 0 highlights = self.call_api(f"v1/user/highlights", {"user_id": user_id}) for h in highlights: - try: + try: h_info = self._download_highlights_reusable(result, h.get("pk")) count_highlights += len(h_info.get("items", [])) except Exception as e: - result.append("errors", f"Error downloading highlight id{h.get('pk')} for {username}") - logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}") - if self.full_profile_max_posts and count_highlights >= self.full_profile_max_posts: - logger.info(f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}") + result.append( + "errors", + f"Error downloading highlight id{h.get('pk')} for {username}", + ) + logger.error( + f"Error downloading highlight id{h.get('pk')} for {username}: {e}" + ) + if ( + self.full_profile_max_posts + and count_highlights >= self.full_profile_max_posts + ): + logger.info( + f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}" + ) break result.set("#highlights", count_highlights) - def download_post(self, result: Metadata, code: str = None, id: str = None, context: str = None) -> Metadata: + def download_post( + self, result: Metadata, code: str = None, id: str = None, context: str = None + ) -> Metadata: if id: post = self.call_api(f"v1/media/by/id", {"id": id}) else: @@ -166,7 +207,8 @@ class InstagramAPIArchiver(Archiver): post = self.scrape_item(result, post, context) - if post.get("taken_at"): result.set_timestamp(post.get("taken_at")) + if post.get("taken_at"): + result.set_timestamp(post.get("taken_at")) return result.success(f"insta {context or 'post'}") def download_highlights(self, result: Metadata, id: str) -> Metadata: @@ -175,96 +217,127 @@ class InstagramAPIArchiver(Archiver): del h_info["items"] result.set_title(h_info.get("title")).set("data", h_info).set("#reels", items) return result.success("insta highlights") - - def 
_download_highlights_reusable(self, result: Metadata, id: str) ->dict: + + def _download_highlights_reusable(self, result: Metadata, id: str) -> dict: full_h = self.call_api(f"v2/highlight/by/id", {"id": id}) h_info = full_h.get("response", {}).get("reels", {}).get(f"highlight:{id}") assert h_info, f"Highlight {id} not found: {full_h=}" - if cover_media := h_info.get("cover_media", {}).get("cropped_image_version", {}).get("url"): + if ( + cover_media := h_info.get("cover_media", {}) + .get("cropped_image_version", {}) + .get("url") + ): filename = self.download_from_url(cover_media) result.add_media(Media(filename=filename), id=f"cover_media highlight {id}") - items = h_info.get("items", [])[::-1] # newest to oldest + items = h_info.get("items", [])[::-1] # newest to oldest for h in tqdm(items, desc="downloading highlights", unit="highlight"): - try: self.scrape_item(result, h, "highlight") + try: + self.scrape_item(result, h, "highlight") except Exception as e: result.append("errors", f"Error downloading highlight {h.get('id')}") - logger.error(f"Error downloading highlight, skipping {h.get('id')}: {e}") - + logger.error( + f"Error downloading highlight, skipping {h.get('id')}: {e}" + ) + return h_info - + def download_stories(self, result: Metadata, username: str) -> Metadata: now = datetime.now().strftime("%Y-%m-%d_%H-%M") stories = self._download_stories_reusable(result, username) - if stories == []: return result.success("insta no story") + if stories == []: + return result.success("insta no story") result.set_title(f"stories {username} at {now}").set("#stories", len(stories)) return result.success(f"insta stories {now}") - + def _download_stories_reusable(self, result: Metadata, username: str) -> list[dict]: stories = self.call_api(f"v1/user/stories/by/username", {"username": username}) - if not stories or not len(stories): return [] - stories = stories[::-1] # newest to oldest + if not stories or not len(stories): + return [] + stories = stories[::-1] # 
newest to oldest for s in tqdm(stories, desc="downloading stories", unit="story"): - try: self.scrape_item(result, s, "story") + try: + self.scrape_item(result, s, "story") except Exception as e: result.append("errors", f"Error downloading story {s.get('id')}") logger.error(f"Error downloading story, skipping {s.get('id')}: {e}") return stories - + def download_all_posts(self, result: Metadata, user_id: str): end_cursor = None pbar = tqdm(desc="downloading posts") post_count = 0 while end_cursor != "": - posts = self.call_api(f"v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor}) - if not len(posts) or not type(posts) == list or len(posts) != 2: break + posts = self.call_api( + f"v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor} + ) + if not len(posts) or not type(posts) == list or len(posts) != 2: + break posts, end_cursor = posts[0], posts[1] logger.info(f"parsing {len(posts)} posts, next {end_cursor=}") for p in posts: - try: self.scrape_item(result, p, "post") + try: + self.scrape_item(result, p, "post") except Exception as e: result.append("errors", f"Error downloading post {p.get('id')}") logger.error(f"Error downloading post, skipping {p.get('id')}: {e}") pbar.update(1) - post_count+=1 - if self.full_profile_max_posts and post_count >= self.full_profile_max_posts: - logger.info(f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}") + post_count += 1 + if ( + self.full_profile_max_posts + and post_count >= self.full_profile_max_posts + ): + logger.info( + f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}" + ) break result.set("#posts", post_count) - + def download_all_tagged(self, result: Metadata, user_id: str): next_page_id = "" pbar = tqdm(desc="downloading tagged posts") tagged_count = 0 while next_page_id != None: - resp = self.call_api(f"v2/user/tag/medias", {"user_id": user_id, "page_id": next_page_id}) + resp = self.call_api( + f"v2/user/tag/medias", {"user_id": user_id, 
"page_id": next_page_id} + ) posts = resp.get("response", {}).get("items", []) - if not len(posts): break + if not len(posts): + break next_page_id = resp.get("next_page_id") - + logger.info(f"parsing {len(posts)} tagged posts, next {next_page_id=}") for p in posts: - try: self.scrape_item(result, p, "tagged") + try: + self.scrape_item(result, p, "tagged") except Exception as e: - result.append("errors", f"Error downloading tagged post {p.get('id')}") - logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e}") + result.append( + "errors", f"Error downloading tagged post {p.get('id')}" + ) + logger.error( + f"Error downloading tagged post, skipping {p.get('id')}: {e}" + ) pbar.update(1) - tagged_count+=1 - if self.full_profile_max_posts and tagged_count >= self.full_profile_max_posts: - logger.info(f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}") + tagged_count += 1 + if ( + self.full_profile_max_posts + and tagged_count >= self.full_profile_max_posts + ): + logger.info( + f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}" + ) break result.set("#tagged", tagged_count) + ### reusable parsing utils below -### reusable parsing utils below - - def scrape_item(self, result:Metadata, item:dict, context:str=None) -> dict: + def scrape_item(self, result: Metadata, item: dict, context: str = None) -> dict: """ receives a Metadata and an API dict response fetches the media and adds it to the Metadata @@ -272,23 +345,25 @@ class InstagramAPIArchiver(Archiver): context can be used to give specific id prefixes to media """ if "clips_metadata" in item: - if reusable_text := item.get("clips_metadata", {}).get("reusable_text_attribute_string"): + if reusable_text := item.get("clips_metadata", {}).get( + "reusable_text_attribute_string" + ): item["clips_metadata_text"] = reusable_text - if self.minimize_json_output: + if self.minimize_json_output: del item["clips_metadata"] - if code := item.get("code") and not 
result.get("url"): + if code := item.get("code") and not result.get("url"): result.set_url(f"https://www.instagram.com/p/{code}/") - + resources = item.get("resources", item.get("carousel_media", [])) item, media, media_id = self.scrape_media(item, context) # if resources are present take the main media from the first resource if not media and len(resources): _, media, media_id = self.scrape_media(resources[0], context) resources = resources[1:] - + assert media, f"Image/video not found in {item=}" - + # posts with multiple items contain a resources list resources_metadata = Metadata() for r in resources: @@ -298,40 +373,54 @@ class InstagramAPIArchiver(Archiver): result.add_media(media, id=media_id) return item - - def scrape_media(self, item: dict, context:str) -> tuple[dict, Media, str]: + + def scrape_media(self, item: dict, context: str) -> tuple[dict, Media, str]: # remove unnecessary info - if self.minimize_json_output: - for k in ["image_versions", "video_versions", "video_dash_manifest", "image_versions2", "video_versions2"]: - if k in item: del item[k] + if self.minimize_json_output: + for k in [ + "image_versions", + "video_versions", + "video_dash_manifest", + "image_versions2", + "video_versions2", + ]: + if k in item: + del item[k] item = self.cleanup_dict(item) image_media = None if image_url := item.get("thumbnail_url"): filename = self.download_from_url(image_url, verbose=False) image_media = Media(filename=filename) - + # retrieve video info - best_id = item.get('id', item.get('pk')) + best_id = item.get("id", item.get("pk")) taken_at = item.get("taken_at", item.get("taken_at_ts")) code = item.get("code") caption_text = item.get("caption_text") - if "carousel_media" in item: del item["carousel_media"] + if "carousel_media" in item: + del item["carousel_media"] if video_url := item.get("video_url"): filename = self.download_from_url(video_url, verbose=False) video_media = Media(filename=filename) - if taken_at: video_media.set("date", taken_at) - 
if code: video_media.set("url", f"https://www.instagram.com/p/{code}") - if caption_text: video_media.set("text", caption_text) + if taken_at: + video_media.set("date", taken_at) + if code: + video_media.set("url", f"https://www.instagram.com/p/{code}") + if caption_text: + video_media.set("text", caption_text) video_media.set("preview", [image_media]) video_media.set("data", [item]) return item, video_media, f"{context or 'video'} {best_id}" elif image_media: - if taken_at: image_media.set("date", taken_at) - if code: image_media.set("url", f"https://www.instagram.com/p/{code}") - if caption_text: image_media.set("text", caption_text) + if taken_at: + image_media.set("date", taken_at) + if code: + image_media.set("url", f"https://www.instagram.com/p/{code}") + if caption_text: + image_media.set("text", caption_text) image_media.set("data", [item]) return item, image_media, f"{context or 'image'} {best_id}" - - return item, None, None \ No newline at end of file + + return item, None, None diff --git a/src/auto_archiver/modules/instagram_archiver/__init__.py b/src/auto_archiver/modules/instagram_archiver/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/instagram_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_archiver/__manifest__.py new file mode 100644 index 0000000..bd63ab4 --- /dev/null +++ b/src/auto_archiver/modules/instagram_archiver/__manifest__.py @@ -0,0 +1,33 @@ +{ + "name": "Instagram Archiver", + "type": ["extractor"], + "entry_point": "instagram_archiver:InstagramArchiver", + "depends": ["core"], + "external_dependencies": { + "python": ["instaloader", + "loguru",], + }, + "no_setup_required": False, + "configs": { + "username": {"default": None, "help": "a valid Instagram username"}, + "password": { + "default": None, + "help": "the corresponding Instagram account password", + }, + "download_folder": { + "default": "instaloader", + "help": "name of a folder to temporarily download content 
to", + }, + "session_file": { + "default": "secrets/instaloader.session", + "help": "path to the instagram session which saves session credentials", + }, + # TODO: fine-grain + # "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"}, + }, + "description": """Uses the Instaloader library to download content from Instagram. This class handles both individual posts + and user profiles, downloading as much information as possible, including images, videos, text, stories, + highlights, and tagged posts. Authentication is required via username/password or a session file. + + """, +} diff --git a/src/auto_archiver/archivers/instagram_archiver.py b/src/auto_archiver/modules/instagram_archiver/instagram_archiver.py similarity index 98% rename from src/auto_archiver/archivers/instagram_archiver.py rename to src/auto_archiver/modules/instagram_archiver/instagram_archiver.py index 94a8fc0..4cf001d 100644 --- a/src/auto_archiver/archivers/instagram_archiver.py +++ b/src/auto_archiver/modules/instagram_archiver/instagram_archiver.py @@ -7,9 +7,9 @@ import re, os, shutil, traceback import instaloader # https://instaloader.github.io/as-module.html from loguru import logger -from . 
import Archiver -from ..core import Metadata -from ..core import Media +from auto_archiver.archivers import Archiver +from auto_archiver.core import Metadata +from auto_archiver.core import Media class InstagramArchiver(Archiver): """ diff --git a/src/auto_archiver/modules/instagram_tbot_archiver/__init__.py b/src/auto_archiver/modules/instagram_tbot_archiver/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py new file mode 100644 index 0000000..cadb729 --- /dev/null +++ b/src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py @@ -0,0 +1,35 @@ +{ + "name": "Instagram Telegram Bot Archiver", + "type": ["extractor"], + "entry_point": "instagram_tbot_archiver:InstagramTbotArchiver", + "depends": ["core", "utils"], + "external_dependencies": {"python": ["loguru", + "telethon",], + }, + "requires_setup": True, + "configs": { + "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"}, + "api_hash": {"default": None, "help": "telegram API_HASH value, go to https://my.telegram.org/apps"}, + "session_file": {"default": "secrets/anon-insta", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."}, + "timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."}, + }, + "description": """ +The `InstagramTbotArchiver` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content, +such as posts and stories. It leverages the Telethon library to interact with the Telegram API, sending Instagram URLs +to the bot and downloading the resulting media and metadata. The downloaded content is stored as `Media` objects and +returned as part of a `Metadata` object. + +### Features +- Supports archiving Instagram posts and stories through the Telegram bot. 
+- Downloads and saves media files (e.g., images, videos) in a temporary directory. +- Captures and returns metadata, including titles and descriptions, as a `Metadata` object. +- Automatically manages Telegram session files for secure access. + +### Setup + +To use the `InstagramTbotArchiver`, you need to provide the following configuration settings: +- **API ID and Hash**: Telegram API credentials obtained from [my.telegram.org/apps](https://my.telegram.org/apps). +- **Session File**: Optional path to store the Telegram session file for future use. + + """, +} diff --git a/src/auto_archiver/archivers/instagram_tbot_archiver.py b/src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py similarity index 96% rename from src/auto_archiver/archivers/instagram_tbot_archiver.py rename to src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py index 01b1614..9fdc208 100644 --- a/src/auto_archiver/archivers/instagram_tbot_archiver.py +++ b/src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py @@ -7,14 +7,17 @@ relevant media and metadata. The fetched content is saved as `Media` objects in `Metadata` object. """ +import os import shutil -from telethon.sync import TelegramClient -from loguru import logger -import time, os +import time from sqlite3 import OperationalError -from . 
import Archiver -from ..core import Metadata, Media, ArchivingContext -from ..utils import random_str + +from loguru import logger +from telethon.sync import TelegramClient + +from auto_archiver.archivers import Archiver +from auto_archiver.core import Metadata, Media, ArchivingContext +from auto_archiver.utils import random_str class InstagramTbotArchiver(Archiver): diff --git a/src/auto_archiver/modules/telegram_archiver/__init__.py b/src/auto_archiver/modules/telegram_archiver/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/telegram_archiver/__manifest__.py b/src/auto_archiver/modules/telegram_archiver/__manifest__.py new file mode 100644 index 0000000..b56477a --- /dev/null +++ b/src/auto_archiver/modules/telegram_archiver/__manifest__.py @@ -0,0 +1,26 @@ +{ + "name": "Telegram Archiver", + "type": ["extractor"], + "entry_point": "telegram_archiver:TelegramArchiver", + "requires_setup": False, + "depends": ["core"], + "external_dependencies": { + "python": [ + "requests", + "bs4", + "loguru", + ], + }, + "description": """ + The `TelegramArchiver` retrieves publicly available media content from Telegram message links without requiring login credentials. + It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata` + and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver` + is advised for more comprehensive functionality. + + ### Features +- Extracts images and videos from public Telegram message links (`t.me`). +- Processes HTML content of messages to retrieve embedded media. +- Sets structured metadata, including timestamps, content, and media details. +- Does not require user authentication for Telegram. 
+ """, +} diff --git a/src/auto_archiver/archivers/telegram_archiver.py b/src/auto_archiver/modules/telegram_archiver/telegram_archiver.py similarity index 92% rename from src/auto_archiver/archivers/telegram_archiver.py rename to src/auto_archiver/modules/telegram_archiver/telegram_archiver.py index ed57927..c793095 100644 --- a/src/auto_archiver/archivers/telegram_archiver.py +++ b/src/auto_archiver/modules/telegram_archiver/telegram_archiver.py @@ -2,13 +2,14 @@ import requests, re, html from bs4 import BeautifulSoup from loguru import logger -from . import Archiver -from ..core import Metadata, Media +from auto_archiver.archivers import Archiver +from auto_archiver.core import Metadata, Media class TelegramArchiver(Archiver): """ - Archiver for telegram that does not require login, but the telethon_archiver is much more advised, will only return if at least one image or one video is found + Archiver for telegram that does not require login, but the telethon_archiver is much more advised, + will only return if at least one image or one video is found """ name = "telegram_archiver" diff --git a/src/auto_archiver/modules/telethon_archiver/__init__.py b/src/auto_archiver/modules/telethon_archiver/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/telethon_archiver/__manifest__.py b/src/auto_archiver/modules/telethon_archiver/__manifest__.py new file mode 100644 index 0000000..82d56ba --- /dev/null +++ b/src/auto_archiver/modules/telethon_archiver/__manifest__.py @@ -0,0 +1,48 @@ +# TODO rm dependency on json +{ + "name": "telethon_archiver", + "type": ["extractor"], + "entry_point": "telethon_archiver:TelethonArchiver", + "requires_setup": True, + "depends": [""], + "external_dependencies": { + "python": ["telethon", + "loguru", + "tqdm", + ], + "bin": [""] + }, + "configs": { + "api_id": {"default": None, "help": "telegram API_ID value, go to https://my.telegram.org/apps"}, + "api_hash": {"default": None, "help": 
"telegram API_HASH value, go to https://my.telegram.org/apps"}, + "bot_token": {"default": None, "help": "optional, but allows access to more content such as large videos, talk to @botfather"}, + "session_file": {"default": "secrets/anon", "help": "optional, records the telegram login session for future usage, '.session' will be appended to the provided value."}, + "join_channels": {"default": True, "help": "set to False to skip the initial setup that joins the channels listed in the channel_invites config, useful if you have a lot of invites and get stuck"}, + "channel_invites": { + "default": {}, + "help": "(JSON string) private channel invite links (format: t.me/joinchat/HASH OR t.me/+HASH) and (optional but important to avoid hanging for minutes on startup) channel id (format: CHANNEL_ID taken from a post url like https://t.me/c/CHANNEL_ID/1), the telegram account will join any new channels on setup", + # TODO + #"cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) + } + }, + "description": """ +The `TelethonArchiver` uses the Telethon library to archive posts and media from Telegram channels and groups. +It supports private and public channels, downloading grouped posts with media, and can join channels using invite links +if provided in the configuration. + +### Features +- Fetches posts and metadata from Telegram channels and groups, including private channels. +- Downloads media attachments (e.g., images, videos, audio) from individual posts or grouped posts. +- Handles channel invites to join channels dynamically during setup. +- Utilizes Telethon's capabilities for reliable Telegram interactions. +- Outputs structured metadata and media using `Metadata` and `Media` objects. + +### Setup +To use the `TelethonArchiver`, you must configure the following: +- **API ID and API Hash**: Obtain these from [my.telegram.org](https://my.telegram.org/apps). +- **Session File**: Optional, but records login sessions for future use (default: `secrets/anon.session`). 
+- **Bot Token**: Optional, allows access to additional content (e.g., large videos) but limits private channel archiving. +- **Channel Invites**: Optional, specify a JSON string of invite links to join channels during setup. + +""" +} diff --git a/src/auto_archiver/archivers/telethon_archiver.py b/src/auto_archiver/modules/telethon_archiver/telethon_archiver.py similarity index 98% rename from src/auto_archiver/archivers/telethon_archiver.py rename to src/auto_archiver/modules/telethon_archiver/telethon_archiver.py index 2e2305d..89668f3 100644 --- a/src/auto_archiver/archivers/telethon_archiver.py +++ b/src/auto_archiver/modules/telethon_archiver/telethon_archiver.py @@ -8,9 +8,9 @@ from loguru import logger from tqdm import tqdm import re, time, json, os -from . import Archiver -from ..core import Metadata, Media, ArchivingContext -from ..utils import random_str +from auto_archiver.archivers import Archiver +from auto_archiver.core import Metadata, Media, ArchivingContext +from auto_archiver.utils import random_str class TelethonArchiver(Archiver): diff --git a/src/auto_archiver/modules/twitter_api_archiver/__init__.py b/src/auto_archiver/modules/twitter_api_archiver/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py b/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py new file mode 100644 index 0000000..f4eb2b9 --- /dev/null +++ b/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py @@ -0,0 +1,45 @@ +{ + "name": "Twitter API Archiver", + "type": ["extractor"], + "entry_point": "twitter_api_archiver:TwitterApiArchiver", + "requires_setup": True, + "depends": ["core"], + "external_dependencies": { + "python": ["requests", + "loguru", + "pytwitter", + "slugify",], + "bin": [""] + }, + "configs": { + "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need 
consumer_key, consumer_secret, access_token, access_secret"}, + "bearer_tokens": {"default": [], "help": "a list of twitter API bearer_tokens, which is enough for archiving; if not provided you will need consumer_key, consumer_secret, access_token, access_secret; if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line", "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))}, + "consumer_key": {"default": None, "help": "twitter API consumer_key"}, + "consumer_secret": {"default": None, "help": "twitter API consumer_secret"}, + "access_token": {"default": None, "help": "twitter API access_token"}, + "access_secret": {"default": None, "help": "twitter API access_secret"}, + }, + "description": """ + The `TwitterApiArchiver` fetches tweets and associated media using the Twitter API. + It supports multiple API configurations for extended rate limits and reliable access. + Features include URL expansion, media downloads (e.g., images, videos), and structured output + via `Metadata` and `Media` objects. Requires Twitter API credentials such as bearer tokens + or consumer key/secret and access token/secret. + + ### Features + - Fetches tweets and their metadata, including text, creation timestamp, and author information. + - Downloads media attachments (e.g., images, videos) in high quality. + - Supports multiple API configurations for improved rate limiting. + - Expands shortened URLs (e.g., `t.co` links). + - Outputs structured metadata and media using `Metadata` and `Media` objects. + + ### Setup + To use the `TwitterApiArchiver`, you must provide valid Twitter API credentials via configuration: + - **Bearer Token(s)**: A single token or a list for rate-limited API access. + - **Consumer Key and Secret**: Required for user-authenticated API access. + - **Access Token and Secret**: Complement the consumer key for enhanced API capabilities. 
+ + Credentials can be obtained by creating a Twitter developer account at [Twitter Developer Platform](https://developer.twitter.com/en). + """ +, +} diff --git a/src/auto_archiver/archivers/twitter_api_archiver.py b/src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py similarity index 98% rename from src/auto_archiver/archivers/twitter_api_archiver.py rename to src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py index d1e4dee..eb607cc 100644 --- a/src/auto_archiver/archivers/twitter_api_archiver.py +++ b/src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py @@ -8,8 +8,8 @@ from loguru import logger from pytwitter import Api from slugify import slugify -from . import Archiver -from ..core import Metadata,Media +from auto_archiver.archivers import Archiver +from auto_archiver.core import Metadata,Media class TwitterApiArchiver(Archiver): name = "twitter_api_archiver" diff --git a/src/auto_archiver/modules/vk_archiver/__init__.py b/src/auto_archiver/modules/vk_archiver/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/vk_archiver/__manifest__.py b/src/auto_archiver/modules/vk_archiver/__manifest__.py new file mode 100644 index 0000000..69bf162 --- /dev/null +++ b/src/auto_archiver/modules/vk_archiver/__manifest__.py @@ -0,0 +1,37 @@ +{ + "name": "VKontakte Archiver", + "type": ["extractor"], + "entry_point": "vk_archiver:VkArchiver", + "requires_setup": True, + "depends": ["core", "utils"], + "external_dependencies": { + "python": ["loguru", + "vk_url_scraper"], + }, + "configs": { + "username": {"default": None, "help": "valid VKontakte username"}, + "password": {"default": None, "help": "valid VKontakte password"}, + "session_file": {"default": "secrets/vk_config.v2.json", "help": "optional, path to a session configuration file (.json) for persistent VKontakte login"}, + }, + "description": """ +The `VkArchiver` fetches posts, text, and images from VK (VKontakte) social media pages. 
+This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract +and download content. Note that VK videos are handled separately by the `YTDownloader`. + +### Features +- Extracts text, timestamps, and metadata from VK `/wall` posts. +- Downloads associated images and attaches them to the resulting `Metadata` object. +- Processes multiple segments of VK URLs that contain mixed content (e.g., wall, photo). +- Outputs structured metadata and media using `Metadata` and `Media` objects. + +### Setup +To use the `VkArchiver`, you must provide valid VKontakte login credentials and session information: +- **Username**: A valid VKontakte account username. +- **Password**: The corresponding password for the VKontakte account. +- **Session File**: Optional. Path to a session configuration file (`.json`) for persistent VK login. + +Credentials can be set in the configuration file or directly via environment variables. Ensure you +have access to the VKontakte API by creating an account at [VKontakte](https://vk.com/). +""" +, +} diff --git a/src/auto_archiver/archivers/vk_archiver.py b/src/auto_archiver/modules/vk_archiver/vk_archiver.py similarity index 91% rename from src/auto_archiver/archivers/vk_archiver.py rename to src/auto_archiver/modules/vk_archiver/vk_archiver.py index f8bb60a..3cfb446 100644 --- a/src/auto_archiver/archivers/vk_archiver.py +++ b/src/auto_archiver/modules/vk_archiver/vk_archiver.py @@ -1,9 +1,9 @@ from loguru import logger from vk_url_scraper import VkScraper -from ..utils.misc import dump_payload -from . import Archiver -from ..core import Metadata, Media, ArchivingContext +from auto_archiver.utils.misc import dump_payload +from auto_archiver.archivers import Archiver +from auto_archiver.core import Metadata, Media, ArchivingContext class VkArchiver(Archiver):