More manifests, base modules and rename from archiver to extractor.

2026-06-12 21:28:29 +03:00 · 2025-01-23 16:40:48 +00:00
parent 9db26cdfc2
commit 1274a1b231
93 changed files with 378 additions and 238 deletions
--- a/src/auto_archiver/modules/vk_extractor/__init__.py
+++ b/src/auto_archiver/modules/vk_extractor/__init__.py
--- a/src/auto_archiver/modules/vk_extractor/manifest.py
+++ b/src/auto_archiver/modules/vk_extractor/manifest.py
@@ -0,0 +1,36 @@
+{
+    "name": "VKontakte Extractor",
+    "type": ["extractor"],
+    "requires_setup": True,
+    "depends": ["core", "utils"],
+    "external_dependencies": {
+        "python": ["loguru",
+                   "vk_url_scraper"],
+    },
+    "configs": {
+            "username": {"default": None, "help": "valid VKontakte username"},
+            "password": {"default": None, "help": "valid VKontakte password"},
+            "session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
+        },
+    "description": """
+The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages. 
+This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract 
+and download content. Note that VK videos are handled separately by the `YTDownloader`.
+
+### Features
+- Extracts text, timestamps, and metadata from VK `/wall` posts.
+- Downloads associated images and attaches them to the resulting `Metadata` object.
+- Processes multiple segments of VK URLs that contain mixed content (e.g., wall, photo).
+- Outputs structured metadata and media using `Metadata` and `Media` objects.
+
+### Setup
+To use the `VkArchiver`, you must provide valid VKontakte login credentials and session information:
+- **Username**: A valid VKontakte account username.
+- **Password**: The corresponding password for the VKontakte account.
+- **Session File**: Optional. Path to a session configuration file (`.json`) for persistent VK login.
+
+Credentials can be set in the configuration file or directly via environment variables. Ensure you 
+have access to the VKontakte API by creating an account at [VKontakte](https://vk.com/).
+"""
+,
+}
--- a/src/auto_archiver/modules/vk_extractor/vk_archiver.py
+++ b/src/auto_archiver/modules/vk_extractor/vk_archiver.py
@@ -0,0 +1,45 @@
+from loguru import logger
+from vk_url_scraper import VkScraper
+
+from auto_archiver.utils.misc import dump_payload
+from auto_archiver.base_modules import Extractor
+from auto_archiver.core import Metadata, Media, ArchivingContext
+
+
+class VkExtractor(Extractor):
+    """"
+    VK videos are handled by YTDownloader, this archiver gets posts text and images.
+    Currently only works for /wall posts
+    """
+    name = "vk_extractor"
+
+    def __init__(self, config: dict) -> None:
+        super().__init__(config)
+        self.assert_valid_string("username")
+        self.assert_valid_string("password")
+        self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
+
+    def download(self, item: Metadata) -> Metadata:
+        url = item.get_url()
+
+        if "vk.com" not in item.netloc: return False
+
+        # some urls can contain multiple wall/photo/... parts and all will be fetched
+        vk_scrapes = self.vks.scrape(url)
+        if not len(vk_scrapes): return False
+        logger.debug(f"VK: got {len(vk_scrapes)} scraped instances")
+
+        result = Metadata()
+        for scrape in vk_scrapes:
+            if not result.get_title():
+                result.set_title(scrape["text"])
+            if not result.get_timestamp():
+                result.set_timestamp(scrape["datetime"])
+
+        result.set_content(dump_payload(vk_scrapes))
+
+        filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
+        for filename in filenames:
+            result.add_media(Media(filename))
+
+        return result.success("vk")