mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
More manifests, base modules and rename from archiver to extractor.
This commit is contained in:
0
src/auto_archiver/modules/vk_extractor/__init__.py
Normal file
0
src/auto_archiver/modules/vk_extractor/__init__.py
Normal file
36
src/auto_archiver/modules/vk_extractor/__manifest__.py
Normal file
36
src/auto_archiver/modules/vk_extractor/__manifest__.py
Normal file
@@ -0,0 +1,36 @@
|
||||
# Module manifest for the VKontakte extractor: a single dict literal declaring
# the module's display name, type, dependencies and user-facing config options.
{
    "name": "VKontakte Extractor",
    "type": ["extractor"],
    # Credentials must be configured before this module can be used.
    "requires_setup": True,
    "depends": ["core", "utils"],
    "external_dependencies": {
        "python": [
            "loguru",
            "vk_url_scraper",
        ],
    },
    "configs": {
        "username": {"default": None, "help": "valid VKontakte username"},
        "password": {"default": None, "help": "valid VKontakte password"},
        # The help text here previously duplicated the password help.
        "session_file": {
            "default": "secrets/vk_config.v2.json",
            "help": "path to a JSON session file used to persist the VKontakte login",
        },
    },
    "description": """
The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages.
This extractor is specialized for `/wall` posts and uses the `VkScraper` library to extract
and download content. Note that VK videos are handled separately by the `YTDownloader`.

### Features
- Extracts text, timestamps, and metadata from VK `/wall` posts.
- Downloads associated images and attaches them to the resulting `Metadata` object.
- Processes multiple segments of VK URLs that contain mixed content (e.g., wall, photo).
- Outputs structured metadata and media using `Metadata` and `Media` objects.

### Setup
To use the `VkExtractor`, you must provide valid VKontakte login credentials and session information:
- **Username**: A valid VKontakte account username.
- **Password**: The corresponding password for the VKontakte account.
- **Session File**: Optional. Path to a session configuration file (`.json`) for persistent VK login.

Credentials can be set in the configuration file or directly via environment variables. Ensure you
have access to the VKontakte API by creating an account at [VKontakte](https://vk.com/).
""",
}
|
||||
45
src/auto_archiver/modules/vk_extractor/vk_archiver.py
Normal file
45
src/auto_archiver/modules/vk_extractor/vk_archiver.py
Normal file
@@ -0,0 +1,45 @@
|
||||
from loguru import logger
|
||||
from vk_url_scraper import VkScraper
|
||||
|
||||
from auto_archiver.utils.misc import dump_payload
|
||||
from auto_archiver.base_modules import Extractor
|
||||
from auto_archiver.core import Metadata, Media, ArchivingContext
|
||||
|
||||
|
||||
class VkExtractor(Extractor):
    """
    Extractor for VK (VKontakte) posts: collects post text and images.

    VK videos are handled by YTDownloader, this extractor gets posts text and images.
    Currently only works for /wall posts.
    """
    name = "vk_extractor"

    def __init__(self, config: dict) -> None:
        """
        Check the configured credentials and create the `VkScraper` client.

        :param config: configuration dict forwarded unchanged to the base class.
        """
        super().__init__(config)
        # NOTE(review): presumably these reject a missing/empty config value;
        # confirm against the base class implementation.
        self.assert_valid_string("username")
        self.assert_valid_string("password")
        # NOTE(review): username/password/session_file are expected to be set as
        # attributes by the base class from `config` -- verify.
        self.vks = VkScraper(self.username, self.password, session_file=self.session_file)

    def download(self, item: Metadata) -> Metadata:
        """
        Scrape a VK URL and collect its text, timestamp and media.

        :param item: metadata object carrying the URL to process.
        :return: `False` when the URL's netloc does not contain "vk.com" or the
            scraper yields nothing; otherwise the value of `result.success("vk")`
            for a new `Metadata` populated from the scrape.
        """
        url = item.get_url()

        if "vk.com" not in item.netloc:
            return False

        # some urls can contain multiple wall/photo/... parts and all will be fetched
        vk_scrapes = self.vks.scrape(url)
        # Truthiness check also covers a `None` result, where `len()` would raise.
        if not vk_scrapes:
            return False
        logger.debug(f"VK: got {len(vk_scrapes)} scraped instances")

        result = Metadata()
        # Title and timestamp are only written while still unset on the result,
        # so earlier scrapes take precedence over later ones.
        for scrape in vk_scrapes:
            if not result.get_title():
                result.set_title(scrape["text"])
            if not result.get_timestamp():
                result.set_timestamp(scrape["datetime"])

        # Keep the full scrape payload as the content of the result.
        result.set_content(dump_payload(vk_scrapes))

        # Download media for all scrapes into the archiving temp dir and attach
        # each returned filename to the result.
        filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
        for filename in filenames:
            result.add_media(Media(filename))

        return result.success("vk")
|
||||
Reference in New Issue
Block a user