mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 05:08:28 +03:00
REMOVES vk_extractor until further notice
This commit is contained in:
@@ -1 +0,0 @@
|
||||
from .vk_extractor import VkExtractor
|
||||
@@ -1,37 +0,0 @@
|
||||
{
|
||||
"name": "VKontakte Extractor",
|
||||
"type": ["extractor"],
|
||||
"requires_setup": True,
|
||||
"depends": ["core", "utils"],
|
||||
"dependencies": {
|
||||
"python": ["loguru", "vk_url_scraper"],
|
||||
},
|
||||
"configs": {
|
||||
"username": {"required": True, "help": "valid VKontakte username"},
|
||||
"password": {"required": True, "help": "valid VKontakte password"},
|
||||
"session_file": {
|
||||
"default": "secrets/vk_config.v2.json",
|
||||
"help": "valid VKontakte password",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages.
|
||||
This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract
|
||||
and download content. Note that VK videos are handled separately by the `YTDownloader`.
|
||||
|
||||
### Features
|
||||
- Extracts text, timestamps, and metadata from VK `/wall` posts.
|
||||
- Downloads associated images and attaches them to the resulting `Metadata` object.
|
||||
- Processes multiple segments of VK URLs that contain mixed content (e.g., wall, photo).
|
||||
- Outputs structured metadata and media using `Metadata` and `Media` objects.
|
||||
|
||||
### Setup
|
||||
To use the `VkArchiver`, you must provide valid VKontakte login credentials and session information:
|
||||
- **Username**: A valid VKontakte account username.
|
||||
- **Password**: The corresponding password for the VKontakte account.
|
||||
- **Session File**: Optional. Path to a session configuration file (`.json`) for persistent VK login.
|
||||
|
||||
Credentials can be set in the configuration file or directly via environment variables. Ensure you
|
||||
have access to the VKontakte API by creating an account at [VKontakte](https://vk.com/).
|
||||
""",
|
||||
}
|
||||
@@ -1,43 +0,0 @@
|
||||
from loguru import logger
|
||||
from vk_url_scraper import VkScraper
|
||||
|
||||
from auto_archiver.utils.misc import dump_payload
|
||||
from auto_archiver.core import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
|
||||
|
||||
class VkExtractor(Extractor):
|
||||
""" "
|
||||
VK videos are handled by YTDownloader, this archiver gets posts text and images.
|
||||
Currently only works for /wall posts
|
||||
"""
|
||||
|
||||
def setup(self) -> None:
|
||||
self.vks = VkScraper(self.username, self.password, session_file=self.session_file)
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
if "vk.com" not in item.netloc:
|
||||
return False
|
||||
|
||||
# some urls can contain multiple wall/photo/... parts and all will be fetched
|
||||
vk_scrapes = self.vks.scrape(url)
|
||||
if not len(vk_scrapes):
|
||||
return False
|
||||
logger.debug(f"VK: got {len(vk_scrapes)} scraped instances")
|
||||
|
||||
result = Metadata()
|
||||
for scrape in vk_scrapes:
|
||||
if not result.get_title():
|
||||
result.set_title(scrape["text"])
|
||||
if not result.get_timestamp():
|
||||
result.set_timestamp(scrape["datetime"])
|
||||
|
||||
result.set_content(dump_payload(vk_scrapes))
|
||||
|
||||
filenames = self.vks.download_media(vk_scrapes, self.tmp_dir)
|
||||
for filename in filenames:
|
||||
result.add_media(Media(filename))
|
||||
|
||||
return result.success("vk")
|
||||
Reference in New Issue
Block a user