mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
v1.0.1 dependency updates, generic extractor improvements (#307)
* wacz: allow exceptional cases where more than one resource image is available * improves generic extractor edge-cases and yt-dlp updates * REMOVES vk_extractor until further notice * bumps browsertrix in docker image * npm version bump on scripts/settings * poetry updates * Changed log level on gsheet_feeder_db started from warning to info (#301) * closes 305 and further fixes finding local downloads from uncommon ytdlp extractors * use ffmpeg -bitexact to reduce duplicate content storing * formatting * adds yt-dlp curl-cffi * version bump * linting --------- Co-authored-by: Dave Mateer <davemateer@gmail.com>
This commit is contained in:
committed by
GitHub
parent
48be13fb2a
commit
6735fa890b
@@ -119,4 +119,4 @@ def test_extract_media(wacz_enricher, metadata, tmp_path, mocker) -> None:
|
||||
metadata.add_media(Media("something.wacz"), "browsertrix")
|
||||
wacz_enricher.extract_media_from_wacz(metadata, str(wacz_file))
|
||||
assert len(metadata.media) == 2
|
||||
assert metadata.media[1].properties.get("id") == "browsertrix-screenshot"
|
||||
assert metadata.media[1].properties.get("id") == "browsertrix-screenshot-0"
|
||||
|
||||
@@ -1,77 +0,0 @@
|
||||
import pytest
|
||||
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.modules.vk_extractor import VkExtractor
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_vk_scraper(mocker):
|
||||
"""Fixture to mock VkScraper."""
|
||||
return mocker.patch("auto_archiver.modules.vk_extractor.vk_extractor.VkScraper")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def vk_extractor(setup_module, mock_vk_scraper) -> VkExtractor:
|
||||
"""Fixture to initialize VkExtractor with mocked VkScraper."""
|
||||
extractor_module = "vk_extractor"
|
||||
configs = {
|
||||
"username": "name",
|
||||
"password": "password123",
|
||||
"session_file": "secrets/vk_config.v2.json",
|
||||
}
|
||||
vk = setup_module(extractor_module, configs)
|
||||
vk.vks = mock_vk_scraper.return_value
|
||||
return vk
|
||||
|
||||
|
||||
def test_netloc(vk_extractor, metadata):
|
||||
# metadata url set as: "https://example.com/"
|
||||
assert vk_extractor.download(metadata) is False
|
||||
|
||||
|
||||
def test_vk_url_but_scrape_returns_empty(vk_extractor, metadata):
|
||||
metadata.set_url("https://vk.com/valid-wall")
|
||||
vk_extractor.vks.scrape.return_value = []
|
||||
assert vk_extractor.download(metadata) is False
|
||||
assert metadata.netloc == "vk.com"
|
||||
vk_extractor.vks.scrape.assert_called_once_with(metadata.get_url())
|
||||
|
||||
|
||||
def test_successful_scrape_and_download(vk_extractor, metadata, mocker):
|
||||
mock_scrapes = [
|
||||
{"text": "Post Title", "datetime": "2023-01-01T00:00:00", "id": 1},
|
||||
{"text": "Another Post", "datetime": "2023-01-02T00:00:00", "id": 2},
|
||||
]
|
||||
mock_filenames = ["image1.jpg", "image2.png"]
|
||||
vk_extractor.vks.scrape.return_value = mock_scrapes
|
||||
vk_extractor.vks.download_media.return_value = mock_filenames
|
||||
metadata.set_url("https://vk.com/valid-wall")
|
||||
result = vk_extractor.download(metadata)
|
||||
# Test metadata
|
||||
assert result.is_success()
|
||||
assert result.status == "vk: success"
|
||||
assert result.get_title() == "Post Title"
|
||||
assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
|
||||
assert "Another Post" in result.metadata["content"]
|
||||
# Test Media objects
|
||||
assert len(result.media) == 2
|
||||
assert result.media[0].filename == "image1.jpg"
|
||||
assert result.media[1].filename == "image2.png"
|
||||
vk_extractor.vks.download_media.assert_called_once_with(mock_scrapes, vk_extractor.tmp_dir)
|
||||
|
||||
|
||||
def test_adds_first_title_and_timestamp(vk_extractor):
|
||||
metadata = Metadata().set_url("https://vk.com/no-metadata")
|
||||
metadata.set_url("https://vk.com/no-metadata")
|
||||
mock_scrapes = [
|
||||
{"text": "value", "datetime": "2023-01-01T00:00:00"},
|
||||
{"text": "value2", "datetime": "2023-01-02T00:00:00"},
|
||||
]
|
||||
vk_extractor.vks.scrape.return_value = mock_scrapes
|
||||
vk_extractor.vks.download_media.return_value = []
|
||||
result = vk_extractor.download(metadata)
|
||||
|
||||
assert result.get_title() == "value"
|
||||
# formatted timestamp
|
||||
assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
|
||||
assert result.is_success()
|
||||
Reference in New Issue
Block a user