mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-13 05:38:29 +03:00
Merge branch 'dev' into specify-medatada-feature
This commit is contained in:
@@ -5,6 +5,9 @@ from auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher
|
||||
from .test_extractor_base import TestExtractorBase
|
||||
|
||||
|
||||
CI = os.getenv("GITHUB_ACTIONS", "") == "true"
|
||||
|
||||
|
||||
class DummySB:
|
||||
def __init__(self, url="", title="", visible_texts=None, visible_elements=None):
|
||||
self._url = url
|
||||
@@ -51,14 +54,15 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
|
||||
@pytest.mark.download
|
||||
@pytest.mark.parametrize(
|
||||
"url,in_title,in_text,image_count,video_count",
|
||||
"url,in_title,in_text,image_count,video_count,skip_ci",
|
||||
[
|
||||
(
|
||||
"https://en.wikipedia.org/wiki/Western_barn_owl",
|
||||
"western barn owl",
|
||||
"Tyto alba",
|
||||
5,
|
||||
4,
|
||||
0,
|
||||
False,
|
||||
),
|
||||
(
|
||||
"https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/",
|
||||
@@ -66,6 +70,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
"Bellingcat has geolocated",
|
||||
5,
|
||||
0,
|
||||
False,
|
||||
),
|
||||
(
|
||||
"https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/",
|
||||
@@ -73,6 +78,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
"continued the work of Gazan journalists",
|
||||
5,
|
||||
1,
|
||||
False,
|
||||
),
|
||||
(
|
||||
"https://www.bellingcat.com/about/general-information",
|
||||
@@ -80,6 +86,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
"Stichting Bellingcat",
|
||||
0, # SVGs are ignored
|
||||
0,
|
||||
False,
|
||||
),
|
||||
(
|
||||
"https://vk.com/wikipedia?from=search&w=wall-36156673_20451",
|
||||
@@ -87,6 +94,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
"16 сентября 1985 года лейблом EMI Records.",
|
||||
5,
|
||||
0,
|
||||
False,
|
||||
),
|
||||
(
|
||||
"https://www.tiktok.com/@tracy_2424/photo/7418200173953830162",
|
||||
@@ -94,13 +102,19 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
"Dito ko lang",
|
||||
1,
|
||||
0,
|
||||
True,
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_download_pages_with_media(self, setup_module, make_item, url, in_title, in_text, image_count, video_count):
|
||||
def test_download_pages_with_media(
|
||||
self, setup_module, make_item, url, in_title, in_text, image_count, video_count, skip_ci
|
||||
):
|
||||
"""
|
||||
Test downloading pages with media.
|
||||
"""
|
||||
if CI and skip_ci:
|
||||
pytest.skip("Skipping test in CI environment")
|
||||
|
||||
self.extractor = setup_module(
|
||||
self.extractor_module,
|
||||
self.config
|
||||
|
||||
@@ -48,8 +48,6 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
|
||||
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
|
||||
("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
|
||||
("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
|
||||
("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),
|
||||
],
|
||||
)
|
||||
def test_suitable_extractors(self, url, suitable_extractors):
|
||||
@@ -148,6 +146,7 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
def test_bluesky_download_video(self, make_item):
|
||||
item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
|
||||
result = self.extractor.download(item)
|
||||
assert result.get_url() == "https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i"
|
||||
assert result is not False
|
||||
|
||||
@pytest.mark.skipif(not TEST_TRUTH_SOCIAL, reason="Truth social download tests disabled in environment variables.")
|
||||
|
||||
@@ -55,6 +55,7 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||
("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True),
|
||||
("https://www.tiktok.com/t/ZP8YQ8e5j/", True),
|
||||
("https://vt.tiktok.com/ZSMTJeqRP/", True),
|
||||
("https://tiktok.com/@user/photo/123?lang=en", True),
|
||||
],
|
||||
)
|
||||
def test_is_suitable(self, url, is_suitable, tiktok_dropin):
|
||||
@@ -68,10 +69,7 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||
mock_get.assert_called_once()
|
||||
mock_get.return_value.json.assert_called_once()
|
||||
# first message is just the 'Skipping using ytdlp to download files for TikTok' message
|
||||
assert (
|
||||
"failed to parse JSON response from tikwm.com for url='https://www.tiktok.com/@example/video/1234'"
|
||||
in caplog.text
|
||||
)
|
||||
assert "Failed to parse JSON response from tikwm.com" in caplog.text
|
||||
|
||||
mock_get.return_value.json.side_effect = Exception
|
||||
with caplog.at_level("ERROR"):
|
||||
@@ -79,10 +77,7 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||
mock_get.assert_called()
|
||||
assert mock_get.call_count == 2
|
||||
assert mock_get.return_value.json.call_count == 2
|
||||
assert (
|
||||
"failed to parse JSON response from tikwm.com for url='https://www.tiktok.com/@example/video/1234'"
|
||||
in caplog.text
|
||||
)
|
||||
assert "Failed to parse JSON response from tikwm.com" in caplog.text
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"response",
|
||||
@@ -98,27 +93,30 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||
assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False
|
||||
mock_get.assert_called_once()
|
||||
mock_get.return_value.json.assert_called_once()
|
||||
assert "failed to get a valid response from tikwm.com" in caplog.text
|
||||
assert "Unable to download with tikwm.com: " in caplog.text
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"response,has_vid",
|
||||
"response,is_success",
|
||||
[
|
||||
({"data": {"id": 123}}, False),
|
||||
({"data": {"wmplay": "url"}}, True),
|
||||
({"data": {"play": "url"}}, True),
|
||||
({"data": {"id": 123, "images": []}}, False),
|
||||
({"data": {"wmplay": "url", "images": ["img1.jpg"]}}, True),
|
||||
({"data": {"play": "url", "images": ["img1.jpg"]}}, True),
|
||||
({"data": {"images": ["img1.jpg"]}}, True),
|
||||
],
|
||||
)
|
||||
def test_correct_extraction(self, mock_get, make_item, response, has_vid, mocker):
|
||||
def test_correct_extraction(self, mock_get, make_item, response, is_success, mocker):
|
||||
data = {k: v for k, v in response.get("data", {}).items()}
|
||||
mock_get.return_value.status_code = 200
|
||||
mock_get.return_value.json.return_value = {"msg": "success", **response}
|
||||
result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
|
||||
if not has_vid:
|
||||
assert result is False
|
||||
else:
|
||||
total_media = len(data.get("images", [])) + (1 if data.get("wmplay", data.get("play")) else 0)
|
||||
if is_success:
|
||||
assert result.is_success()
|
||||
assert len(result.media) == 1
|
||||
assert len(result.media) == total_media
|
||||
else:
|
||||
assert result is False
|
||||
mock_get.assert_called()
|
||||
assert mock_get.call_count == 1 + int(has_vid)
|
||||
assert mock_get.call_count == 1 + total_media
|
||||
mock_get.return_value.json.assert_called_once()
|
||||
|
||||
def test_correct_data_extracted(self, mock_get, make_item):
|
||||
@@ -142,7 +140,9 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||
assert len(result.media) == 2
|
||||
assert result.get_title() == "Title"
|
||||
assert result.get("author") == "Author"
|
||||
assert result.get("api_data") == {"other": "data", "id": 123}
|
||||
assert result.get("other") == "data"
|
||||
assert result.get("comments") is None
|
||||
assert result.get("api_data") == {"id": 123, "other": "data"}
|
||||
assert result.media[1].get("duration") == 60
|
||||
assert result.get("timestamp") == datetime.fromtimestamp(1736301699, tz=timezone.utc)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user