mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 03:18:28 +03:00
Merge main
This commit is contained in:
@@ -7,7 +7,6 @@ from auto_archiver.core.extractor import Extractor
|
||||
|
||||
|
||||
class TestExtractorBase(object):
|
||||
|
||||
extractor_module: str = None
|
||||
config: dict = None
|
||||
|
||||
@@ -17,7 +16,7 @@ class TestExtractorBase(object):
|
||||
assert self.config is not None, "self.config must be a dict set on the subclass"
|
||||
|
||||
self.extractor: Type[Extractor] = setup_module(self.extractor_module, self.config)
|
||||
|
||||
|
||||
def assertValidResponseMetadata(self, test_response: Metadata, title: str, timestamp: str, status: str = ""):
|
||||
assert test_response is not False
|
||||
|
||||
|
||||
@@ -9,26 +9,28 @@ import pytest
|
||||
from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor
|
||||
from .test_extractor_base import TestExtractorBase
|
||||
|
||||
CI=os.getenv("GITHUB_ACTIONS", '') == 'true'
|
||||
CI = os.getenv("GITHUB_ACTIONS", "") == "true"
|
||||
|
||||
|
||||
class TestGenericExtractor(TestExtractorBase):
|
||||
"""Tests Generic Extractor
|
||||
"""
|
||||
extractor_module = 'generic_extractor'
|
||||
"""Tests Generic Extractor"""
|
||||
|
||||
extractor_module = "generic_extractor"
|
||||
extractor: GenericExtractor
|
||||
|
||||
config = {
|
||||
'subtitles': False,
|
||||
'comments': False,
|
||||
'livestreams': False,
|
||||
'live_from_start': False,
|
||||
'end_means_success': True,
|
||||
'allow_playlist': False,
|
||||
'max_downloads': "inf",
|
||||
'proxy': None,
|
||||
'cookies_from_browser': False,
|
||||
'cookie_file': None,
|
||||
}
|
||||
|
||||
"subtitles": False,
|
||||
"comments": False,
|
||||
"livestreams": False,
|
||||
"live_from_start": False,
|
||||
"end_means_success": True,
|
||||
"allow_playlist": False,
|
||||
"max_downloads": "inf",
|
||||
"proxy": None,
|
||||
"cookies_from_browser": False,
|
||||
"cookie_file": None,
|
||||
}
|
||||
|
||||
def test_load_dropin(self):
|
||||
# test loading dropins that are in the generic_extractor package
|
||||
package = "auto_archiver.modules.generic_extractor"
|
||||
@@ -38,32 +40,42 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
path = os.path.join(dirname(dirname(__file__)), "data/")
|
||||
assert self.extractor.dropin_for_name("dropin", additional_paths=[path])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("url, suitable_extractors", [
|
||||
("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
|
||||
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
|
||||
("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
|
||||
("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
|
||||
("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),])
|
||||
@pytest.mark.parametrize(
|
||||
"url, suitable_extractors",
|
||||
[
|
||||
("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
|
||||
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
|
||||
("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
|
||||
("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
|
||||
("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),
|
||||
],
|
||||
)
|
||||
def test_suitable_extractors(self, url, suitable_extractors):
|
||||
suitable_extractors = suitable_extractors + ['generic'] # the generic is valid for all
|
||||
suitable_extractors = suitable_extractors + ["generic"] # the generic is valid for all
|
||||
extractors = list(self.extractor.suitable_extractors(url))
|
||||
assert len(extractors) == len(suitable_extractors)
|
||||
assert [e.ie_key().lower() for e in extractors] == suitable_extractors
|
||||
|
||||
@pytest.mark.parametrize("url, is_suitable", [
|
||||
("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
|
||||
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
|
||||
("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
|
||||
("https://www.facebook.com/nytimes/videos/10160796550110716", True),
|
||||
("https://www.twitch.tv/videos/1167226570", True),
|
||||
("https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/", True),
|
||||
("https://google.com", True)])
|
||||
@pytest.mark.parametrize(
|
||||
"url, is_suitable",
|
||||
[
|
||||
("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
|
||||
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
|
||||
("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
|
||||
("https://www.facebook.com/nytimes/videos/10160796550110716", True),
|
||||
("https://www.twitch.tv/videos/1167226570", True),
|
||||
(
|
||||
"https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/",
|
||||
True,
|
||||
),
|
||||
("https://google.com", True),
|
||||
],
|
||||
)
|
||||
def test_suitable_urls(self, url, is_suitable):
|
||||
"""
|
||||
Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs
|
||||
This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has extractors for,
|
||||
and then, if and only if all other archivers fail, does it fall back to the generic archiver)
|
||||
Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs
|
||||
This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has extractors for,
|
||||
and then, if and only if all other archivers fail, does it fall back to the generic archiver)
|
||||
"""
|
||||
assert self.extractor.suitable(url) == is_suitable
|
||||
|
||||
@@ -74,12 +86,15 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
assert result.get_url() == "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970"
|
||||
|
||||
@pytest.mark.download
|
||||
@pytest.mark.parametrize("url", [
|
||||
"https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l",
|
||||
"twitter.com/bellingcat/status/123",
|
||||
"https://www.youtube.com/watch?v=1"
|
||||
])
|
||||
def test_download_nonexistend_media(self, make_item, url):
|
||||
@pytest.mark.parametrize(
|
||||
"url",
|
||||
[
|
||||
"https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l",
|
||||
"twitter.com/bellingcat/status/123",
|
||||
"https://www.youtube.com/watch?v=1",
|
||||
],
|
||||
)
|
||||
def test_download_nonexistent_media(self, make_item, url):
|
||||
"""
|
||||
Test to make sure that the extractor doesn't break on non-existent posts/media
|
||||
|
||||
@@ -89,7 +104,10 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
result = self.extractor.download(item)
|
||||
assert not result
|
||||
|
||||
@pytest.mark.skipif(CI, reason="Currently no way to authenticate when on CI. Youtube (yt-dlp) doesn't support logging in with username/password.")
|
||||
@pytest.mark.skipif(
|
||||
CI,
|
||||
reason="Currently no way to authenticate when on CI. Youtube (yt-dlp) doesn't support logging in with username/password.",
|
||||
)
|
||||
@pytest.mark.download
|
||||
def test_youtube_download(self, make_item):
|
||||
# url https://www.youtube.com/watch?v=5qap5aO4i9A
|
||||
@@ -98,7 +116,10 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
result = self.extractor.download(item)
|
||||
assert result.get_url() == "https://www.youtube.com/watch?v=J---aiyznGQ"
|
||||
assert result.get_title() == "Keyboard Cat! - THE ORIGINAL!"
|
||||
assert result.get('description') == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
|
||||
assert (
|
||||
result.get("description")
|
||||
== "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
|
||||
)
|
||||
assert len(result.media) == 2
|
||||
assert Path(result.media[0].filename).name == "J---aiyznGQ.webm"
|
||||
assert Path(result.media[1].filename).name == "hqdefault.jpg"
|
||||
@@ -114,7 +135,7 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfn3hbcxgc2q")
|
||||
result = self.extractor.download(item)
|
||||
assert result is not False
|
||||
|
||||
|
||||
@pytest.mark.download
|
||||
def test_bluesky_download_no_media(self, make_item):
|
||||
item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfphwmcs4c2z")
|
||||
@@ -126,7 +147,7 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
|
||||
result = self.extractor.download(item)
|
||||
assert result is not False
|
||||
|
||||
|
||||
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
|
||||
@pytest.mark.download
|
||||
def test_truthsocial_download_video(self, make_item):
|
||||
@@ -141,14 +162,14 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
item = make_item("https://truthsocial.com/@bbcnewa/posts/109598702184774628")
|
||||
result = self.extractor.download(item)
|
||||
assert result is not False
|
||||
|
||||
|
||||
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
|
||||
@pytest.mark.download
|
||||
def test_truthsocial_download_poll(self, make_item):
|
||||
item = make_item("https://truthsocial.com/@CNN_US/posts/113724326568555098")
|
||||
result = self.extractor.download(item)
|
||||
assert result is not False
|
||||
|
||||
|
||||
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
|
||||
@pytest.mark.download
|
||||
def test_truthsocial_download_single_image(self, make_item):
|
||||
@@ -170,7 +191,7 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
url = "https://x.com/Bellingcat/status/17197025860711058"
|
||||
response = self.extractor.download(make_item(url))
|
||||
assert not response
|
||||
|
||||
|
||||
@pytest.mark.download
|
||||
def test_twitter_download_malformed_tweetid(self, make_item):
|
||||
# this tweet does not exist
|
||||
@@ -180,7 +201,6 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
|
||||
@pytest.mark.download
|
||||
def test_twitter_download_tweet_no_media(self, make_item):
|
||||
|
||||
item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
|
||||
post = self.extractor.download(item)
|
||||
|
||||
@@ -188,9 +208,9 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
post,
|
||||
"Onion rings are just vegetable donuts.",
|
||||
datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
|
||||
"yt-dlp_Twitter: success"
|
||||
"yt-dlp_Twitter: success",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.download
|
||||
def test_twitter_download_video(self, make_item):
|
||||
url = "https://x.com/bellingcat/status/1871552600346415571"
|
||||
@@ -198,33 +218,52 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
self.assertValidResponseMetadata(
|
||||
post,
|
||||
"Bellingcat - This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services",
|
||||
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
|
||||
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc),
|
||||
)
|
||||
|
||||
@pytest.mark.xfail(reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented")
|
||||
@pytest.mark.xfail(
|
||||
reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented"
|
||||
)
|
||||
@pytest.mark.download
|
||||
@pytest.mark.parametrize("url, title, timestamp, image_hash", [
|
||||
("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
|
||||
("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
|
||||
("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
|
||||
("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"url, title, timestamp, image_hash",
|
||||
[
|
||||
(
|
||||
"https://x.com/SozinhoRamalho/status/1876710769913450647",
|
||||
"ignore tweet, testing sensitivity warning nudity",
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
|
||||
"image_hash",
|
||||
),
|
||||
(
|
||||
"https://x.com/SozinhoRamalho/status/1876710875475681357",
|
||||
"ignore tweet, testing sensitivity warning violence",
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
|
||||
"image_hash",
|
||||
),
|
||||
(
|
||||
"https://x.com/SozinhoRamalho/status/1876711053813227618",
|
||||
"ignore tweet, testing sensitivity warning sensitive",
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
|
||||
"image_hash",
|
||||
),
|
||||
(
|
||||
"https://x.com/SozinhoRamalho/status/1876711141314801937",
|
||||
"ignore tweet, testing sensitivity warning nudity, violence, sensitivity",
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
|
||||
"image_hash",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_twitter_download_sensitive_media(self, url, title, timestamp, image_hash, make_item):
|
||||
|
||||
"""Download tweets with sensitive media"""
|
||||
|
||||
post = self.extractor.download(make_item(url))
|
||||
self.assertValidResponseMetadata(
|
||||
post,
|
||||
title,
|
||||
timestamp
|
||||
)
|
||||
self.assertValidResponseMetadata(post, title, timestamp)
|
||||
assert len(post.media) == 1
|
||||
assert post.media[0].hash == image_hash
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_facebook_video(self, make_item):
|
||||
|
||||
post = self.extractor.download(make_item("https://www.facebook.com/bellingcat/videos/588371253839133"))
|
||||
assert len(post.media) == 2
|
||||
assert post.media[0].filename.endswith("588371253839133.mp4")
|
||||
@@ -234,11 +273,12 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
assert post.media[1].mimetype == "image/jpeg"
|
||||
|
||||
assert "Bellingchat Premium is with Kolina Koltai" in post.get_title()
|
||||
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_facebook_image(self, make_item):
|
||||
|
||||
post = self.extractor.download(make_item("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/"))
|
||||
post = self.extractor.download(
|
||||
make_item("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/")
|
||||
)
|
||||
|
||||
assert len(post.media) == 1
|
||||
assert post.media[0].filename.endswith(".png")
|
||||
@@ -248,5 +288,5 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
def test_download_facebook_text_only(self, make_item):
|
||||
url = "https://www.facebook.com/bellingcat/posts/pfbid02rzpwZxAZ8bLkAX8NvHv4DWAidFaqAUfJMbo9vWkpwxL7uMUWzWMiizXLWRSjwihVl"
|
||||
post = self.extractor.download(make_item(url))
|
||||
assert "Bellingcat researcher Kolina Koltai delves deeper into Clothoff" in post.get('content')
|
||||
assert "Bellingcat researcher Kolina Koltai delves deeper into Clothoff" in post.get("content")
|
||||
assert post.get_title() == "Bellingcat"
|
||||
|
||||
@@ -15,10 +15,11 @@ def mock_user_response():
|
||||
"username": "test_user",
|
||||
"full_name": "Test User",
|
||||
"profile_pic_url_hd": "http://example.com/profile.jpg",
|
||||
"profile_pic_url": "http://example.com/profile_lowres.jpg"
|
||||
"profile_pic_url": "http://example.com/profile_lowres.jpg",
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_post_response():
|
||||
return {
|
||||
@@ -27,16 +28,14 @@ def mock_post_response():
|
||||
"caption_text": "Test Caption",
|
||||
"taken_at": datetime.now().timestamp(),
|
||||
"video_url": "http://example.com/video.mp4",
|
||||
"thumbnail_url": "http://example.com/thumbnail.jpg"
|
||||
"thumbnail_url": "http://example.com/thumbnail.jpg",
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_story_response():
|
||||
return [{
|
||||
"id": "story_123",
|
||||
"taken_at": datetime.now().timestamp(),
|
||||
"video_url": "http://example.com/story.mp4"
|
||||
}]
|
||||
return [{"id": "story_123", "taken_at": datetime.now().timestamp(), "video_url": "http://example.com/story.mp4"}]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_highlight_response():
|
||||
@@ -46,11 +45,13 @@ def mock_highlight_response():
|
||||
"highlight:123": {
|
||||
"id": "123",
|
||||
"title": "Test Highlight",
|
||||
"items": [{
|
||||
"id": "item_123",
|
||||
"taken_at": datetime.now().timestamp(),
|
||||
"video_url": "http://example.com/highlight.mp4"
|
||||
}]
|
||||
"items": [
|
||||
{
|
||||
"id": "item_123",
|
||||
"taken_at": datetime.now().timestamp(),
|
||||
"video_url": "http://example.com/highlight.mp4",
|
||||
}
|
||||
],
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -81,24 +82,30 @@ class TestInstagramAPIExtractor(TestExtractorBase):
|
||||
m.set("netloc", "instagram.com")
|
||||
return m
|
||||
|
||||
@pytest.mark.parametrize("url,expected", [
|
||||
("https://instagram.com/user", [("", "user", "")]),
|
||||
("https://instagr.am/p/post_id", []),
|
||||
("https://youtube.com", []),
|
||||
("https://www.instagram.com/reel/reel_id", [("reel", "reel_id", "")]),
|
||||
("https://instagram.com/stories/highlights/123", [("stories/highlights", "123", "")]),
|
||||
("https://instagram.com/stories/user/123", [("stories", "user", "123")]),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"url,expected",
|
||||
[
|
||||
("https://instagram.com/user", [("", "user", "")]),
|
||||
("https://instagr.am/p/post_id", []),
|
||||
("https://youtube.com", []),
|
||||
("https://www.instagram.com/reel/reel_id", [("reel", "reel_id", "")]),
|
||||
("https://instagram.com/stories/highlights/123", [("stories/highlights", "123", "")]),
|
||||
("https://instagram.com/stories/user/123", [("stories", "user", "123")]),
|
||||
],
|
||||
)
|
||||
def test_url_parsing(self, url, expected):
|
||||
assert self.extractor.valid_url.findall(url) == expected
|
||||
|
||||
def test_initialize(self):
|
||||
assert self.extractor.api_endpoint[-1] != "/"
|
||||
|
||||
@pytest.mark.parametrize("input_dict,expected", [
|
||||
({"x": 0, "valid": "data"}, {"valid": "data"}),
|
||||
({"nested": {"y": None, "valid": [{}]}}, {"nested": {"valid": [{}]}}),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"input_dict,expected",
|
||||
[
|
||||
({"x": 0, "valid": "data"}, {"valid": "data"}),
|
||||
({"nested": {"y": None, "valid": [{}]}}, {"nested": {"valid": [{}]}}),
|
||||
],
|
||||
)
|
||||
def test_cleanup_dict(self, input_dict, expected):
|
||||
assert self.extractor.cleanup_dict(input_dict) == expected
|
||||
|
||||
@@ -114,8 +121,8 @@ class TestInstagramAPIExtractor(TestExtractorBase):
|
||||
|
||||
def test_download_profile_basic(self, metadata, mock_user_response, mocker):
|
||||
"""Test basic profile download without full_profile"""
|
||||
mock_call = mocker.patch.object(self.extractor, 'call_api')
|
||||
mock_download = mocker.patch.object(self.extractor, 'download_from_url')
|
||||
mock_call = mocker.patch.object(self.extractor, "call_api")
|
||||
mock_download = mocker.patch.object(self.extractor, "download_from_url")
|
||||
# Mock API responses
|
||||
mock_call.return_value = mock_user_response
|
||||
mock_download.return_value = "profile.jpg"
|
||||
@@ -132,17 +139,14 @@ class TestInstagramAPIExtractor(TestExtractorBase):
|
||||
|
||||
def test_download_profile_full(self, metadata, mock_user_response, mock_story_response, mocker):
|
||||
"""Test full profile download with stories/posts"""
|
||||
mock_call = mocker.patch.object(self.extractor, 'call_api')
|
||||
mock_posts = mocker.patch.object(self.extractor, 'download_all_posts')
|
||||
mock_highlights = mocker.patch.object(self.extractor, 'download_all_highlights')
|
||||
mock_tagged = mocker.patch.object(self.extractor, 'download_all_tagged')
|
||||
mock_stories = mocker.patch.object(self.extractor, '_download_stories_reusable')
|
||||
mock_call = mocker.patch.object(self.extractor, "call_api")
|
||||
mock_posts = mocker.patch.object(self.extractor, "download_all_posts")
|
||||
mock_highlights = mocker.patch.object(self.extractor, "download_all_highlights")
|
||||
mock_tagged = mocker.patch.object(self.extractor, "download_all_tagged")
|
||||
mock_stories = mocker.patch.object(self.extractor, "_download_stories_reusable")
|
||||
|
||||
self.extractor.full_profile = True
|
||||
mock_call.side_effect = [
|
||||
mock_user_response,
|
||||
mock_story_response
|
||||
]
|
||||
mock_call.side_effect = [mock_user_response, mock_story_response]
|
||||
mock_highlights.return_value = None
|
||||
mock_stories.return_value = mock_story_response
|
||||
mock_posts.return_value = None
|
||||
@@ -155,7 +159,7 @@ class TestInstagramAPIExtractor(TestExtractorBase):
|
||||
|
||||
def test_download_profile_not_found(self, metadata, mocker):
|
||||
"""Test profile not found error"""
|
||||
mock_call = mocker.patch.object(self.extractor, 'call_api')
|
||||
mock_call = mocker.patch.object(self.extractor, "call_api")
|
||||
mock_call.return_value = {"user": None}
|
||||
with pytest.raises(AssertionError) as exc_info:
|
||||
self.extractor.download_profile(metadata, "invalid_user")
|
||||
@@ -163,18 +167,14 @@ class TestInstagramAPIExtractor(TestExtractorBase):
|
||||
|
||||
def test_download_profile_error_handling(self, metadata, mock_user_response, mocker):
|
||||
"""Test error handling in full profile mode"""
|
||||
mock_call = mocker.patch.object(self.extractor, 'call_api')
|
||||
mock_highlights = mocker.patch.object(self.extractor, 'download_all_highlights')
|
||||
mock_tagged = mocker.patch.object(self.extractor, 'download_all_tagged')
|
||||
stories_tagged = mocker.patch.object(self.extractor, '_download_stories_reusable')
|
||||
mock_posts = mocker.patch.object(self.extractor, 'download_all_posts')
|
||||
mock_call = mocker.patch.object(self.extractor, "call_api")
|
||||
mock_highlights = mocker.patch.object(self.extractor, "download_all_highlights")
|
||||
mock_tagged = mocker.patch.object(self.extractor, "download_all_tagged")
|
||||
stories_tagged = mocker.patch.object(self.extractor, "_download_stories_reusable")
|
||||
mock_posts = mocker.patch.object(self.extractor, "download_all_posts")
|
||||
|
||||
self.extractor.full_profile = True
|
||||
mock_call.side_effect = [
|
||||
mock_user_response,
|
||||
Exception("Stories API failed"),
|
||||
Exception("Posts API failed")
|
||||
]
|
||||
mock_call.side_effect = [mock_user_response, Exception("Stories API failed"), Exception("Posts API failed")]
|
||||
mock_highlights.return_value = None
|
||||
mock_tagged.return_value = None
|
||||
stories_tagged.return_value = None
|
||||
@@ -182,4 +182,4 @@ class TestInstagramAPIExtractor(TestExtractorBase):
|
||||
result = self.extractor.download_profile(metadata, "test_user")
|
||||
|
||||
assert result.is_success()
|
||||
assert "Error downloading stories for test_user" in result.metadata["errors"]
|
||||
assert "Error downloading stories for test_user" in result.metadata["errors"]
|
||||
|
||||
@@ -1,21 +1,41 @@
|
||||
import pytest
|
||||
|
||||
from auto_archiver.modules.instagram_extractor import InstagramExtractor
|
||||
from .test_extractor_base import TestExtractorBase
|
||||
|
||||
class TestInstagramExtractor(TestExtractorBase):
|
||||
|
||||
extractor_module: str = 'instagram_extractor'
|
||||
config: dict = {}
|
||||
@pytest.fixture
|
||||
def instagram_extractor(setup_module, mocker):
|
||||
extractor_module: str = "instagram_extractor"
|
||||
config: dict = {
|
||||
"username": "user_name",
|
||||
"password": "password123",
|
||||
"download_folder": "instaloader",
|
||||
"session_file": "secrets/instaloader.session",
|
||||
}
|
||||
fake_loader = mocker.MagicMock()
|
||||
fake_loader.load_session_from_file.return_value = None
|
||||
fake_loader.login.return_value = None
|
||||
fake_loader.save_session_to_file.return_value = None
|
||||
mocker.patch(
|
||||
"instaloader.Instaloader",
|
||||
return_value=fake_loader,
|
||||
)
|
||||
return setup_module(extractor_module, config)
|
||||
|
||||
@pytest.mark.parametrize("url", [
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"url",
|
||||
[
|
||||
"https://www.instagram.com/p/",
|
||||
"https://www.instagram.com/p/1234567890/",
|
||||
"https://www.instagram.com/reel/1234567890/",
|
||||
"https://www.instagram.com/username/",
|
||||
"https://www.instagram.com/username/stories/",
|
||||
"https://www.instagram.com/username/highlights/",
|
||||
])
|
||||
def test_regex_matches(self, url):
|
||||
# post
|
||||
assert InstagramExtractor.valid_url.match(url)
|
||||
],
|
||||
)
|
||||
def test_regex_matches(url: str, instagram_extractor: InstagramExtractor) -> None:
|
||||
"""
|
||||
Ensure that the valid_url regex matches all provided Instagram URLs.
|
||||
"""
|
||||
assert instagram_extractor.valid_url.match(url)
|
||||
|
||||
@@ -7,10 +7,16 @@ from auto_archiver.modules.instagram_tbot_extractor import InstagramTbotExtracto
|
||||
from tests.extractors.test_extractor_base import TestExtractorBase
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def mock_sleep(mocker):
|
||||
"""Mock time.sleep to avoid delays."""
|
||||
return mocker.patch("time.sleep")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def patch_extractor_methods(request, setup_module, mocker):
|
||||
mocker.patch.object(InstagramTbotExtractor, '_prepare_session_file', return_value=None)
|
||||
mocker.patch.object(InstagramTbotExtractor, '_initialize_telegram_client', return_value=None)
|
||||
mocker.patch.object(InstagramTbotExtractor, "_prepare_session_file", return_value=None)
|
||||
mocker.patch.object(InstagramTbotExtractor, "_initialize_telegram_client", return_value=None)
|
||||
yield
|
||||
|
||||
|
||||
@@ -35,12 +41,7 @@ def mock_telegram_client(mocker):
|
||||
@pytest.fixture
|
||||
def extractor(setup_module, patch_extractor_methods, mocker):
|
||||
extractor_module = "instagram_tbot_extractor"
|
||||
config = {
|
||||
"api_id": 12345,
|
||||
"api_hash": "test_api_hash",
|
||||
"session_file": "test_session",
|
||||
"timeout": 4
|
||||
}
|
||||
config = {"api_id": 12345, "api_hash": "test_api_hash", "session_file": "test_session", "timeout": 4}
|
||||
extractor = setup_module(extractor_module, config)
|
||||
extractor.client = mocker.MagicMock()
|
||||
extractor.session_file = "test_session"
|
||||
@@ -79,21 +80,30 @@ class TestInstagramTbotExtractorReal(TestExtractorBase):
|
||||
"session_file": "secrets/anon-insta",
|
||||
}
|
||||
|
||||
@pytest.mark.parametrize("url, expected_status, message, len_media", [
|
||||
("https://www.instagram.com/p/C4QgLbrIKXG", "insta-via-bot: success",
|
||||
"Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou",
|
||||
6),
|
||||
("https://www.instagram.com/reel/DEVLK8qoIbg/", "insta-via-bot: success",
|
||||
"Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol",
|
||||
3),
|
||||
# instagram tbot not working (potentially intermittently?) for stories - replace with a live story to retest
|
||||
# ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, "Media not found or unavailable"),
|
||||
# Seems to be working intermittently for highlights
|
||||
# ("https://www.instagram.com/stories/highlights/17868810693068139/", "insta-via-bot: success", None, 50),
|
||||
# Marking invalid url as success
|
||||
("https://www.instagram.com/p/INVALID", "insta-via-bot: success", "Media not found or unavailable", 0),
|
||||
("https://www.youtube.com/watch?v=ymCMy8OffHM", False, None, 0),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"url, expected_status, message, len_media",
|
||||
[
|
||||
(
|
||||
"https://www.instagram.com/p/C4QgLbrIKXG",
|
||||
"insta-via-bot: success",
|
||||
"Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou",
|
||||
6,
|
||||
),
|
||||
(
|
||||
"https://www.instagram.com/reel/DEVLK8qoIbg/",
|
||||
"insta-via-bot: success",
|
||||
"Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol",
|
||||
3,
|
||||
),
|
||||
# instagram tbot not working (potentially intermittently?) for stories - replace with a live story to retest
|
||||
# ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, "Media not found or unavailable"),
|
||||
# Seems to be working intermittently for highlights
|
||||
# ("https://www.instagram.com/stories/highlights/17868810693068139/", "insta-via-bot: success", None, 50),
|
||||
# Marking invalid url as success
|
||||
("https://www.instagram.com/p/INVALID", "insta-via-bot: success", "Media not found or unavailable", 0),
|
||||
("https://www.youtube.com/watch?v=ymCMy8OffHM", False, None, 0),
|
||||
],
|
||||
)
|
||||
def test_download(self, url, expected_status, message, len_media, metadata_sample):
|
||||
"""Test the `download()` method with various Instagram URLs."""
|
||||
metadata_sample.set_url(url)
|
||||
|
||||
151
tests/extractors/test_tiktok_tikwm_extractor.py
Normal file
151
tests/extractors/test_tiktok_tikwm_extractor.py
Normal file
@@ -0,0 +1,151 @@
|
||||
from datetime import datetime, timezone
|
||||
import time
|
||||
import pytest
|
||||
import yt_dlp
|
||||
|
||||
from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor
|
||||
from .test_extractor_base import TestExtractorBase
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def skip_ytdlp_own_methods(mocker):
|
||||
# mock this method, so that we skip the ytdlp download in these tests
|
||||
mocker.patch("auto_archiver.modules.generic_extractor.tiktok.Tiktok.skip_ytdlp_download", return_value=True)
|
||||
mocker.patch(
|
||||
"auto_archiver.modules.generic_extractor.generic_extractor.GenericExtractor.suitable_extractors",
|
||||
return_value=[e for e in yt_dlp.YoutubeDL()._ies.values() if e.IE_NAME == "TikTok"],
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def mock_get(mocker):
|
||||
return mocker.patch("auto_archiver.modules.generic_extractor.tiktok.requests.get")
|
||||
|
||||
|
||||
class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||
"""
|
||||
Test suite for the generic extractor's TikTok handling via tikwm.com.
|
||||
"""
|
||||
|
||||
extractor_module = "generic_extractor"
|
||||
extractor: GenericExtractor
|
||||
|
||||
config = {}
|
||||
|
||||
VALID_EXAMPLE_URL = "https://www.tiktok.com/@example/video/1234"
|
||||
|
||||
def test_invalid_json_responses(self, mock_get, make_item, caplog):
|
||||
mock_get.return_value.status_code = 200
|
||||
mock_get.return_value.json.side_effect = ValueError
|
||||
with caplog.at_level("DEBUG"):
|
||||
assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False
|
||||
mock_get.assert_called_once()
|
||||
mock_get.return_value.json.assert_called_once()
|
||||
# first message is just the 'Skipping using ytdlp to download files for TikTok' message
|
||||
assert (
|
||||
"failed to parse JSON response from tikwm.com for url='https://www.tiktok.com/@example/video/1234'"
|
||||
in caplog.text
|
||||
)
|
||||
|
||||
mock_get.return_value.json.side_effect = Exception
|
||||
with caplog.at_level("ERROR"):
|
||||
assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False
|
||||
mock_get.assert_called()
|
||||
assert mock_get.call_count == 2
|
||||
assert mock_get.return_value.json.call_count == 2
|
||||
assert (
|
||||
"failed to parse JSON response from tikwm.com for url='https://www.tiktok.com/@example/video/1234'"
|
||||
in caplog.text
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"response",
|
||||
[
|
||||
({"msg": "failure"}),
|
||||
({"msg": "success"}),
|
||||
],
|
||||
)
|
||||
def test_unsuccessful_responses(self, mock_get, make_item, response, caplog):
|
||||
mock_get.return_value.status_code = 200
|
||||
mock_get.return_value.json.return_value = response
|
||||
with caplog.at_level("DEBUG"):
|
||||
assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False
|
||||
mock_get.assert_called_once()
|
||||
mock_get.return_value.json.assert_called_once()
|
||||
assert "failed to get a valid response from tikwm.com" in caplog.text
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"response,has_vid",
|
||||
[
|
||||
({"data": {"id": 123}}, False),
|
||||
({"data": {"wmplay": "url"}}, True),
|
||||
({"data": {"play": "url"}}, True),
|
||||
],
|
||||
)
|
||||
def test_correct_extraction(self, mock_get, make_item, response, has_vid, mocker):
|
||||
mock_get.return_value.status_code = 200
|
||||
mock_get.return_value.json.return_value = {"msg": "success", **response}
|
||||
result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
|
||||
if not has_vid:
|
||||
assert result is False
|
||||
else:
|
||||
assert result.is_success()
|
||||
assert len(result.media) == 1
|
||||
mock_get.assert_called()
|
||||
assert mock_get.call_count == 1 + int(has_vid)
|
||||
mock_get.return_value.json.assert_called_once()
|
||||
|
||||
def test_correct_data_extracted(self, mock_get, make_item):
|
||||
mock_get.return_value.status_code = 200
|
||||
mock_get.return_value.json.return_value = {
|
||||
"msg": "success",
|
||||
"data": {
|
||||
"wmplay": "url",
|
||||
"origin_cover": "cover.jpg",
|
||||
"title": "Title",
|
||||
"id": 123,
|
||||
"duration": 60,
|
||||
"create_time": 1736301699,
|
||||
"author": "Author",
|
||||
"other": "data",
|
||||
},
|
||||
}
|
||||
|
||||
result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
|
||||
assert result.is_success()
|
||||
assert len(result.media) == 2
|
||||
assert result.get_title() == "Title"
|
||||
assert result.get("author") == "Author"
|
||||
assert result.get("api_data") == {"other": "data", "id": 123}
|
||||
assert result.media[1].get("duration") == 60
|
||||
assert result.get("timestamp") == datetime.fromtimestamp(1736301699, tz=timezone.utc)
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_video(self, make_item):
|
||||
url = "https://www.tiktok.com/@bbcnews/video/7478038212070411542"
|
||||
|
||||
result = self.extractor.download(make_item(url))
|
||||
assert result.is_success()
|
||||
assert len(result.media) == 2
|
||||
assert (
|
||||
result.get_title()
|
||||
== "The A23a iceberg is one of the world's oldest and it's so big you can see it from space. #Iceberg #A23a #Antarctica #Ice #ClimateChange #DavidAttenborough #Ocean #Sea #SouthGeorgia #BBCNews "
|
||||
)
|
||||
assert result.get("author").get("unique_id") == "bbcnews"
|
||||
assert result.get("api_data").get("id") == "7478038212070411542"
|
||||
assert result.media[1].get("duration") == 59
|
||||
assert result.get("timestamp") == datetime.fromtimestamp(1741122000, tz=timezone.utc)
|
||||
|
||||
@pytest.mark.download
|
||||
def test_download_sensitive_video(self, make_item):
|
||||
url = "https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375"
|
||||
# Required for rate limiting
|
||||
time.sleep(1.1)
|
||||
result = self.extractor.download(make_item(url))
|
||||
assert result.is_success()
|
||||
assert len(result.media) == 2
|
||||
assert result.get_title() == "Căng nhất lúc này #ggs68 #ggs68taiwan #taiwan #dailoan #tiktoknews"
|
||||
assert result.get("author").get("id") == "7197400619475649562"
|
||||
assert result.get("api_data").get("id") == "7441821351142362375"
|
||||
assert result.media[1].get("duration") == 34
|
||||
assert result.get("timestamp") == datetime.fromtimestamp(1732684060, tz=timezone.utc)
|
||||
@@ -1,6 +1,5 @@
|
||||
import os
|
||||
import datetime
|
||||
import hashlib
|
||||
import pytest
|
||||
|
||||
from pytwitter.models.media import MediaVariant
|
||||
@@ -10,8 +9,7 @@ from auto_archiver.modules.twitter_api_extractor import TwitterApiExtractor
|
||||
|
||||
@pytest.mark.incremental
|
||||
class TestTwitterApiExtractor(TestExtractorBase):
|
||||
|
||||
extractor_module = 'twitter_api_extractor'
|
||||
extractor_module: TwitterApiExtractor = "twitter_api_extractor"
|
||||
|
||||
config = {
|
||||
"bearer_tokens": [],
|
||||
@@ -22,41 +20,79 @@ class TestTwitterApiExtractor(TestExtractorBase):
|
||||
"access_secret": os.environ.get("TWITTER_ACCESS_SECRET"),
|
||||
}
|
||||
|
||||
@pytest.mark.parametrize("url, expected", [
|
||||
("https://x.com/bellingcat/status/1874097816571961839", "https://x.com/bellingcat/status/1874097816571961839"), # x.com urls unchanged
|
||||
("https://twitter.com/bellingcat/status/1874097816571961839", "https://twitter.com/bellingcat/status/1874097816571961839"), # twitter urls unchanged
|
||||
("https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"), # don't strip params from twitter urls (changed Jan 2025)
|
||||
("https://www.bellingcat.com/category/resources/", "https://www.bellingcat.com/category/resources/"), # non-twitter/x urls unchanged
|
||||
("https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"), # shouldn't strip params from non-twitter/x URLs
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"url, expected",
|
||||
[
|
||||
(
|
||||
"https://x.com/bellingcat/status/1874097816571961839",
|
||||
"https://x.com/bellingcat/status/1874097816571961839",
|
||||
), # x.com urls unchanged
|
||||
(
|
||||
"https://twitter.com/bellingcat/status/1874097816571961839",
|
||||
"https://twitter.com/bellingcat/status/1874097816571961839",
|
||||
), # twitter urls unchanged
|
||||
(
|
||||
"https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
|
||||
"https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
|
||||
), # don't strip params from twitter urls (changed Jan 2025)
|
||||
(
|
||||
"https://www.bellingcat.com/category/resources/",
|
||||
"https://www.bellingcat.com/category/resources/",
|
||||
), # non-twitter/x urls unchanged
|
||||
(
|
||||
"https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
|
||||
"https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
|
||||
), # shouldn't strip params from non-twitter/x URLs
|
||||
],
|
||||
)
|
||||
def test_sanitize_url(self, url, expected):
    """``sanitize_url`` yields the expected URL for each parametrized case."""
    sanitized = self.extractor.sanitize_url(url)
    assert expected == sanitized
|
||||
|
||||
@pytest.mark.download
|
||||
def test_sanitize_url_download(self):
|
||||
assert "https://www.bellingcat.com/category/resources/" == self.extractor.sanitize_url("https://t.co/yl3oOJatFp")
|
||||
assert "https://www.bellingcat.com/category/resources/" == self.extractor.sanitize_url(
|
||||
"https://t.co/yl3oOJatFp"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize("url, exptected_username, exptected_tweetid", [
|
||||
("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
|
||||
("https://x.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
|
||||
("https://www.bellingcat.com/category/resources/", False, False)
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"url, exptected_username, exptected_tweetid",
|
||||
[
|
||||
("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
|
||||
("https://x.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
|
||||
("https://www.bellingcat.com/category/resources/", False, False),
|
||||
],
|
||||
)
|
||||
def test_get_username_tweet_id_from_url(self, url, exptected_username, exptected_tweetid):
    """``get_username_tweet_id`` returns the expected (username, tweet id) pair for the URL."""
    parsed = self.extractor.get_username_tweet_id(url)
    username, tweet_id = parsed
    assert exptected_username == username
    assert exptected_tweetid == tweet_id
|
||||
|
||||
def test_choose_variants(self):
|
||||
# taken from the response for url https://x.com/bellingcat/status/1871552600346415571
|
||||
variant_list = [MediaVariant(content_type='application/x-mpegURL', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b'),
|
||||
MediaVariant(bit_rate=256000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/480x270/OqZIrKV0LFswMvxS.mp4?tag=12'),
|
||||
MediaVariant(bit_rate=832000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12'),
|
||||
MediaVariant(bit_rate=2176000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12')
|
||||
]
|
||||
variant_list = [
|
||||
MediaVariant(
|
||||
content_type="application/x-mpegURL",
|
||||
url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b",
|
||||
),
|
||||
MediaVariant(
|
||||
bit_rate=256000,
|
||||
content_type="video/mp4",
|
||||
url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/480x270/OqZIrKV0LFswMvxS.mp4?tag=12",
|
||||
),
|
||||
MediaVariant(
|
||||
bit_rate=832000,
|
||||
content_type="video/mp4",
|
||||
url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12",
|
||||
),
|
||||
MediaVariant(
|
||||
bit_rate=2176000,
|
||||
content_type="video/mp4",
|
||||
url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12",
|
||||
),
|
||||
]
|
||||
chosen_variant = self.extractor.choose_variant(variant_list)
|
||||
assert chosen_variant == variant_list[3]
|
||||
|
||||
|
||||
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
|
||||
@pytest.mark.download
|
||||
def test_download_nonexistent_tweet(self, make_item):
|
||||
@@ -76,7 +112,6 @@ class TestTwitterApiExtractor(TestExtractorBase):
|
||||
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
|
||||
@pytest.mark.download
|
||||
def test_download_tweet_no_media(self, make_item):
|
||||
|
||||
item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
|
||||
post = self.extractor.download(item)
|
||||
|
||||
@@ -84,7 +119,7 @@ class TestTwitterApiExtractor(TestExtractorBase):
|
||||
post,
|
||||
"Onion rings are just vegetable donuts.",
|
||||
datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
|
||||
"twitter-api: success"
|
||||
"twitter-api: success",
|
||||
)
|
||||
|
||||
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
|
||||
@@ -95,27 +130,41 @@ class TestTwitterApiExtractor(TestExtractorBase):
|
||||
self.assertValidResponseMetadata(
|
||||
post,
|
||||
"This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services https://t.co/SfBUq0hSD0 https://t.co/rIHx0WlKp8",
|
||||
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
|
||||
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc),
|
||||
)
|
||||
|
||||
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
|
||||
@pytest.mark.parametrize("url, title, timestamp", [
|
||||
("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
|
||||
("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence https://t.co/syYDSkpjZD", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
|
||||
("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive https://t.co/XE7cRdjzYq", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
|
||||
("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity https://t.co/YxCFbbhYE3", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"url, title, timestamp",
|
||||
[
|
||||
(
|
||||
"https://x.com/SozinhoRamalho/status/1876710769913450647",
|
||||
"ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1",
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
|
||||
),
|
||||
(
|
||||
"https://x.com/SozinhoRamalho/status/1876710875475681357",
|
||||
"ignore tweet, testing sensitivity warning violence https://t.co/syYDSkpjZD",
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
|
||||
),
|
||||
(
|
||||
"https://x.com/SozinhoRamalho/status/1876711053813227618",
|
||||
"ignore tweet, testing sensitivity warning sensitive https://t.co/XE7cRdjzYq",
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
|
||||
),
|
||||
(
|
||||
"https://x.com/SozinhoRamalho/status/1876711141314801937",
|
||||
"ignore tweet, testing sensitivity warning nudity, violence, sensitivity https://t.co/YxCFbbhYE3",
|
||||
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.download
|
||||
def test_download_sensitive_media(self, url, title, timestamp, check_hash, make_item):
|
||||
|
||||
"""Download tweets with sensitive media"""
|
||||
|
||||
post = self.extractor.download(make_item(url))
|
||||
self.assertValidResponseMetadata(
|
||||
post,
|
||||
title,
|
||||
timestamp
|
||||
)
|
||||
self.assertValidResponseMetadata(post, title, timestamp)
|
||||
assert len(post.media) == 1
|
||||
# check the hash (quick) of the media, to make sure it's valid (the expected digest is 64 hex characters, i.e. 256 bits, so it is not a SHA-1 value)
|
||||
check_hash(post.media[0].filename, "3eea9c03b2dcedd1eb9a169d8bfd1cf877996fab4961de019a96eb9d32d2d733")
|
||||
check_hash(post.media[0].filename, "3eea9c03b2dcedd1eb9a169d8bfd1cf877996fab4961de019a96eb9d32d2d733")
|
||||
|
||||
77
tests/extractors/test_vk_extractor.py
Normal file
77
tests/extractors/test_vk_extractor.py
Normal file
@@ -0,0 +1,77 @@
|
||||
import pytest
|
||||
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.modules.vk_extractor import VkExtractor
|
||||
|
||||
|
||||
@pytest.fixture
def mock_vk_scraper(mocker):
    """Patch the ``VkScraper`` name in the vk_extractor module and return the mock."""
    scraper_patch = mocker.patch("auto_archiver.modules.vk_extractor.vk_extractor.VkScraper")
    return scraper_patch
|
||||
|
||||
|
||||
@pytest.fixture
def vk_extractor(setup_module, mock_vk_scraper) -> VkExtractor:
    """Set up the ``vk_extractor`` module and point its ``vks`` attribute at the mocked scraper instance."""
    extractor = setup_module(
        "vk_extractor",
        {
            "username": "name",
            "password": "password123",
            "session_file": "secrets/vk_config.v2.json",
        },
    )
    # Replace the scraper with the return value of the patched VkScraper class.
    extractor.vks = mock_vk_scraper.return_value
    return extractor
|
||||
|
||||
|
||||
def test_netloc(vk_extractor, metadata):
    """``download`` returns False for the URL carried by the ``metadata`` fixture."""
    # metadata url set as: "https://example.com/"
    outcome = vk_extractor.download(metadata)
    assert outcome is False
|
||||
|
||||
|
||||
def test_vk_url_but_scrape_returns_empty(vk_extractor, metadata):
    """A vk.com URL whose scrape yields nothing makes ``download`` return False."""
    scraper = vk_extractor.vks
    scraper.scrape.return_value = []
    metadata.set_url("https://vk.com/valid-wall")

    outcome = vk_extractor.download(metadata)

    assert outcome is False
    assert metadata.netloc == "vk.com"
    scraper.scrape.assert_called_once_with(metadata.get_url())
|
||||
|
||||
|
||||
def test_successful_scrape_and_download(vk_extractor, metadata, mocker):
    """Scraped posts and downloaded files end up in the returned metadata and its media list."""
    scraped_posts = [
        {"text": "Post Title", "datetime": "2023-01-01T00:00:00", "id": 1},
        {"text": "Another Post", "datetime": "2023-01-02T00:00:00", "id": 2},
    ]
    downloaded_files = ["image1.jpg", "image2.png"]
    scraper = vk_extractor.vks
    scraper.scrape.return_value = scraped_posts
    scraper.download_media.return_value = downloaded_files
    metadata.set_url("https://vk.com/valid-wall")

    outcome = vk_extractor.download(metadata)

    # Test metadata
    assert outcome.is_success()
    assert outcome.status == "vk: success"
    assert outcome.get_title() == "Post Title"
    assert outcome.get_timestamp() == "2023-01-01T00:00:00+00:00"
    assert "Another Post" in outcome.metadata["content"]
    # Test Media objects
    assert len(outcome.media) == 2
    first_media, second_media = outcome.media
    assert first_media.filename == "image1.jpg"
    assert second_media.filename == "image2.png"
    scraper.download_media.assert_called_once_with(scraped_posts, vk_extractor.tmp_dir)
|
||||
|
||||
|
||||
def test_adds_first_title_and_timestamp(vk_extractor):
    """The title and timestamp of the result come from the first scraped post.

    Starts from a fresh ``Metadata`` carrying only a URL, with the scraper mocked
    to return two posts and no downloaded media.
    """
    # The URL is set once here; the original repeated the identical set_url call on the next line.
    metadata = Metadata().set_url("https://vk.com/no-metadata")
    mock_scrapes = [
        {"text": "value", "datetime": "2023-01-01T00:00:00"},
        {"text": "value2", "datetime": "2023-01-02T00:00:00"},
    ]
    vk_extractor.vks.scrape.return_value = mock_scrapes
    vk_extractor.vks.download_media.return_value = []

    result = vk_extractor.download(metadata)

    assert result.get_title() == "value"
    # formatted timestamp
    assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
    assert result.is_success()
|
||||
Reference in New Issue
Block a user