Merge main

This commit is contained in:
Patrick Robertson
2025-03-17 10:05:11 +00:00
229 changed files with 61430 additions and 3147 deletions

View File

@@ -7,7 +7,6 @@ from auto_archiver.core.extractor import Extractor
class TestExtractorBase(object):
extractor_module: str = None
config: dict = None
@@ -17,7 +16,7 @@ class TestExtractorBase(object):
assert self.config is not None, "self.config must be a dict set on the subclass"
self.extractor: Type[Extractor] = setup_module(self.extractor_module, self.config)
def assertValidResponseMetadata(self, test_response: Metadata, title: str, timestamp: str, status: str = ""):
assert test_response is not False

View File

@@ -9,26 +9,28 @@ import pytest
from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor
from .test_extractor_base import TestExtractorBase
CI=os.getenv("GITHUB_ACTIONS", '') == 'true'
CI = os.getenv("GITHUB_ACTIONS", "") == "true"
class TestGenericExtractor(TestExtractorBase):
"""Tests Generic Extractor
"""
extractor_module = 'generic_extractor'
"""Tests Generic Extractor"""
extractor_module = "generic_extractor"
extractor: GenericExtractor
config = {
'subtitles': False,
'comments': False,
'livestreams': False,
'live_from_start': False,
'end_means_success': True,
'allow_playlist': False,
'max_downloads': "inf",
'proxy': None,
'cookies_from_browser': False,
'cookie_file': None,
}
"subtitles": False,
"comments": False,
"livestreams": False,
"live_from_start": False,
"end_means_success": True,
"allow_playlist": False,
"max_downloads": "inf",
"proxy": None,
"cookies_from_browser": False,
"cookie_file": None,
}
def test_load_dropin(self):
# test loading dropins that are in the generic_archiver package
package = "auto_archiver.modules.generic_extractor"
@@ -38,32 +40,42 @@ class TestGenericExtractor(TestExtractorBase):
path = os.path.join(dirname(dirname(__file__)), "data/")
assert self.extractor.dropin_for_name("dropin", additional_paths=[path])
@pytest.mark.parametrize("url, suitable_extractors", [
("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),])
@pytest.mark.parametrize(
"url, suitable_extractors",
[
("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),
],
)
def test_suitable_extractors(self, url, suitable_extractors):
suitable_extractors = suitable_extractors + ['generic'] # the generic is valid for all
suitable_extractors = suitable_extractors + ["generic"] # the generic is valid for all
extractors = list(self.extractor.suitable_extractors(url))
assert len(extractors) == len(suitable_extractors)
assert [e.ie_key().lower() for e in extractors] == suitable_extractors
@pytest.mark.parametrize("url, is_suitable", [
("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
("https://www.facebook.com/nytimes/videos/10160796550110716", True),
("https://www.twitch.tv/videos/1167226570", True),
("https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/", True),
("https://google.com", True)])
@pytest.mark.parametrize(
"url, is_suitable",
[
("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
("https://www.facebook.com/nytimes/videos/10160796550110716", True),
("https://www.twitch.tv/videos/1167226570", True),
(
"https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/",
True,
),
("https://google.com", True),
],
)
def test_suitable_urls(self, url, is_suitable):
"""
Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs
This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has extractors for,
and then if and only if all archivers fails, does it fall back to the generic archiver)
Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs
This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has extractors for,
and then if and only if all archivers fails, does it fall back to the generic archiver)
"""
assert self.extractor.suitable(url) == is_suitable
@@ -74,12 +86,15 @@ class TestGenericExtractor(TestExtractorBase):
assert result.get_url() == "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970"
@pytest.mark.download
@pytest.mark.parametrize("url", [
"https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l",
"twitter.com/bellingcat/status/123",
"https://www.youtube.com/watch?v=1"
])
def test_download_nonexistend_media(self, make_item, url):
@pytest.mark.parametrize(
"url",
[
"https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l",
"twitter.com/bellingcat/status/123",
"https://www.youtube.com/watch?v=1",
],
)
def test_download_nonexistent_media(self, make_item, url):
"""
Test to make sure that the extractor doesn't break on non-existent posts/media
@@ -89,7 +104,10 @@ class TestGenericExtractor(TestExtractorBase):
result = self.extractor.download(item)
assert not result
@pytest.mark.skipif(CI, reason="Currently no way to authenticate when on CI. Youtube (yt-dlp) doesn't support logging in with username/password.")
@pytest.mark.skipif(
CI,
reason="Currently no way to authenticate when on CI. Youtube (yt-dlp) doesn't support logging in with username/password.",
)
@pytest.mark.download
def test_youtube_download(self, make_item):
# url https://www.youtube.com/watch?v=5qap5aO4i9A
@@ -98,7 +116,10 @@ class TestGenericExtractor(TestExtractorBase):
result = self.extractor.download(item)
assert result.get_url() == "https://www.youtube.com/watch?v=J---aiyznGQ"
assert result.get_title() == "Keyboard Cat! - THE ORIGINAL!"
assert result.get('description') == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
assert (
result.get("description")
== "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
)
assert len(result.media) == 2
assert Path(result.media[0].filename).name == "J---aiyznGQ.webm"
assert Path(result.media[1].filename).name == "hqdefault.jpg"
@@ -114,7 +135,7 @@ class TestGenericExtractor(TestExtractorBase):
item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfn3hbcxgc2q")
result = self.extractor.download(item)
assert result is not False
@pytest.mark.download
def test_bluesky_download_no_media(self, make_item):
item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfphwmcs4c2z")
@@ -126,7 +147,7 @@ class TestGenericExtractor(TestExtractorBase):
item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
result = self.extractor.download(item)
assert result is not False
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
@pytest.mark.download
def test_truthsocial_download_video(self, make_item):
@@ -141,14 +162,14 @@ class TestGenericExtractor(TestExtractorBase):
item = make_item("https://truthsocial.com/@bbcnewa/posts/109598702184774628")
result = self.extractor.download(item)
assert result is not False
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
@pytest.mark.download
def test_truthsocial_download_poll(self, make_item):
item = make_item("https://truthsocial.com/@CNN_US/posts/113724326568555098")
result = self.extractor.download(item)
assert result is not False
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
@pytest.mark.download
def test_truthsocial_download_single_image(self, make_item):
@@ -170,7 +191,7 @@ class TestGenericExtractor(TestExtractorBase):
url = "https://x.com/Bellingcat/status/17197025860711058"
response = self.extractor.download(make_item(url))
assert not response
@pytest.mark.download
def test_twitter_download_malformed_tweetid(self, make_item):
# this tweet does not exist
@@ -180,7 +201,6 @@ class TestGenericExtractor(TestExtractorBase):
@pytest.mark.download
def test_twitter_download_tweet_no_media(self, make_item):
item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
post = self.extractor.download(item)
@@ -188,9 +208,9 @@ class TestGenericExtractor(TestExtractorBase):
post,
"Onion rings are just vegetable donuts.",
datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
"yt-dlp_Twitter: success"
"yt-dlp_Twitter: success",
)
@pytest.mark.download
def test_twitter_download_video(self, make_item):
url = "https://x.com/bellingcat/status/1871552600346415571"
@@ -198,33 +218,52 @@ class TestGenericExtractor(TestExtractorBase):
self.assertValidResponseMetadata(
post,
"Bellingcat - This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services",
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc),
)
@pytest.mark.xfail(reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented")
@pytest.mark.xfail(
reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented"
)
@pytest.mark.download
@pytest.mark.parametrize("url, title, timestamp, image_hash", [
("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
])
@pytest.mark.parametrize(
"url, title, timestamp, image_hash",
[
(
"https://x.com/SozinhoRamalho/status/1876710769913450647",
"ignore tweet, testing sensitivity warning nudity",
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
"image_hash",
),
(
"https://x.com/SozinhoRamalho/status/1876710875475681357",
"ignore tweet, testing sensitivity warning violence",
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
"image_hash",
),
(
"https://x.com/SozinhoRamalho/status/1876711053813227618",
"ignore tweet, testing sensitivity warning sensitive",
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
"image_hash",
),
(
"https://x.com/SozinhoRamalho/status/1876711141314801937",
"ignore tweet, testing sensitivity warning nudity, violence, sensitivity",
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
"image_hash",
),
],
)
def test_twitter_download_sensitive_media(self, url, title, timestamp, image_hash, make_item):
"""Download tweets with sensitive media"""
post = self.extractor.download(make_item(url))
self.assertValidResponseMetadata(
post,
title,
timestamp
)
self.assertValidResponseMetadata(post, title, timestamp)
assert len(post.media) == 1
assert post.media[0].hash == image_hash
@pytest.mark.download
def test_download_facebook_video(self, make_item):
post = self.extractor.download(make_item("https://www.facebook.com/bellingcat/videos/588371253839133"))
assert len(post.media) == 2
assert post.media[0].filename.endswith("588371253839133.mp4")
@@ -234,11 +273,12 @@ class TestGenericExtractor(TestExtractorBase):
assert post.media[1].mimetype == "image/jpeg"
assert "Bellingchat Premium is with Kolina Koltai" in post.get_title()
@pytest.mark.download
def test_download_facebook_image(self, make_item):
post = self.extractor.download(make_item("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/"))
post = self.extractor.download(
make_item("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/")
)
assert len(post.media) == 1
assert post.media[0].filename.endswith(".png")
@@ -248,5 +288,5 @@ class TestGenericExtractor(TestExtractorBase):
def test_download_facebook_text_only(self, make_item):
url = "https://www.facebook.com/bellingcat/posts/pfbid02rzpwZxAZ8bLkAX8NvHv4DWAidFaqAUfJMbo9vWkpwxL7uMUWzWMiizXLWRSjwihVl"
post = self.extractor.download(make_item(url))
assert "Bellingcat researcher Kolina Koltai delves deeper into Clothoff" in post.get('content')
assert "Bellingcat researcher Kolina Koltai delves deeper into Clothoff" in post.get("content")
assert post.get_title() == "Bellingcat"

View File

@@ -15,10 +15,11 @@ def mock_user_response():
"username": "test_user",
"full_name": "Test User",
"profile_pic_url_hd": "http://example.com/profile.jpg",
"profile_pic_url": "http://example.com/profile_lowres.jpg"
"profile_pic_url": "http://example.com/profile_lowres.jpg",
}
}
@pytest.fixture
def mock_post_response():
return {
@@ -27,16 +28,14 @@ def mock_post_response():
"caption_text": "Test Caption",
"taken_at": datetime.now().timestamp(),
"video_url": "http://example.com/video.mp4",
"thumbnail_url": "http://example.com/thumbnail.jpg"
"thumbnail_url": "http://example.com/thumbnail.jpg",
}
@pytest.fixture
def mock_story_response():
return [{
"id": "story_123",
"taken_at": datetime.now().timestamp(),
"video_url": "http://example.com/story.mp4"
}]
return [{"id": "story_123", "taken_at": datetime.now().timestamp(), "video_url": "http://example.com/story.mp4"}]
@pytest.fixture
def mock_highlight_response():
@@ -46,11 +45,13 @@ def mock_highlight_response():
"highlight:123": {
"id": "123",
"title": "Test Highlight",
"items": [{
"id": "item_123",
"taken_at": datetime.now().timestamp(),
"video_url": "http://example.com/highlight.mp4"
}]
"items": [
{
"id": "item_123",
"taken_at": datetime.now().timestamp(),
"video_url": "http://example.com/highlight.mp4",
}
],
}
}
}
@@ -81,24 +82,30 @@ class TestInstagramAPIExtractor(TestExtractorBase):
m.set("netloc", "instagram.com")
return m
@pytest.mark.parametrize("url,expected", [
("https://instagram.com/user", [("", "user", "")]),
("https://instagr.am/p/post_id", []),
("https://youtube.com", []),
("https://www.instagram.com/reel/reel_id", [("reel", "reel_id", "")]),
("https://instagram.com/stories/highlights/123", [("stories/highlights", "123", "")]),
("https://instagram.com/stories/user/123", [("stories", "user", "123")]),
])
@pytest.mark.parametrize(
"url,expected",
[
("https://instagram.com/user", [("", "user", "")]),
("https://instagr.am/p/post_id", []),
("https://youtube.com", []),
("https://www.instagram.com/reel/reel_id", [("reel", "reel_id", "")]),
("https://instagram.com/stories/highlights/123", [("stories/highlights", "123", "")]),
("https://instagram.com/stories/user/123", [("stories", "user", "123")]),
],
)
def test_url_parsing(self, url, expected):
assert self.extractor.valid_url.findall(url) == expected
def test_initialize(self):
assert self.extractor.api_endpoint[-1] != "/"
@pytest.mark.parametrize("input_dict,expected", [
({"x": 0, "valid": "data"}, {"valid": "data"}),
({"nested": {"y": None, "valid": [{}]}}, {"nested": {"valid": [{}]}}),
])
@pytest.mark.parametrize(
"input_dict,expected",
[
({"x": 0, "valid": "data"}, {"valid": "data"}),
({"nested": {"y": None, "valid": [{}]}}, {"nested": {"valid": [{}]}}),
],
)
def test_cleanup_dict(self, input_dict, expected):
assert self.extractor.cleanup_dict(input_dict) == expected
@@ -114,8 +121,8 @@ class TestInstagramAPIExtractor(TestExtractorBase):
def test_download_profile_basic(self, metadata, mock_user_response, mocker):
"""Test basic profile download without full_profile"""
mock_call = mocker.patch.object(self.extractor, 'call_api')
mock_download = mocker.patch.object(self.extractor, 'download_from_url')
mock_call = mocker.patch.object(self.extractor, "call_api")
mock_download = mocker.patch.object(self.extractor, "download_from_url")
# Mock API responses
mock_call.return_value = mock_user_response
mock_download.return_value = "profile.jpg"
@@ -132,17 +139,14 @@ class TestInstagramAPIExtractor(TestExtractorBase):
def test_download_profile_full(self, metadata, mock_user_response, mock_story_response, mocker):
"""Test full profile download with stories/posts"""
mock_call = mocker.patch.object(self.extractor, 'call_api')
mock_posts = mocker.patch.object(self.extractor, 'download_all_posts')
mock_highlights = mocker.patch.object(self.extractor, 'download_all_highlights')
mock_tagged = mocker.patch.object(self.extractor, 'download_all_tagged')
mock_stories = mocker.patch.object(self.extractor, '_download_stories_reusable')
mock_call = mocker.patch.object(self.extractor, "call_api")
mock_posts = mocker.patch.object(self.extractor, "download_all_posts")
mock_highlights = mocker.patch.object(self.extractor, "download_all_highlights")
mock_tagged = mocker.patch.object(self.extractor, "download_all_tagged")
mock_stories = mocker.patch.object(self.extractor, "_download_stories_reusable")
self.extractor.full_profile = True
mock_call.side_effect = [
mock_user_response,
mock_story_response
]
mock_call.side_effect = [mock_user_response, mock_story_response]
mock_highlights.return_value = None
mock_stories.return_value = mock_story_response
mock_posts.return_value = None
@@ -155,7 +159,7 @@ class TestInstagramAPIExtractor(TestExtractorBase):
def test_download_profile_not_found(self, metadata, mocker):
"""Test profile not found error"""
mock_call = mocker.patch.object(self.extractor, 'call_api')
mock_call = mocker.patch.object(self.extractor, "call_api")
mock_call.return_value = {"user": None}
with pytest.raises(AssertionError) as exc_info:
self.extractor.download_profile(metadata, "invalid_user")
@@ -163,18 +167,14 @@ class TestInstagramAPIExtractor(TestExtractorBase):
def test_download_profile_error_handling(self, metadata, mock_user_response, mocker):
"""Test error handling in full profile mode"""
mock_call = mocker.patch.object(self.extractor, 'call_api')
mock_highlights = mocker.patch.object(self.extractor, 'download_all_highlights')
mock_tagged = mocker.patch.object(self.extractor, 'download_all_tagged')
stories_tagged = mocker.patch.object(self.extractor, '_download_stories_reusable')
mock_posts = mocker.patch.object(self.extractor, 'download_all_posts')
mock_call = mocker.patch.object(self.extractor, "call_api")
mock_highlights = mocker.patch.object(self.extractor, "download_all_highlights")
mock_tagged = mocker.patch.object(self.extractor, "download_all_tagged")
stories_tagged = mocker.patch.object(self.extractor, "_download_stories_reusable")
mock_posts = mocker.patch.object(self.extractor, "download_all_posts")
self.extractor.full_profile = True
mock_call.side_effect = [
mock_user_response,
Exception("Stories API failed"),
Exception("Posts API failed")
]
mock_call.side_effect = [mock_user_response, Exception("Stories API failed"), Exception("Posts API failed")]
mock_highlights.return_value = None
mock_tagged.return_value = None
stories_tagged.return_value = None
@@ -182,4 +182,4 @@ class TestInstagramAPIExtractor(TestExtractorBase):
result = self.extractor.download_profile(metadata, "test_user")
assert result.is_success()
assert "Error downloading stories for test_user" in result.metadata["errors"]
assert "Error downloading stories for test_user" in result.metadata["errors"]

View File

@@ -1,21 +1,41 @@
import pytest
from auto_archiver.modules.instagram_extractor import InstagramExtractor
from .test_extractor_base import TestExtractorBase
class TestInstagramExtractor(TestExtractorBase):
extractor_module: str = 'instagram_extractor'
config: dict = {}
@pytest.fixture
def instagram_extractor(setup_module, mocker):
extractor_module: str = "instagram_extractor"
config: dict = {
"username": "user_name",
"password": "password123",
"download_folder": "instaloader",
"session_file": "secrets/instaloader.session",
}
fake_loader = mocker.MagicMock()
fake_loader.load_session_from_file.return_value = None
fake_loader.login.return_value = None
fake_loader.save_session_to_file.return_value = None
mocker.patch(
"instaloader.Instaloader",
return_value=fake_loader,
)
return setup_module(extractor_module, config)
@pytest.mark.parametrize("url", [
@pytest.mark.parametrize(
"url",
[
"https://www.instagram.com/p/",
"https://www.instagram.com/p/1234567890/",
"https://www.instagram.com/reel/1234567890/",
"https://www.instagram.com/username/",
"https://www.instagram.com/username/stories/",
"https://www.instagram.com/username/highlights/",
])
def test_regex_matches(self, url):
# post
assert InstagramExtractor.valid_url.match(url)
],
)
def test_regex_matches(url: str, instagram_extractor: InstagramExtractor) -> None:
"""
Ensure that the valid_url regex matches all provided Instagram URLs.
"""
assert instagram_extractor.valid_url.match(url)

View File

@@ -7,10 +7,16 @@ from auto_archiver.modules.instagram_tbot_extractor import InstagramTbotExtracto
from tests.extractors.test_extractor_base import TestExtractorBase
@pytest.fixture(autouse=True)
def mock_sleep(mocker):
"""Mock time.sleep to avoid delays."""
return mocker.patch("time.sleep")
@pytest.fixture
def patch_extractor_methods(request, setup_module, mocker):
mocker.patch.object(InstagramTbotExtractor, '_prepare_session_file', return_value=None)
mocker.patch.object(InstagramTbotExtractor, '_initialize_telegram_client', return_value=None)
mocker.patch.object(InstagramTbotExtractor, "_prepare_session_file", return_value=None)
mocker.patch.object(InstagramTbotExtractor, "_initialize_telegram_client", return_value=None)
yield
@@ -35,12 +41,7 @@ def mock_telegram_client(mocker):
@pytest.fixture
def extractor(setup_module, patch_extractor_methods, mocker):
extractor_module = "instagram_tbot_extractor"
config = {
"api_id": 12345,
"api_hash": "test_api_hash",
"session_file": "test_session",
"timeout": 4
}
config = {"api_id": 12345, "api_hash": "test_api_hash", "session_file": "test_session", "timeout": 4}
extractor = setup_module(extractor_module, config)
extractor.client = mocker.MagicMock()
extractor.session_file = "test_session"
@@ -79,21 +80,30 @@ class TestInstagramTbotExtractorReal(TestExtractorBase):
"session_file": "secrets/anon-insta",
}
@pytest.mark.parametrize("url, expected_status, message, len_media", [
("https://www.instagram.com/p/C4QgLbrIKXG", "insta-via-bot: success",
"Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou",
6),
("https://www.instagram.com/reel/DEVLK8qoIbg/", "insta-via-bot: success",
"Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol",
3),
# instagram tbot not working (potentially intermittently?) for stories - replace with a live story to retest
# ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, "Media not found or unavailable"),
# Seems to be working intermittently for highlights
# ("https://www.instagram.com/stories/highlights/17868810693068139/", "insta-via-bot: success", None, 50),
# Marking invalid url as success
("https://www.instagram.com/p/INVALID", "insta-via-bot: success", "Media not found or unavailable", 0),
("https://www.youtube.com/watch?v=ymCMy8OffHM", False, None, 0),
])
@pytest.mark.parametrize(
"url, expected_status, message, len_media",
[
(
"https://www.instagram.com/p/C4QgLbrIKXG",
"insta-via-bot: success",
"Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou",
6,
),
(
"https://www.instagram.com/reel/DEVLK8qoIbg/",
"insta-via-bot: success",
"Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol",
3,
),
# instagram tbot not working (potentially intermittently?) for stories - replace with a live story to retest
# ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, "Media not found or unavailable"),
# Seems to be working intermittently for highlights
# ("https://www.instagram.com/stories/highlights/17868810693068139/", "insta-via-bot: success", None, 50),
# Marking invalid url as success
("https://www.instagram.com/p/INVALID", "insta-via-bot: success", "Media not found or unavailable", 0),
("https://www.youtube.com/watch?v=ymCMy8OffHM", False, None, 0),
],
)
def test_download(self, url, expected_status, message, len_media, metadata_sample):
"""Test the `download()` method with various Instagram URLs."""
metadata_sample.set_url(url)

View File

@@ -0,0 +1,151 @@
from datetime import datetime, timezone
import time
import pytest
import yt_dlp
from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor
from .test_extractor_base import TestExtractorBase
@pytest.fixture(autouse=True)
def skip_ytdlp_own_methods(mocker):
    """Autouse fixture that stubs the yt-dlp related hooks for every test in this module.

    Two attributes are patched:

    * ``Tiktok.skip_ytdlp_download`` is replaced by a mock returning ``True``,
      so that the yt-dlp download is skipped in these tests.
    * ``GenericExtractor.suitable_extractors`` is replaced by a mock returning only
      the yt-dlp extractors whose ``IE_NAME`` equals ``"TikTok"``.
    """
    mocker.patch(
        "auto_archiver.modules.generic_extractor.tiktok.Tiktok.skip_ytdlp_download",
        return_value=True,
    )

    # Collect the TikTok extractor(s) from a fresh YoutubeDL instance.
    tiktok_only = []
    for info_extractor in yt_dlp.YoutubeDL()._ies.values():
        if info_extractor.IE_NAME == "TikTok":
            tiktok_only.append(info_extractor)

    mocker.patch(
        "auto_archiver.modules.generic_extractor.generic_extractor.GenericExtractor.suitable_extractors",
        return_value=tiktok_only,
    )
@pytest.fixture()
def mock_get(mocker):
    """Patch ``requests.get`` as referenced from the tiktok dropin module and return the mock."""
    patched_get = mocker.patch("auto_archiver.modules.generic_extractor.tiktok.requests.get")
    return patched_get
class TestTiktokTikwmExtractor(TestExtractorBase):
    """
    Tests for downloading TikTok URLs through the generic extractor.

    Most tests receive the ``mock_get`` fixture and feed canned responses to
    ``download``; the two tests marked ``download`` call ``download`` without it.
    """

    extractor_module = "generic_extractor"
    extractor: GenericExtractor

    config = {}

    VALID_EXAMPLE_URL = "https://www.tiktok.com/@example/video/1234"

    def test_invalid_json_responses(self, mock_get, make_item, caplog):
        """``download`` returns False and logs a parse failure when ``.json()`` raises."""
        expected_log = (
            "failed to parse JSON response from tikwm.com for url='https://www.tiktok.com/@example/video/1234'"
        )
        fake_response = mock_get.return_value
        fake_response.status_code = 200

        # First attempt: decoding the body raises ValueError.
        fake_response.json.side_effect = ValueError
        with caplog.at_level("DEBUG"):
            assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False
        mock_get.assert_called_once()
        fake_response.json.assert_called_once()
        # first message is just the 'Skipping using ytdlp to download files for TikTok' message
        assert expected_log in caplog.text

        # Second attempt: decoding the body raises a bare Exception.
        fake_response.json.side_effect = Exception
        with caplog.at_level("ERROR"):
            assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False
        mock_get.assert_called()
        assert mock_get.call_count == 2
        assert fake_response.json.call_count == 2
        # NOTE(review): caplog.text is not cleared between the two attempts, so this
        # check is presumably already satisfied by the first attempt's record - confirm
        # whether a caplog.clear() before the second attempt is wanted.
        assert expected_log in caplog.text

    @pytest.mark.parametrize(
        "response",
        [
            {"msg": "failure"},
            {"msg": "success"},
        ],
    )
    def test_unsuccessful_responses(self, mock_get, make_item, response, caplog):
        """A payload with a non-success ``msg``, or ``success`` without ``data``, yields False and a log line."""
        fake_response = mock_get.return_value
        fake_response.status_code = 200
        fake_response.json.return_value = response

        with caplog.at_level("DEBUG"):
            outcome = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
            assert outcome is False

        mock_get.assert_called_once()
        fake_response.json.assert_called_once()
        assert "failed to get a valid response from tikwm.com" in caplog.text

    @pytest.mark.parametrize(
        "response,has_vid",
        [
            ({"data": {"id": 123}}, False),
            ({"data": {"wmplay": "url"}}, True),
            ({"data": {"play": "url"}}, True),
        ],
    )
    def test_correct_extraction(self, mock_get, make_item, response, has_vid, mocker):
        """A ``success`` payload only produces a result when ``data`` has a ``wmplay`` or ``play`` entry."""
        fake_response = mock_get.return_value
        fake_response.status_code = 200
        fake_response.json.return_value = {"msg": "success", **response}

        outcome = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
        if has_vid:
            assert outcome.is_success()
            assert len(outcome.media) == 1
        else:
            assert outcome is False

        mock_get.assert_called()
        # One additional requests.get call is expected when a video URL is present.
        expected_get_calls = 1 + int(has_vid)
        assert mock_get.call_count == expected_get_calls
        fake_response.json.assert_called_once()

    def test_correct_data_extracted(self, mock_get, make_item):
        """The fields of a full ``data`` payload end up on the returned result as asserted below."""
        payload_data = {
            "wmplay": "url",
            "origin_cover": "cover.jpg",
            "title": "Title",
            "id": 123,
            "duration": 60,
            "create_time": 1736301699,
            "author": "Author",
            "other": "data",
        }
        fake_response = mock_get.return_value
        fake_response.status_code = 200
        fake_response.json.return_value = {"msg": "success", "data": payload_data}

        outcome = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))

        assert outcome.is_success()
        assert len(outcome.media) == 2
        assert outcome.get_title() == "Title"
        assert outcome.get("author") == "Author"
        assert outcome.get("api_data") == {"other": "data", "id": 123}
        assert outcome.media[1].get("duration") == 60
        expected_timestamp = datetime.fromtimestamp(1736301699, tz=timezone.utc)
        assert outcome.get("timestamp") == expected_timestamp

    @pytest.mark.download
    def test_download_video(self, make_item):
        """Download a BBC News TikTok video and check the extracted metadata."""
        item = make_item("https://www.tiktok.com/@bbcnews/video/7478038212070411542")
        outcome = self.extractor.download(item)

        assert outcome.is_success()
        assert len(outcome.media) == 2
        expected_title = "The A23a iceberg is one of the world's oldest and it's so big you can see it from space. #Iceberg #A23a #Antarctica #Ice #ClimateChange #DavidAttenborough #Ocean #Sea #SouthGeorgia #BBCNews "
        assert outcome.get_title() == expected_title
        assert outcome.get("author").get("unique_id") == "bbcnews"
        assert outcome.get("api_data").get("id") == "7478038212070411542"
        assert outcome.media[1].get("duration") == 59
        expected_timestamp = datetime.fromtimestamp(1741122000, tz=timezone.utc)
        assert outcome.get("timestamp") == expected_timestamp

    @pytest.mark.download
    def test_download_sensitive_video(self, make_item):
        """Download the TikTok video this test labels as sensitive and check the extracted metadata."""
        item_url = "https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375"

        # Required for rate limiting
        time.sleep(1.1)

        outcome = self.extractor.download(make_item(item_url))

        assert outcome.is_success()
        assert len(outcome.media) == 2
        assert outcome.get_title() == "Căng nhất lúc này #ggs68 #ggs68taiwan #taiwan #dailoan #tiktoknews"
        assert outcome.get("author").get("id") == "7197400619475649562"
        assert outcome.get("api_data").get("id") == "7441821351142362375"
        assert outcome.media[1].get("duration") == 34
        expected_timestamp = datetime.fromtimestamp(1732684060, tz=timezone.utc)
        assert outcome.get("timestamp") == expected_timestamp

View File

@@ -1,6 +1,5 @@
import os
import datetime
import hashlib
import pytest
from pytwitter.models.media import MediaVariant
@@ -10,8 +9,7 @@ from auto_archiver.modules.twitter_api_extractor import TwitterApiExtractor
@pytest.mark.incremental
class TestTwitterApiExtractor(TestExtractorBase):
extractor_module = 'twitter_api_extractor'
extractor_module: TwitterApiExtractor = "twitter_api_extractor"
config = {
"bearer_tokens": [],
@@ -22,41 +20,79 @@ class TestTwitterApiExtractor(TestExtractorBase):
"access_secret": os.environ.get("TWITTER_ACCESS_SECRET"),
}
@pytest.mark.parametrize(
    "url, expected",
    [
        (
            "https://x.com/bellingcat/status/1874097816571961839",
            "https://x.com/bellingcat/status/1874097816571961839",
        ),  # x.com urls unchanged
        (
            "https://twitter.com/bellingcat/status/1874097816571961839",
            "https://twitter.com/bellingcat/status/1874097816571961839",
        ),  # twitter urls unchanged
        (
            "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
            "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
        ),  # don't strip params from twitter urls (changed Jan 2025)
        (
            "https://www.bellingcat.com/category/resources/",
            "https://www.bellingcat.com/category/resources/",
        ),  # non-twitter/x urls unchanged
        (
            "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
            "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
        ),  # shouldn't strip params from non-twitter/x URLs
    ],
)
def test_sanitize_url(self, url, expected):
    """Check that ``sanitize_url`` returns each listed URL unchanged.

    Every case pairs an input URL with an identical expected output, covering
    x.com, twitter.com (with and without query parameters) and a non-Twitter
    site (with and without query parameters).

    Args:
        url: The URL handed to ``self.extractor.sanitize_url``.
        expected: The URL the extractor is expected to return.
    """
    assert expected == self.extractor.sanitize_url(url)
@pytest.mark.download
def test_sanitize_url_download(self):
    """Check that ``sanitize_url`` maps a t.co short link to its target URL.

    The expected result for ``https://t.co/yl3oOJatFp`` is the bellingcat
    resources page. NOTE(review): the ``download`` marker suggests this needs
    network access to resolve the short link — confirm against the marker's
    definition.
    """
    assert "https://www.bellingcat.com/category/resources/" == self.extractor.sanitize_url(
        "https://t.co/yl3oOJatFp"
    )
@pytest.mark.parametrize(
    "url, exptected_username, exptected_tweetid",
    [
        ("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
        ("https://x.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
        ("https://www.bellingcat.com/category/resources/", False, False),
    ],
)
def test_get_username_tweet_id_from_url(self, url, exptected_username, exptected_tweetid):
    """Check the (username, tweet id) pair extracted from a URL.

    twitter.com and x.com status URLs are expected to yield the account name
    and the status id as strings; the non-Twitter URL is expected to yield
    ``(False, False)``.

    Args:
        url: The URL handed to ``self.extractor.get_username_tweet_id``.
        exptected_username: Expected first element of the returned pair.
        exptected_tweetid: Expected second element of the returned pair.
    """
    username, tweet_id = self.extractor.get_username_tweet_id(url)
    assert exptected_username == username
    assert exptected_tweetid == tweet_id
def test_choose_variants(self):
    """Check which entry ``choose_variant`` selects from a list of media variants.

    The list holds one HLS playlist entry without a bit rate and three MP4
    entries with increasing bit rates; the test expects the last entry
    (1280x720, bit_rate=2176000) to be chosen.
    """
    # taken from the response for url https://x.com/bellingcat/status/1871552600346415571
    variant_list = [
        MediaVariant(
            content_type="application/x-mpegURL",
            url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b",
        ),
        MediaVariant(
            bit_rate=256000,
            content_type="video/mp4",
            url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/480x270/OqZIrKV0LFswMvxS.mp4?tag=12",
        ),
        MediaVariant(
            bit_rate=832000,
            content_type="video/mp4",
            url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12",
        ),
        MediaVariant(
            bit_rate=2176000,
            content_type="video/mp4",
            url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12",
        ),
    ]
    chosen_variant = self.extractor.choose_variant(variant_list)
    assert chosen_variant == variant_list[3]
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
@pytest.mark.download
def test_download_nonexistent_tweet(self, make_item):
@@ -76,7 +112,6 @@ class TestTwitterApiExtractor(TestExtractorBase):
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
@pytest.mark.download
def test_download_tweet_no_media(self, make_item):
item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
post = self.extractor.download(item)
@@ -84,7 +119,7 @@ class TestTwitterApiExtractor(TestExtractorBase):
post,
"Onion rings are just vegetable donuts.",
datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
"twitter-api: success"
"twitter-api: success",
)
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
@@ -95,27 +130,41 @@ class TestTwitterApiExtractor(TestExtractorBase):
self.assertValidResponseMetadata(
post,
"This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services https://t.co/SfBUq0hSD0 https://t.co/rIHx0WlKp8",
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc),
)
@pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
@pytest.mark.parametrize(
    "url, title, timestamp",
    [
        (
            "https://x.com/SozinhoRamalho/status/1876710769913450647",
            "ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1",
            datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
        ),
        (
            "https://x.com/SozinhoRamalho/status/1876710875475681357",
            "ignore tweet, testing sensitivity warning violence https://t.co/syYDSkpjZD",
            datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
        ),
        (
            "https://x.com/SozinhoRamalho/status/1876711053813227618",
            "ignore tweet, testing sensitivity warning sensitive https://t.co/XE7cRdjzYq",
            datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
        ),
        (
            "https://x.com/SozinhoRamalho/status/1876711141314801937",
            "ignore tweet, testing sensitivity warning nudity, violence, sensitivity https://t.co/YxCFbbhYE3",
            datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
        ),
    ],
)
@pytest.mark.download
def test_download_sensitive_media(self, url, title, timestamp, check_hash, make_item):
    """Download tweets with sensitive media.

    Skipped when ``TWITTER_BEARER_TOKEN`` is not set. For each tweet the test
    validates the response metadata, expects exactly one media entry, and
    compares that file's digest against a fixed value.

    Args:
        url: Tweet URL to download.
        title: Expected tweet text.
        timestamp: Expected timezone-aware tweet timestamp.
        check_hash: Fixture called with a filename and the expected hex digest.
        make_item: Fixture that builds the item passed to ``download``.
    """
    post = self.extractor.download(make_item(url))

    self.assertValidResponseMetadata(post, title, timestamp)
    assert len(post.media) == 1

    # Check the media file's digest to make sure the download is valid.
    # The expected value is 64 hex characters (a 256-bit digest, so not SHA-1);
    # presumably SHA-256 — confirm against the check_hash fixture.
    check_hash(post.media[0].filename, "3eea9c03b2dcedd1eb9a169d8bfd1cf877996fab4961de019a96eb9d32d2d733")

View File

@@ -0,0 +1,77 @@
import pytest
from auto_archiver.core import Metadata
from auto_archiver.modules.vk_extractor import VkExtractor
@pytest.fixture
def mock_vk_scraper(mocker):
    """Patch ``VkScraper`` inside the vk_extractor module and return the patch's mock."""
    scraper_patch = mocker.patch("auto_archiver.modules.vk_extractor.vk_extractor.VkScraper")
    return scraper_patch
@pytest.fixture
def vk_extractor(setup_module, mock_vk_scraper) -> VkExtractor:
    """Build the ``vk_extractor`` module with dummy credentials and a mocked scraper.

    The extractor's ``vks`` attribute is replaced by the patched ``VkScraper``'s
    ``return_value`` so tests can script ``scrape`` and ``download_media``.
    """
    extractor = setup_module(
        "vk_extractor",
        {
            "username": "name",
            "password": "password123",
            "session_file": "secrets/vk_config.v2.json",
        },
    )
    extractor.vks = mock_vk_scraper.return_value
    return extractor
def test_netloc(vk_extractor, metadata):
    """``download`` returns False for the URL carried by the shared ``metadata`` fixture."""
    # metadata url set as: "https://example.com/"
    outcome = vk_extractor.download(metadata)
    assert outcome is False
def test_vk_url_but_scrape_returns_empty(vk_extractor, metadata):
    """A vk.com URL whose scrape yields no posts makes ``download`` return False.

    Also checks that the metadata's netloc is ``vk.com`` and that ``scrape``
    was called exactly once with the metadata's URL.
    """
    # Script the mocked scraper to find nothing, then point the item at a VK wall.
    vk_extractor.vks.scrape.return_value = []
    metadata.set_url("https://vk.com/valid-wall")

    outcome = vk_extractor.download(metadata)

    assert outcome is False
    assert metadata.netloc == "vk.com"
    vk_extractor.vks.scrape.assert_called_once_with(metadata.get_url())
def test_successful_scrape_and_download(vk_extractor, metadata, mocker):
    """Two scraped posts with two downloaded files produce a successful result.

    The title and timestamp are expected to come from the first scraped post,
    the second post's text is expected inside the ``content`` metadata, and the
    media list is expected to mirror the filenames returned by ``download_media``.
    """
    scraped_posts = [
        {"text": "Post Title", "datetime": "2023-01-01T00:00:00", "id": 1},
        {"text": "Another Post", "datetime": "2023-01-02T00:00:00", "id": 2},
    ]
    downloaded_files = ["image1.jpg", "image2.png"]

    metadata.set_url("https://vk.com/valid-wall")
    vk_extractor.vks.scrape.return_value = scraped_posts
    vk_extractor.vks.download_media.return_value = downloaded_files

    result = vk_extractor.download(metadata)

    # Test metadata
    assert result.is_success()
    assert result.status == "vk: success"
    assert result.get_title() == "Post Title"
    assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
    assert "Another Post" in result.metadata["content"]

    # Test Media objects
    assert len(result.media) == 2
    assert [media.filename for media in result.media] == ["image1.jpg", "image2.png"]
    vk_extractor.vks.download_media.assert_called_once_with(scraped_posts, vk_extractor.tmp_dir)
def test_adds_first_title_and_timestamp(vk_extractor):
    """The first scraped post supplies the result's title and timestamp.

    Uses a freshly built ``Metadata`` rather than the shared fixture, scripts
    two scraped posts and no downloaded media, and expects the title and the
    formatted timestamp of the first post on a successful result.
    """
    # The URL is set once, chained on construction.
    metadata = Metadata().set_url("https://vk.com/no-metadata")
    mock_scrapes = [
        {"text": "value", "datetime": "2023-01-01T00:00:00"},
        {"text": "value2", "datetime": "2023-01-02T00:00:00"},
    ]
    vk_extractor.vks.scrape.return_value = mock_scrapes
    vk_extractor.vks.download_media.return_value = []
    result = vk_extractor.download(metadata)

    assert result.get_title() == "value"
    # formatted timestamp
    assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
    assert result.is_success()