Merge main

2026-06-08 03:18:28 +03:00 · 2025-03-17 10:05:11 +00:00
parent 7e360240bf b2238427a0
commit 59b910ec30
229 changed files with 61430 additions and 3147 deletions
--- a/tests/extractors/test_extractor_base.py
+++ b/tests/extractors/test_extractor_base.py
@@ -7,7 +7,6 @@ from auto_archiver.core.extractor import Extractor


 class TestExtractorBase(object):
-
    extractor_module: str = None
    config: dict = None

@@ -17,7 +16,7 @@ class TestExtractorBase(object):
        assert self.config is not None, "self.config must be a dict set on the subclass"

        self.extractor: Type[Extractor] = setup_module(self.extractor_module, self.config)
-    
+
    def assertValidResponseMetadata(self, test_response: Metadata, title: str, timestamp: str, status: str = ""):
        assert test_response is not False

--- a/tests/extractors/test_generic_extractor.py
+++ b/tests/extractors/test_generic_extractor.py
@@ -9,26 +9,28 @@ import pytest
 from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor
 from .test_extractor_base import TestExtractorBase

-CI=os.getenv("GITHUB_ACTIONS", '') == 'true'
+CI = os.getenv("GITHUB_ACTIONS", "") == "true"
+
+
 class TestGenericExtractor(TestExtractorBase):
-    """Tests Generic Extractor
-    """
-    extractor_module = 'generic_extractor'
+    """Tests Generic Extractor"""
+
+    extractor_module = "generic_extractor"
    extractor: GenericExtractor

    config = {
-        'subtitles': False,
-        'comments': False,
-        'livestreams': False,
-        'live_from_start': False,
-        'end_means_success': True,
-        'allow_playlist': False,
-        'max_downloads': "inf",
-        'proxy': None,
-        'cookies_from_browser': False,
-        'cookie_file': None,
-        }
-    
+        "subtitles": False,
+        "comments": False,
+        "livestreams": False,
+        "live_from_start": False,
+        "end_means_success": True,
+        "allow_playlist": False,
+        "max_downloads": "inf",
+        "proxy": None,
+        "cookies_from_browser": False,
+        "cookie_file": None,
+    }
+
    def test_load_dropin(self):
        # test loading dropins that are in the generic_archiver package
        package = "auto_archiver.modules.generic_extractor"
@@ -38,32 +40,42 @@ class TestGenericExtractor(TestExtractorBase):
        path = os.path.join(dirname(dirname(__file__)), "data/")
        assert self.extractor.dropin_for_name("dropin", additional_paths=[path])

-
-    @pytest.mark.parametrize("url, suitable_extractors", [
-        ("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
-        ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
-        ("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
-        ("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
-        ("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),])
+    @pytest.mark.parametrize(
+        "url, suitable_extractors",
+        [
+            ("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
+            ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
+            ("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
+            ("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
+            ("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),
+        ],
+    )
    def test_suitable_extractors(self, url, suitable_extractors):
-        suitable_extractors = suitable_extractors + ['generic'] # the generic is valid for all
+        suitable_extractors = suitable_extractors + ["generic"]  # the generic is valid for all
        extractors = list(self.extractor.suitable_extractors(url))
        assert len(extractors) == len(suitable_extractors)
        assert [e.ie_key().lower() for e in extractors] == suitable_extractors

-    @pytest.mark.parametrize("url, is_suitable", [
-        ("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
-        ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
-        ("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
-        ("https://www.facebook.com/nytimes/videos/10160796550110716", True),
-        ("https://www.twitch.tv/videos/1167226570", True),
-        ("https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/", True),
-        ("https://google.com", True)])
+    @pytest.mark.parametrize(
+        "url, is_suitable",
+        [
+            ("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
+            ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
+            ("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
+            ("https://www.facebook.com/nytimes/videos/10160796550110716", True),
+            ("https://www.twitch.tv/videos/1167226570", True),
+            (
+                "https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/",
+                True,
+            ),
+            ("https://google.com", True),
+        ],
+    )
    def test_suitable_urls(self, url, is_suitable):
        """
-            Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs
-            This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has extractors for,
-            and then if and only if all archivers fails, does it fall back to the generic archiver)
+        Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs
+        This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has extractors for,
+        and then if and only if all archivers fails, does it fall back to the generic archiver)
        """
        assert self.extractor.suitable(url) == is_suitable

@@ -74,12 +86,15 @@ class TestGenericExtractor(TestExtractorBase):
        assert result.get_url() == "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970"

    @pytest.mark.download
-    @pytest.mark.parametrize("url", [
-        "https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l",
-        "twitter.com/bellingcat/status/123",
-        "https://www.youtube.com/watch?v=1"
-    ])
-    def test_download_nonexistend_media(self, make_item, url):
+    @pytest.mark.parametrize(
+        "url",
+        [
+            "https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l",
+            "twitter.com/bellingcat/status/123",
+            "https://www.youtube.com/watch?v=1",
+        ],
+    )
+    def test_download_nonexistent_media(self, make_item, url):
        """
        Test to make sure that the extractor doesn't break on non-existent posts/media

@@ -89,7 +104,10 @@ class TestGenericExtractor(TestExtractorBase):
        result = self.extractor.download(item)
        assert not result

-    @pytest.mark.skipif(CI, reason="Currently no way to authenticate when on CI. Youtube (yt-dlp) doesn't support logging in with username/password.")
+    @pytest.mark.skipif(
+        CI,
+        reason="Currently no way to authenticate when on CI. Youtube (yt-dlp) doesn't support logging in with username/password.",
+    )
    @pytest.mark.download
    def test_youtube_download(self, make_item):
        # url https://www.youtube.com/watch?v=5qap5aO4i9A
@@ -98,7 +116,10 @@ class TestGenericExtractor(TestExtractorBase):
        result = self.extractor.download(item)
        assert result.get_url() == "https://www.youtube.com/watch?v=J---aiyznGQ"
        assert result.get_title() == "Keyboard Cat! - THE ORIGINAL!"
-        assert result.get('description') == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
+        assert (
+            result.get("description")
+            == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
+        )
        assert len(result.media) == 2
        assert Path(result.media[0].filename).name == "J---aiyznGQ.webm"
        assert Path(result.media[1].filename).name == "hqdefault.jpg"
@@ -114,7 +135,7 @@ class TestGenericExtractor(TestExtractorBase):
        item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfn3hbcxgc2q")
        result = self.extractor.download(item)
        assert result is not False
-    
+
    @pytest.mark.download
    def test_bluesky_download_no_media(self, make_item):
        item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfphwmcs4c2z")
@@ -126,7 +147,7 @@ class TestGenericExtractor(TestExtractorBase):
        item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
        result = self.extractor.download(item)
        assert result is not False
-    
+
    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_video(self, make_item):
@@ -141,14 +162,14 @@ class TestGenericExtractor(TestExtractorBase):
        item = make_item("https://truthsocial.com/@bbcnewa/posts/109598702184774628")
        result = self.extractor.download(item)
        assert result is not False
-    
+
    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_poll(self, make_item):
        item = make_item("https://truthsocial.com/@CNN_US/posts/113724326568555098")
        result = self.extractor.download(item)
        assert result is not False
-    
+
    @pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
    @pytest.mark.download
    def test_truthsocial_download_single_image(self, make_item):
@@ -170,7 +191,7 @@ class TestGenericExtractor(TestExtractorBase):
        url = "https://x.com/Bellingcat/status/17197025860711058"
        response = self.extractor.download(make_item(url))
        assert not response
-    
+
    @pytest.mark.download
    def test_twitter_download_malformed_tweetid(self, make_item):
        # this tweet does not exist
@@ -180,7 +201,6 @@ class TestGenericExtractor(TestExtractorBase):

    @pytest.mark.download
    def test_twitter_download_tweet_no_media(self, make_item):
-        
        item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
        post = self.extractor.download(item)

@@ -188,9 +208,9 @@ class TestGenericExtractor(TestExtractorBase):
            post,
            "Onion rings are just vegetable donuts.",
            datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
-            "yt-dlp_Twitter: success"
+            "yt-dlp_Twitter: success",
        )
-    
+
    @pytest.mark.download
    def test_twitter_download_video(self, make_item):
        url = "https://x.com/bellingcat/status/1871552600346415571"
@@ -198,33 +218,52 @@ class TestGenericExtractor(TestExtractorBase):
        self.assertValidResponseMetadata(
            post,
            "Bellingcat - This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services",
-            datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
+            datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc),
        )

-    @pytest.mark.xfail(reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented")
+    @pytest.mark.xfail(
+        reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented"
+    )
    @pytest.mark.download
-    @pytest.mark.parametrize("url, title, timestamp, image_hash", [
-            ("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
-            ("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
-            ("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
-            ("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
-        ])
+    @pytest.mark.parametrize(
+        "url, title, timestamp, image_hash",
+        [
+            (
+                "https://x.com/SozinhoRamalho/status/1876710769913450647",
+                "ignore tweet, testing sensitivity warning nudity",
+                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
+                "image_hash",
+            ),
+            (
+                "https://x.com/SozinhoRamalho/status/1876710875475681357",
+                "ignore tweet, testing sensitivity warning violence",
+                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
+                "image_hash",
+            ),
+            (
+                "https://x.com/SozinhoRamalho/status/1876711053813227618",
+                "ignore tweet, testing sensitivity warning sensitive",
+                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
+                "image_hash",
+            ),
+            (
+                "https://x.com/SozinhoRamalho/status/1876711141314801937",
+                "ignore tweet, testing sensitivity warning nudity, violence, sensitivity",
+                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
+                "image_hash",
+            ),
+        ],
+    )
    def test_twitter_download_sensitive_media(self, url, title, timestamp, image_hash, make_item):
-
        """Download tweets with sensitive media"""

        post = self.extractor.download(make_item(url))
-        self.assertValidResponseMetadata(
-            post,
-            title,
-            timestamp
-        )
+        self.assertValidResponseMetadata(post, title, timestamp)
        assert len(post.media) == 1
        assert post.media[0].hash == image_hash

    @pytest.mark.download
    def test_download_facebook_video(self, make_item):
-
        post = self.extractor.download(make_item("https://www.facebook.com/bellingcat/videos/588371253839133"))
        assert len(post.media) == 2
        assert post.media[0].filename.endswith("588371253839133.mp4")
@@ -234,11 +273,12 @@ class TestGenericExtractor(TestExtractorBase):
        assert post.media[1].mimetype == "image/jpeg"

        assert "Bellingchat Premium is with Kolina Koltai" in post.get_title()
-    
+
    @pytest.mark.download
    def test_download_facebook_image(self, make_item):
-
-        post = self.extractor.download(make_item("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/"))
+        post = self.extractor.download(
+            make_item("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/")
+        )

        assert len(post.media) == 1
        assert post.media[0].filename.endswith(".png")
@@ -248,5 +288,5 @@ class TestGenericExtractor(TestExtractorBase):
    def test_download_facebook_text_only(self, make_item):
        url = "https://www.facebook.com/bellingcat/posts/pfbid02rzpwZxAZ8bLkAX8NvHv4DWAidFaqAUfJMbo9vWkpwxL7uMUWzWMiizXLWRSjwihVl"
        post = self.extractor.download(make_item(url))
-        assert "Bellingcat researcher Kolina Koltai delves deeper into Clothoff" in post.get('content')
+        assert "Bellingcat researcher Kolina Koltai delves deeper into Clothoff" in post.get("content")
        assert post.get_title() == "Bellingcat"
--- a/tests/extractors/test_instagram_api_extractor.py
+++ b/tests/extractors/test_instagram_api_extractor.py
@@ -15,10 +15,11 @@ def mock_user_response():
            "username": "test_user",
            "full_name": "Test User",
            "profile_pic_url_hd": "http://example.com/profile.jpg",
-            "profile_pic_url": "http://example.com/profile_lowres.jpg"
+            "profile_pic_url": "http://example.com/profile_lowres.jpg",
        }
    }

+
@pytest.fixture
 def mock_post_response():
    return {
@@ -27,16 +28,14 @@ def mock_post_response():
        "caption_text": "Test Caption",
        "taken_at": datetime.now().timestamp(),
        "video_url": "http://example.com/video.mp4",
-        "thumbnail_url": "http://example.com/thumbnail.jpg"
+        "thumbnail_url": "http://example.com/thumbnail.jpg",
    }

+
@pytest.fixture
 def mock_story_response():
-    return [{
-        "id": "story_123",
-        "taken_at": datetime.now().timestamp(),
-        "video_url": "http://example.com/story.mp4"
-    }]
+    return [{"id": "story_123", "taken_at": datetime.now().timestamp(), "video_url": "http://example.com/story.mp4"}]
+

@pytest.fixture
 def mock_highlight_response():
@@ -46,11 +45,13 @@ def mock_highlight_response():
                "highlight:123": {
                    "id": "123",
                    "title": "Test Highlight",
-                    "items": [{
-                        "id": "item_123",
-                        "taken_at": datetime.now().timestamp(),
-                        "video_url": "http://example.com/highlight.mp4"
-                    }]
+                    "items": [
+                        {
+                            "id": "item_123",
+                            "taken_at": datetime.now().timestamp(),
+                            "video_url": "http://example.com/highlight.mp4",
+                        }
+                    ],
                }
            }
        }
@@ -81,24 +82,30 @@ class TestInstagramAPIExtractor(TestExtractorBase):
        m.set("netloc", "instagram.com")
        return m

-    @pytest.mark.parametrize("url,expected", [
-        ("https://instagram.com/user", [("", "user", "")]),
-        ("https://instagr.am/p/post_id", []),
-        ("https://youtube.com", []),
-        ("https://www.instagram.com/reel/reel_id", [("reel", "reel_id", "")]),
-        ("https://instagram.com/stories/highlights/123", [("stories/highlights", "123", "")]),
-        ("https://instagram.com/stories/user/123", [("stories", "user", "123")]),
-    ])
+    @pytest.mark.parametrize(
+        "url,expected",
+        [
+            ("https://instagram.com/user", [("", "user", "")]),
+            ("https://instagr.am/p/post_id", []),
+            ("https://youtube.com", []),
+            ("https://www.instagram.com/reel/reel_id", [("reel", "reel_id", "")]),
+            ("https://instagram.com/stories/highlights/123", [("stories/highlights", "123", "")]),
+            ("https://instagram.com/stories/user/123", [("stories", "user", "123")]),
+        ],
+    )
    def test_url_parsing(self, url, expected):
        assert self.extractor.valid_url.findall(url) == expected

    def test_initialize(self):
        assert self.extractor.api_endpoint[-1] != "/"

-    @pytest.mark.parametrize("input_dict,expected", [
-        ({"x": 0, "valid": "data"}, {"valid": "data"}),
-        ({"nested": {"y": None, "valid": [{}]}}, {"nested": {"valid": [{}]}}),
-    ])
+    @pytest.mark.parametrize(
+        "input_dict,expected",
+        [
+            ({"x": 0, "valid": "data"}, {"valid": "data"}),
+            ({"nested": {"y": None, "valid": [{}]}}, {"nested": {"valid": [{}]}}),
+        ],
+    )
    def test_cleanup_dict(self, input_dict, expected):
        assert self.extractor.cleanup_dict(input_dict) == expected

@@ -114,8 +121,8 @@ class TestInstagramAPIExtractor(TestExtractorBase):

    def test_download_profile_basic(self, metadata, mock_user_response, mocker):
        """Test basic profile download without full_profile"""
-        mock_call = mocker.patch.object(self.extractor, 'call_api')
-        mock_download = mocker.patch.object(self.extractor, 'download_from_url')
+        mock_call = mocker.patch.object(self.extractor, "call_api")
+        mock_download = mocker.patch.object(self.extractor, "download_from_url")
        # Mock API responses
        mock_call.return_value = mock_user_response
        mock_download.return_value = "profile.jpg"
@@ -132,17 +139,14 @@ class TestInstagramAPIExtractor(TestExtractorBase):

    def test_download_profile_full(self, metadata, mock_user_response, mock_story_response, mocker):
        """Test full profile download with stories/posts"""
-        mock_call = mocker.patch.object(self.extractor, 'call_api')
-        mock_posts = mocker.patch.object(self.extractor, 'download_all_posts')
-        mock_highlights = mocker.patch.object(self.extractor, 'download_all_highlights')
-        mock_tagged = mocker.patch.object(self.extractor, 'download_all_tagged')
-        mock_stories = mocker.patch.object(self.extractor, '_download_stories_reusable')
+        mock_call = mocker.patch.object(self.extractor, "call_api")
+        mock_posts = mocker.patch.object(self.extractor, "download_all_posts")
+        mock_highlights = mocker.patch.object(self.extractor, "download_all_highlights")
+        mock_tagged = mocker.patch.object(self.extractor, "download_all_tagged")
+        mock_stories = mocker.patch.object(self.extractor, "_download_stories_reusable")

        self.extractor.full_profile = True
-        mock_call.side_effect = [
-            mock_user_response,
-            mock_story_response
-        ]
+        mock_call.side_effect = [mock_user_response, mock_story_response]
        mock_highlights.return_value = None
        mock_stories.return_value = mock_story_response
        mock_posts.return_value = None
@@ -155,7 +159,7 @@ class TestInstagramAPIExtractor(TestExtractorBase):

    def test_download_profile_not_found(self, metadata, mocker):
        """Test profile not found error"""
-        mock_call = mocker.patch.object(self.extractor, 'call_api')
+        mock_call = mocker.patch.object(self.extractor, "call_api")
        mock_call.return_value = {"user": None}
        with pytest.raises(AssertionError) as exc_info:
            self.extractor.download_profile(metadata, "invalid_user")
@@ -163,18 +167,14 @@ class TestInstagramAPIExtractor(TestExtractorBase):

    def test_download_profile_error_handling(self, metadata, mock_user_response, mocker):
        """Test error handling in full profile mode"""
-        mock_call = mocker.patch.object(self.extractor, 'call_api')
-        mock_highlights = mocker.patch.object(self.extractor, 'download_all_highlights')
-        mock_tagged = mocker.patch.object(self.extractor, 'download_all_tagged')
-        stories_tagged = mocker.patch.object(self.extractor, '_download_stories_reusable')
-        mock_posts = mocker.patch.object(self.extractor, 'download_all_posts')
+        mock_call = mocker.patch.object(self.extractor, "call_api")
+        mock_highlights = mocker.patch.object(self.extractor, "download_all_highlights")
+        mock_tagged = mocker.patch.object(self.extractor, "download_all_tagged")
+        stories_tagged = mocker.patch.object(self.extractor, "_download_stories_reusable")
+        mock_posts = mocker.patch.object(self.extractor, "download_all_posts")

        self.extractor.full_profile = True
-        mock_call.side_effect = [
-            mock_user_response,
-            Exception("Stories API failed"),
-            Exception("Posts API failed")
-        ]
+        mock_call.side_effect = [mock_user_response, Exception("Stories API failed"), Exception("Posts API failed")]
        mock_highlights.return_value = None
        mock_tagged.return_value = None
        stories_tagged.return_value = None
@@ -182,4 +182,4 @@ class TestInstagramAPIExtractor(TestExtractorBase):
        result = self.extractor.download_profile(metadata, "test_user")

        assert result.is_success()
-        assert "Error downloading stories for test_user" in result.metadata["errors"]
+        assert "Error downloading stories for test_user" in result.metadata["errors"]
--- a/tests/extractors/test_instagram_extractor.py
+++ b/tests/extractors/test_instagram_extractor.py
@@ -1,21 +1,41 @@
 import pytest

 from auto_archiver.modules.instagram_extractor import InstagramExtractor
-from .test_extractor_base import TestExtractorBase

-class TestInstagramExtractor(TestExtractorBase):

-    extractor_module: str = 'instagram_extractor'
-    config: dict = {}
+@pytest.fixture
+def instagram_extractor(setup_module, mocker):
+    extractor_module: str = "instagram_extractor"
+    config: dict = {
+        "username": "user_name",
+        "password": "password123",
+        "download_folder": "instaloader",
+        "session_file": "secrets/instaloader.session",
+    }
+    fake_loader = mocker.MagicMock()
+    fake_loader.load_session_from_file.return_value = None
+    fake_loader.login.return_value = None
+    fake_loader.save_session_to_file.return_value = None
+    mocker.patch(
+        "instaloader.Instaloader",
+        return_value=fake_loader,
+    )
+    return setup_module(extractor_module, config)

-    @pytest.mark.parametrize("url", [
+
+@pytest.mark.parametrize(
+    "url",
+    [
        "https://www.instagram.com/p/",
        "https://www.instagram.com/p/1234567890/",
        "https://www.instagram.com/reel/1234567890/",
        "https://www.instagram.com/username/",
        "https://www.instagram.com/username/stories/",
        "https://www.instagram.com/username/highlights/",
-    ])
-    def test_regex_matches(self, url):
-        # post
-        assert InstagramExtractor.valid_url.match(url)
+    ],
+)
+def test_regex_matches(url: str, instagram_extractor: InstagramExtractor) -> None:
+    """
+    Ensure that the valid_url regex matches all provided Instagram URLs.
+    """
+    assert instagram_extractor.valid_url.match(url)
--- a/tests/extractors/test_instagram_tbot_extractor.py
+++ b/tests/extractors/test_instagram_tbot_extractor.py
@@ -7,10 +7,16 @@ from auto_archiver.modules.instagram_tbot_extractor import InstagramTbotExtracto
 from tests.extractors.test_extractor_base import TestExtractorBase


+@pytest.fixture(autouse=True)
+def mock_sleep(mocker):
+    """Mock time.sleep to avoid delays."""
+    return mocker.patch("time.sleep")
+
+
@pytest.fixture
 def patch_extractor_methods(request, setup_module, mocker):
-    mocker.patch.object(InstagramTbotExtractor, '_prepare_session_file', return_value=None)
-    mocker.patch.object(InstagramTbotExtractor, '_initialize_telegram_client', return_value=None)
+    mocker.patch.object(InstagramTbotExtractor, "_prepare_session_file", return_value=None)
+    mocker.patch.object(InstagramTbotExtractor, "_initialize_telegram_client", return_value=None)
    yield


@@ -35,12 +41,7 @@ def mock_telegram_client(mocker):
@pytest.fixture
 def extractor(setup_module, patch_extractor_methods, mocker):
    extractor_module = "instagram_tbot_extractor"
-    config = {
-        "api_id": 12345,
-        "api_hash": "test_api_hash",
-        "session_file": "test_session",
-        "timeout": 4
-    }
+    config = {"api_id": 12345, "api_hash": "test_api_hash", "session_file": "test_session", "timeout": 4}
    extractor = setup_module(extractor_module, config)
    extractor.client = mocker.MagicMock()
    extractor.session_file = "test_session"
@@ -79,21 +80,30 @@ class TestInstagramTbotExtractorReal(TestExtractorBase):
        "session_file": "secrets/anon-insta",
    }

-    @pytest.mark.parametrize("url, expected_status, message, len_media", [
-        ("https://www.instagram.com/p/C4QgLbrIKXG", "insta-via-bot: success",
-         "Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou",
-         6),
-        ("https://www.instagram.com/reel/DEVLK8qoIbg/", "insta-via-bot: success",
-         "Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol",
-         3),
-        # instagram tbot not working (potentially intermittently?) for stories - replace with a live story to retest
-        # ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, "Media not found or unavailable"),
-        # Seems to be working intermittently for highlights
-        # ("https://www.instagram.com/stories/highlights/17868810693068139/", "insta-via-bot: success", None, 50),
-        # Marking invalid url as success
-        ("https://www.instagram.com/p/INVALID", "insta-via-bot: success", "Media not found or unavailable", 0),
-        ("https://www.youtube.com/watch?v=ymCMy8OffHM", False, None, 0),
-    ])
+    @pytest.mark.parametrize(
+        "url, expected_status, message, len_media",
+        [
+            (
+                "https://www.instagram.com/p/C4QgLbrIKXG",
+                "insta-via-bot: success",
+                "Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou",
+                6,
+            ),
+            (
+                "https://www.instagram.com/reel/DEVLK8qoIbg/",
+                "insta-via-bot: success",
+                "Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol",
+                3,
+            ),
+            # instagram tbot not working (potentially intermittently?) for stories - replace with a live story to retest
+            # ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, "Media not found or unavailable"),
+            # Seems to be working intermittently for highlights
+            # ("https://www.instagram.com/stories/highlights/17868810693068139/", "insta-via-bot: success", None, 50),
+            # Marking invalid url as success
+            ("https://www.instagram.com/p/INVALID", "insta-via-bot: success", "Media not found or unavailable", 0),
+            ("https://www.youtube.com/watch?v=ymCMy8OffHM", False, None, 0),
+        ],
+    )
    def test_download(self, url, expected_status, message, len_media, metadata_sample):
        """Test the `download()` method with various Instagram URLs."""
        metadata_sample.set_url(url)
--- a/tests/extractors/test_tiktok_tikwm_extractor.py
+++ b/tests/extractors/test_tiktok_tikwm_extractor.py
@@ -0,0 +1,151 @@
+from datetime import datetime, timezone
+import time
+import pytest
+import yt_dlp
+
+from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor
+from .test_extractor_base import TestExtractorBase
+
+
+@pytest.fixture(autouse=True)
+def skip_ytdlp_own_methods(mocker):
+    """Patch the TikTok dropin and extractor selection for every test in this module.
+
+    `Tiktok.skip_ytdlp_download` is forced to return True, and
+    `GenericExtractor.suitable_extractors` is replaced so that it returns only the
+    yt-dlp extractors whose `IE_NAME` is "TikTok".
+    """
+    # mock this method, so that we skip the ytdlp download in these tests
+    mocker.patch(
+        "auto_archiver.modules.generic_extractor.tiktok.Tiktok.skip_ytdlp_download",
+        return_value=True,
+    )
+    tiktok_only = [ie for ie in yt_dlp.YoutubeDL()._ies.values() if ie.IE_NAME == "TikTok"]
+    mocker.patch(
+        "auto_archiver.modules.generic_extractor.generic_extractor.GenericExtractor.suitable_extractors",
+        return_value=tiktok_only,
+    )
+
+
+@pytest.fixture()
+def mock_get(mocker):
+    """Replace `requests.get` as referenced by the tiktok dropin module and return the mock."""
+    patched_get = mocker.patch("auto_archiver.modules.generic_extractor.tiktok.requests.get")
+    return patched_get
+
+
+class TestTiktokTikwmExtractor(TestExtractorBase):
+    """
+    Tests for TikTok extraction through tikwm.com, driven via the generic extractor.
+
+    Tests that request `mock_get` run against a mocked `requests.get`; the tests
+    marked `download` do not request it and use real TikTok URLs.
+    """
+
+    extractor_module = "generic_extractor"
+    extractor: GenericExtractor
+
+    config = {}
+
+    VALID_EXAMPLE_URL = "https://www.tiktok.com/@example/video/1234"
+
+    def test_invalid_json_responses(self, mock_get, make_item, caplog):
+        """`download()` returns False when `.json()` on the response raises."""
+        expected_log = (
+            "failed to parse JSON response from tikwm.com for url='https://www.tiktok.com/@example/video/1234'"
+        )
+        mock_get.return_value.status_code = 200
+        mock_get.return_value.json.side_effect = ValueError
+        with caplog.at_level("DEBUG"):
+            assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False
+            mock_get.assert_called_once()
+            mock_get.return_value.json.assert_called_once()
+            # the 'Skipping using ytdlp to download files for TikTok' message is logged
+            # first, so search the whole captured text rather than a single record
+            assert expected_log in caplog.text
+
+        # NOTE(review): caplog.text still holds the records from the first call, so the
+        # final assertion below would pass even if this second call logged nothing.
+        # Consider caplog.clear() here once the message expected for a generic
+        # Exception has been confirmed.
+        mock_get.return_value.json.side_effect = Exception
+        with caplog.at_level("ERROR"):
+            assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False
+            mock_get.assert_called()
+            assert mock_get.call_count == 2
+            assert mock_get.return_value.json.call_count == 2
+            assert expected_log in caplog.text
+
+    @pytest.mark.parametrize(
+        "response",
+        [
+            {"msg": "failure"},
+            {"msg": "success"},
+        ],
+    )
+    def test_unsuccessful_responses(self, mock_get, make_item, response, caplog):
+        """`download()` returns False for a failure payload and for a success payload without `data`."""
+        mock_get.return_value.status_code = 200
+        mock_get.return_value.json.return_value = response
+        with caplog.at_level("DEBUG"):
+            assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False
+            mock_get.assert_called_once()
+            mock_get.return_value.json.assert_called_once()
+            assert "failed to get a valid response from tikwm.com" in caplog.text
+
+    @pytest.mark.parametrize(
+        "response,has_vid",
+        [
+            ({"data": {"id": 123}}, False),
+            ({"data": {"wmplay": "url"}}, True),
+            ({"data": {"play": "url"}}, True),
+        ],
+    )
+    def test_correct_extraction(self, mock_get, make_item, response, has_vid):
+        """A success payload yields one media item only when `data` has a `wmplay` or `play` entry."""
+        mock_get.return_value.status_code = 200
+        mock_get.return_value.json.return_value = {"msg": "success", **response}
+        result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
+        if has_vid:
+            assert result.is_success()
+            assert len(result.media) == 1
+        else:
+            assert result is False
+        mock_get.assert_called()
+        # one request for the API call; a second one is made when a video URL is
+        # present (presumably the video download itself - confirm against the dropin)
+        assert mock_get.call_count == 1 + int(has_vid)
+        mock_get.return_value.json.assert_called_once()
+
+    def test_correct_data_extracted(self, mock_get, make_item):
+        """Fields of the API payload are mapped onto the result's title, author, timestamp and media."""
+        mock_get.return_value.status_code = 200
+        mock_get.return_value.json.return_value = {
+            "msg": "success",
+            "data": {
+                "wmplay": "url",
+                "origin_cover": "cover.jpg",
+                "title": "Title",
+                "id": 123,
+                "duration": 60,
+                "create_time": 1736301699,
+                "author": "Author",
+                "other": "data",
+            },
+        }
+
+        result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
+        assert result.is_success()
+        assert len(result.media) == 2
+        assert result.get_title() == "Title"
+        assert result.get("author") == "Author"
+        # only the keys not consumed elsewhere are expected to remain in api_data
+        assert result.get("api_data") == {"other": "data", "id": 123}
+        assert result.media[1].get("duration") == 60
+        assert result.get("timestamp") == datetime.fromtimestamp(1736301699, tz=timezone.utc)
+
+    @pytest.mark.download
+    def test_download_video(self, make_item):
+        """Download-marked test against a real BBC News TikTok URL (no `mock_get`)."""
+        url = "https://www.tiktok.com/@bbcnews/video/7478038212070411542"
+
+        result = self.extractor.download(make_item(url))
+        assert result.is_success()
+        assert len(result.media) == 2
+        assert (
+            result.get_title()
+            == "The A23a iceberg is one of the world's oldest and it's so big you can see it from space. #Iceberg  #A23a  #Antarctica  #Ice  #ClimateChange  #DavidAttenborough  #Ocean  #Sea  #SouthGeorgia  #BBCNews "
+        )
+        assert result.get("author").get("unique_id") == "bbcnews"
+        assert result.get("api_data").get("id") == "7478038212070411542"
+        assert result.media[1].get("duration") == 59
+        assert result.get("timestamp") == datetime.fromtimestamp(1741122000, tz=timezone.utc)
+
+    @pytest.mark.download
+    def test_download_sensitive_video(self, make_item):
+        """Download-marked test against a real TikTok URL, named as a sensitive video (no `mock_get`)."""
+        url = "https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375"
+        # Required for rate limiting
+        time.sleep(1.1)
+        result = self.extractor.download(make_item(url))
+        assert result.is_success()
+        assert len(result.media) == 2
+        assert result.get_title() == "Căng nhất lúc này #ggs68 #ggs68taiwan #taiwan #dailoan #tiktoknews"
+        assert result.get("author").get("id") == "7197400619475649562"
+        assert result.get("api_data").get("id") == "7441821351142362375"
+        assert result.media[1].get("duration") == 34
+        assert result.get("timestamp") == datetime.fromtimestamp(1732684060, tz=timezone.utc)
--- a/tests/extractors/test_twitter_api_extractor.py
+++ b/tests/extractors/test_twitter_api_extractor.py
@@ -1,6 +1,5 @@
 import os
 import datetime
-import hashlib
 import pytest

 from pytwitter.models.media import MediaVariant
@@ -10,8 +9,7 @@ from auto_archiver.modules.twitter_api_extractor import TwitterApiExtractor

@pytest.mark.incremental
 class TestTwitterApiExtractor(TestExtractorBase):
-
-    extractor_module = 'twitter_api_extractor'
+    extractor_module: str = "twitter_api_extractor"  # module name (a str), as declared on TestExtractorBase

    config = {
        "bearer_tokens": [],
@@ -22,41 +20,79 @@ class TestTwitterApiExtractor(TestExtractorBase):
        "access_secret": os.environ.get("TWITTER_ACCESS_SECRET"),
    }

-    @pytest.mark.parametrize("url, expected", [
-        ("https://x.com/bellingcat/status/1874097816571961839", "https://x.com/bellingcat/status/1874097816571961839"), # x.com urls unchanged
-        ("https://twitter.com/bellingcat/status/1874097816571961839", "https://twitter.com/bellingcat/status/1874097816571961839"), # twitter urls unchanged
-        ("https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"), # don't strip params from twitter urls (changed Jan 2025)
-        ("https://www.bellingcat.com/category/resources/", "https://www.bellingcat.com/category/resources/"), # non-twitter/x urls unchanged
-        ("https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"), # shouldn't strip params from non-twitter/x URLs
-    ])
+    @pytest.mark.parametrize(
+        "url, expected",
+        [
+            (
+                "https://x.com/bellingcat/status/1874097816571961839",
+                "https://x.com/bellingcat/status/1874097816571961839",
+            ),  # x.com urls unchanged
+            (
+                "https://twitter.com/bellingcat/status/1874097816571961839",
+                "https://twitter.com/bellingcat/status/1874097816571961839",
+            ),  # twitter urls unchanged
+            (
+                "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
+                "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
+            ),  # don't strip params from twitter urls (changed Jan 2025)
+            (
+                "https://www.bellingcat.com/category/resources/",
+                "https://www.bellingcat.com/category/resources/",
+            ),  # non-twitter/x urls unchanged
+            (
+                "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
+                "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w",
+            ),  # shouldn't strip params from non-twitter/x URLs
+        ],
+    )
    def test_sanitize_url(self, url, expected):
+        """Check that `sanitize_url` returns each parametrized URL unchanged."""
        assert expected == self.extractor.sanitize_url(url)

    @pytest.mark.download
    def test_sanitize_url_download(self):
-        assert "https://www.bellingcat.com/category/resources/" == self.extractor.sanitize_url("https://t.co/yl3oOJatFp")
+        """Check that `sanitize_url` maps the t.co short link to the bellingcat resources URL."""
+        assert "https://www.bellingcat.com/category/resources/" == self.extractor.sanitize_url(
+            "https://t.co/yl3oOJatFp"
+        )

-    @pytest.mark.parametrize("url, exptected_username, exptected_tweetid", [
-        ("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
-        ("https://x.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
-        ("https://www.bellingcat.com/category/resources/", False, False)
-        ])
+    @pytest.mark.parametrize(
+        "url, exptected_username, exptected_tweetid",
+        [
+            ("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
+            ("https://x.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
+            ("https://www.bellingcat.com/category/resources/", False, False),
+        ],
+    )
    def test_get_username_tweet_id_from_url(self, url, exptected_username, exptected_tweetid):
-    
+        """Check `get_username_tweet_id` on twitter.com / x.com status URLs and on a non-Twitter URL.
+
+        The status URLs are expected to give (username, tweet id); the non-Twitter URL
+        is expected to give (False, False).
+        """
+        # NOTE(review): "exptected" is a misspelling of "expected"; a rename must also
+        # update the names in the parametrize string above.
        username, tweet_id = self.extractor.get_username_tweet_id(url)
        assert exptected_username == username
        assert exptected_tweetid == tweet_id

    def test_choose_variants(self):
+        """Check that `choose_variant` returns the 1280x720 mp4 entry, the highest bit_rate in this list."""
        # taken from the response for url https://x.com/bellingcat/status/1871552600346415571
-        variant_list = [MediaVariant(content_type='application/x-mpegURL', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b'),
-                        MediaVariant(bit_rate=256000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/480x270/OqZIrKV0LFswMvxS.mp4?tag=12'),
-                        MediaVariant(bit_rate=832000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12'),
-                        MediaVariant(bit_rate=2176000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12')
-                        ]
+        variant_list = [
+            MediaVariant(
+                content_type="application/x-mpegURL",
+                url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b",
+            ),
+            MediaVariant(
+                bit_rate=256000,
+                content_type="video/mp4",
+                url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/480x270/OqZIrKV0LFswMvxS.mp4?tag=12",
+            ),
+            MediaVariant(
+                bit_rate=832000,
+                content_type="video/mp4",
+                url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12",
+            ),
+            MediaVariant(
+                bit_rate=2176000,
+                content_type="video/mp4",
+                url="https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12",
+            ),
+        ]
        chosen_variant = self.extractor.choose_variant(variant_list)
        assert chosen_variant == variant_list[3]
-    
+
    @pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
    @pytest.mark.download
    def test_download_nonexistent_tweet(self, make_item):
@@ -76,7 +112,6 @@ class TestTwitterApiExtractor(TestExtractorBase):
    @pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
    @pytest.mark.download
    def test_download_tweet_no_media(self, make_item):
-        
        item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
        post = self.extractor.download(item)

@@ -84,7 +119,7 @@ class TestTwitterApiExtractor(TestExtractorBase):
            post,
            "Onion rings are just vegetable donuts.",
            datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
-            "twitter-api: success"
+            "twitter-api: success",
        )

    @pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
@@ -95,27 +130,41 @@ class TestTwitterApiExtractor(TestExtractorBase):
        self.assertValidResponseMetadata(
            post,
            "This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services https://t.co/SfBUq0hSD0 https://t.co/rIHx0WlKp8",
-            datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
+            datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc),
        )

    @pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
-    @pytest.mark.parametrize("url, title, timestamp", [
-            ("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
-            ("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence https://t.co/syYDSkpjZD", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
-            ("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive https://t.co/XE7cRdjzYq", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
-            ("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity https://t.co/YxCFbbhYE3", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)),
-        ])
+    @pytest.mark.parametrize(
+        "url, title, timestamp",
+        [
+            (
+                "https://x.com/SozinhoRamalho/status/1876710769913450647",
+                "ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1",
+                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
+            ),
+            (
+                "https://x.com/SozinhoRamalho/status/1876710875475681357",
+                "ignore tweet, testing sensitivity warning violence https://t.co/syYDSkpjZD",
+                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
+            ),
+            (
+                "https://x.com/SozinhoRamalho/status/1876711053813227618",
+                "ignore tweet, testing sensitivity warning sensitive https://t.co/XE7cRdjzYq",
+                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
+            ),
+            (
+                "https://x.com/SozinhoRamalho/status/1876711141314801937",
+                "ignore tweet, testing sensitivity warning nudity, violence, sensitivity https://t.co/YxCFbbhYE3",
+                datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
+            ),
+        ],
+    )
    @pytest.mark.download
    def test_download_sensitive_media(self, url, title, timestamp, check_hash, make_item):
-
        """Download tweets with sensitive media"""

        post = self.extractor.download(make_item(url))
-        self.assertValidResponseMetadata(
-            post,
-            title,
-            timestamp
-        )
+        self.assertValidResponseMetadata(post, title, timestamp)
        assert len(post.media) == 1
+        # NOTE(review): the expected digest below is 64 hex characters, which is not the
+        # length of a SHA-1 digest (40) - confirm which algorithm check_hash uses.
        # check the SHA1 hash (quick) of the media, to make sure it's valid
-        check_hash(post.media[0].filename, "3eea9c03b2dcedd1eb9a169d8bfd1cf877996fab4961de019a96eb9d32d2d733")
+        check_hash(post.media[0].filename, "3eea9c03b2dcedd1eb9a169d8bfd1cf877996fab4961de019a96eb9d32d2d733")
--- a/tests/extractors/test_vk_extractor.py
+++ b/tests/extractors/test_vk_extractor.py
@@ -0,0 +1,77 @@
+import pytest
+
+from auto_archiver.core import Metadata
+from auto_archiver.modules.vk_extractor import VkExtractor
+
+
+@pytest.fixture
+def mock_vk_scraper(mocker):
+    """Patch the `VkScraper` name inside the vk_extractor module and return the mock class."""
+    scraper_cls = mocker.patch("auto_archiver.modules.vk_extractor.vk_extractor.VkScraper")
+    return scraper_cls
+
+
+@pytest.fixture
+def vk_extractor(setup_module, mock_vk_scraper) -> VkExtractor:
+    """Build the "vk_extractor" module with dummy credentials and a mocked scraper.
+
+    The extractor's `vks` attribute is pointed at the mocked `VkScraper` instance so
+    tests can script its `scrape` / `download_media` results.
+    """
+    extractor = setup_module(
+        "vk_extractor",
+        {
+            "username": "name",
+            "password": "password123",
+            "session_file": "secrets/vk_config.v2.json",
+        },
+    )
+    extractor.vks = mock_vk_scraper.return_value
+    return extractor
+
+
+def test_netloc(vk_extractor, metadata):
+    """`download()` returns False for the fixture's default URL ("https://example.com/")."""
+    outcome = vk_extractor.download(metadata)
+    assert outcome is False
+
+
+def test_vk_url_but_scrape_returns_empty(vk_extractor, metadata):
+    """`download()` returns False for a vk.com URL when the scraper returns no posts."""
+    metadata.set_url("https://vk.com/valid-wall")
+    vk_extractor.vks.scrape.return_value = []
+
+    outcome = vk_extractor.download(metadata)
+
+    assert outcome is False
+    assert metadata.netloc == "vk.com"
+    # the scraper must have been consulted exactly once, with the item's URL
+    vk_extractor.vks.scrape.assert_called_once_with(metadata.get_url())
+
+
+def test_successful_scrape_and_download(vk_extractor, metadata):
+    """A vk.com URL with scraped posts yields a successful result carrying post data and media.
+
+    The previously requested `mocker` fixture was never used in the body and has
+    been dropped; the mocking is set up by the `vk_extractor` fixture.
+    """
+    scraped_posts = [
+        {"text": "Post Title", "datetime": "2023-01-01T00:00:00", "id": 1},
+        {"text": "Another Post", "datetime": "2023-01-02T00:00:00", "id": 2},
+    ]
+    downloaded_files = ["image1.jpg", "image2.png"]
+    vk_extractor.vks.scrape.return_value = scraped_posts
+    vk_extractor.vks.download_media.return_value = downloaded_files
+    metadata.set_url("https://vk.com/valid-wall")
+
+    result = vk_extractor.download(metadata)
+
+    # Test metadata
+    assert result.is_success()
+    assert result.status == "vk: success"
+    assert result.get_title() == "Post Title"
+    assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
+    assert "Another Post" in result.metadata["content"]
+    # Test Media objects: one per downloaded file, in the same order
+    assert [media.filename for media in result.media] == downloaded_files
+    vk_extractor.vks.download_media.assert_called_once_with(scraped_posts, vk_extractor.tmp_dir)
+
+
+def test_adds_first_title_and_timestamp(vk_extractor):
+    """Title and timestamp of the result are taken from the first scraped post."""
+    # set_url is called once only; the original repeated the identical call
+    metadata = Metadata().set_url("https://vk.com/no-metadata")
+    mock_scrapes = [
+        {"text": "value", "datetime": "2023-01-01T00:00:00"},
+        {"text": "value2", "datetime": "2023-01-02T00:00:00"},
+    ]
+    vk_extractor.vks.scrape.return_value = mock_scrapes
+    vk_extractor.vks.download_media.return_value = []
+    result = vk_extractor.download(metadata)
+
+    assert result.get_title() == "value"
+    # formatted timestamp
+    assert result.get_timestamp() == "2023-01-01T00:00:00+00:00"
+    assert result.is_success()