Merge pull request #26 from bellingcat/deferred-media-archiving

Implemented deferred media archiving for all scrapers
2026-06-08 03:18:34 +03:00 · 2022-04-02 14:15:35 +02:00
parent d20db5f828 0bab20e371
commit 0099558c68
25 changed files with 369 additions and 171 deletions
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -26,19 +26,19 @@
        },
        "boto3": {
            "hashes": [
-                "sha256:ef210f8e85cdb6d26a38ebad1cfe9cefdef2ab269207e5987653555375a7ef6b",
-                "sha256:f0af8f4ef5fe6353c794cd3cce627d469a618b58ace7ca75a63cfd719df615ce"
+                "sha256:35f68b60652bff50e7bc926238443cb578f29f120908bb945e5640e90c6dd53e",
+                "sha256:7f3f93ee97215862ccd1a216f37deb7d64055c71f826b821805904df7b84ee6a"
            ],
            "index": "pypi",
-            "version": "==1.21.30"
+            "version": "==1.21.31"
        },
        "botocore": {
            "hashes": [
-                "sha256:af4bdc51eeecbe9fdcdadbed9ad58c5c91380ef30f3560022bbc2ee1d78f0ad6",
-                "sha256:c622751093e3d0bf61343e66d6d06190ef30bf42b1557d5070ca84e9efa06d4b"
+                "sha256:3bb21e3ee5e4de3ed76bb99b4496a46e9b5c82e7b7fdb62702f11dda1b57b769",
+                "sha256:424fd94bef86a11f5340dc15eb50602dedec2ecc01c3a25c4fea23a2c8195500"
            ],
            "markers": "python_version >= '3.6'",
-            "version": "==1.24.30"
+            "version": "==1.24.31"
        },
        "brotli": {
            "hashes": [
@@ -195,11 +195,11 @@
        },
        "click": {
            "hashes": [
-                "sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b",
-                "sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976"
+                "sha256:24e1a4a9ec5bf6299411369b208c1df2188d9eb8d916302fe6bf03faed227f1e",
+                "sha256:479707fe14d9ec9a0757618b7a100a0ae4c4e236fac5b7f80ca68028141a1a72"
            ],
            "markers": "python_version >= '3.7'",
-            "version": "==8.1.1"
+            "version": "==8.1.2"
        },
        "cryptg": {
            "hashes": [
@@ -872,6 +872,9 @@
            "version": "==2022.3.2"
        },
        "requests": {
+            "extras": [
+                "socks"
+            ],
            "hashes": [
                "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
                "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
@@ -925,45 +928,45 @@
        },
        "sqlalchemy": {
            "hashes": [
-                "sha256:03b5dc8b6399a51187e220ab351dfa8f81c310ce59442d047b74cd91ac4e859a",
-                "sha256:0539328f70f0c2bbaa0137be7a0787ceea7eda29e41e3a9d575c52030d4251aa",
-                "sha256:0d19eed02bf1225889e6f91cd7c2f89618919cb283846e7ed8aae1b0fe5f0403",
-                "sha256:18a965490cc0f83f3e867078096e78e97d81bf370f45debae6027331904d3348",
-                "sha256:1a6ae212067856bf2236708cb434554851cbe8099027953e94241a1743afb2b1",
-                "sha256:1ee7c85f27b1ae59c3fe8fd715ed38e73ba8c885c90f74bcbae609c0fdb0ada3",
-                "sha256:3081f61a86d0c2b6928fa7f4666bdeba0b9c7cc19619255454a71bfa60b55978",
-                "sha256:34d2e697115cab4a66d1b8ea60050004ce62f0598c6cf146ee66a4025c7cb7fc",
-                "sha256:36a3535bf9dd5f42a42d2cb6305f992888bbcb5789c615d35e34368853ae46f5",
-                "sha256:4443237f1f87d460453f6b33683f25135f3fba9dffbf2a053caae15bf838cfc2",
-                "sha256:44f0301e246c4d35d84e70192410d01509aea03a99cb963451aa0b652b7529f5",
-                "sha256:4b7371495c91319bfaa010e257eb8d3cd2d3eae14b256412d3294247ea7f0d78",
-                "sha256:5a43870fc272cf6dabf6ce7ad297a08e7f31672ea9ccd217a305c5bce8eafa9f",
-                "sha256:5a5f798fc3f3ca9b5f1bf66b50a58439d558d5df132e12cfed22e2bc167c411c",
-                "sha256:5b2718cb9e2bcf0053a83a6d0c491476b7f3832c59d82b53ccc78bb869e4027f",
-                "sha256:5ee1ce25cc21db5553f607511fb41b85b7dc5eade15536c5c238f898996987de",
-                "sha256:84747d1cc4823285b8253a34513162a664d4989217461e111097446b98803bfc",
-                "sha256:89d51682716135e0d93584c3ca625c40805a014aaddb3961074a04895793d4d3",
-                "sha256:8b4cf1effee1ef6db3f6c5185d32c0e6518bcf06212861875779676a22f68370",
-                "sha256:9473d5dadee7a81d003cf18b1f5266b0fb29a9dff985bc205d71ca8da037e18b",
-                "sha256:a11d8e12ff761101aa44404ce2df15e32f061d5559b862a847976c2efff014a1",
-                "sha256:a720cef2bcd4e645ae1fa01a143a31c04e095f26ff925f6090cb1ef7f1859e5d",
-                "sha256:bda76918f8c6da01278a97365bc17fa97d902be7f6d7596aad2bd7d9b52adbba",
-                "sha256:c3bbcfbaa33d5998698ed84eba0807a58023c86d0fa540ff4da96637815a4d92",
-                "sha256:c6545f832267bfea780c8fc6235f7a1fc87778e3e2629ddddeb88a94f9181292",
-                "sha256:c7c667579800445f390b56c37ee383639465766ebc2041a2d1f1279cda01d4a0",
-                "sha256:c8f671575256dddf1c334b2052aff88ac1c913e5430300057f1b0f2f12495019",
-                "sha256:cd4954eff9e9b9a294f632e7bdb0a4c41e23c89609c6b2f742f1321740566477",
-                "sha256:d258091ba28615ae133bf6a9975a24b9ed0bcc47f48bb1f57fa13cbe2bd4033c",
-                "sha256:d29a8c0e8c2b77f2f548acdf948543a661f6414282598147e094bab091f37af6",
-                "sha256:dd51b09d540e757dd6377f02950a80c0dc63aec6188582afdc21bf0db66efa0a",
-                "sha256:e403e1cfe7789eddba83bc7677dd8ffdaab56fb2f14eb3b6f014037b09cd8096",
-                "sha256:ea76d3f258c7b221a536fea200b64fc1b9272b48de4d1695bef616b7e5269183",
-                "sha256:eab54f6ec81c12b6184ebeacccd89567cee8fc94b2f9fba23aec30ca25fc287c",
-                "sha256:f39fb329a53043c10814fee68e123f02addc6000ed96994aedf24afe6fc30e9b",
-                "sha256:f409a1a44e3da766445600714e2ca70ddf735898382c11c5c250eb88a7b8b0d1"
+                "sha256:045d6a26c262929af0b9cb25441aae675ac04db4ea8bd2446b355617cd6b6b7d",
+                "sha256:07f4dab2deb6d34618a2ccfff3971a85923ad7c3a9a45401818870fc51d3f0cc",
+                "sha256:08aaad905aba8940f27aeb9f1f851bf63f18ef97b0062ca41f64afc4b64e0e8c",
+                "sha256:27a42894a2751e438eaed12fc0dcfe741ff2f66c14760d081222c5adc5460064",
+                "sha256:2a3e4dc7c452ba3c0f3175ad5a8e0ba49c2b0570a8d07272cf50844c8d78e74f",
+                "sha256:345306707bb0e51e7cd6e7573adafbce018894ee5e3b9c31134545f704936db0",
+                "sha256:36f08d94670315ca04c8139bd80b3e02b9dd9cc66fc11bcb96fd10ad51a051ab",
+                "sha256:3ebb97ed96f4506e2f212e1fcf0ec07a103bb194938627660a5acb4d9feae49c",
+                "sha256:40b995d7aeeb6f88a1927ce6692c0f626b59d8effd3e1d597f125e141707b37c",
+                "sha256:4414ace6e3a5e39523e55a5d9f3b215699b2ead4ff91fca98f1b659b7ab2d92a",
+                "sha256:50107d8183da3fbe5715957aa3954cd9d82aed555c5b4d3fd37fac861af422fa",
+                "sha256:50174e173d03209c34e07e7b57cca48d0082ac2390edf927aafc706c111da11e",
+                "sha256:5e88912bf192e7b5739c446d2276e1cba74cfa6c1c93eea2b2534404f6be1dbd",
+                "sha256:621d3f6c0ba2407bb97e82b649be5ca7d5b6c201dcfb964ce13f517bf1cb6305",
+                "sha256:623bac2d6bdca3f3e61cf1e1c466c5fb9f5cf08735736ee1111187b7a4108891",
+                "sha256:671f61c3db4595b0e86cc4b30f675a7c0206d9ce99f041b4f6761c7ddd1e0074",
+                "sha256:67c1c27c48875afc950bee5ee24582794f20b545e64e4f9ca94071a9b514d6ed",
+                "sha256:6a6cfd468f54d65324fd3847cfd0148b0610efa6a43e5f5fcc89f455696ae9e7",
+                "sha256:70048a83f0a1ece1fcd7189891c888e20af2c57fbd33eb760d8cece9843b896c",
+                "sha256:7ee14a7f9f76d1ef9d5e5b760c9252617c839b87eee04d1ce8325ac66ae155c4",
+                "sha256:804cf491437f3e4ce31247ab4b309b181f06ecc97d309b746d10f09439b4eb85",
+                "sha256:878c7beaafa365602762c19f638282e1885454fed1aed86f8fae038933c7c671",
+                "sha256:954ea8c527c4322afb6885944904714893af81fe9167e421273770991bf08a4a",
+                "sha256:a47bf6b7ca6c28e4f4e262fabcf5be6b907af81be36de77839c9eeda2cdf3bb3",
+                "sha256:a4fb5c6ee84a6bba4ff6f9f5379f0b3a0ffe9de7ba5a0945659b3da8d519709b",
+                "sha256:b34bbc683789559f1bc9bb685fc162e0956dbbdfbe2fbd6755a9f5982c113610",
+                "sha256:c025d45318b73c0601cca451532556cbab532b2742839ebb8cb58f9ebf06811e",
+                "sha256:c3ad7f5b61ba014f5045912aea15b03c473bb02b1c07fd92c9d2c794fa183276",
+                "sha256:c9218e3519398129e364121e0d89823e6ba2a2b77c28bfc661face0829c41433",
+                "sha256:cd5cffd1dd753828f1069f33062f3896e51c990acd957c264f40e051b3e19887",
+                "sha256:d8efcaa709ea8e7c08c3d3e7639c39b36083f5a995f397f9e6eedf5f5e4e4946",
+                "sha256:e297a5cc625e3f1367a82deedf2d48ee4d2b2bd263b8b8d2efbaaf5608b5229e",
+                "sha256:e67278ceb63270cdac0a7b89fc3c29a56f7dac9616a7ee48e7ad6b52e3b631e5",
+                "sha256:eb6558ba07409dafa18c793c34292b3265be455904966f0724c10198829477e3",
+                "sha256:f197c66663ed0f9e1178d51141d864688fb244a83f6b17f667d521e482537b2e",
+                "sha256:f47996b1810894f766c9ee689607077c6c0e0fd6761e04c12ba13efb56d50c1d"
            ],
            "index": "pypi",
-            "version": "==1.4.33"
+            "version": "==1.4.34"
        },
        "telethon": {
            "hashes": [
@@ -1138,11 +1141,11 @@
        },
        "click": {
            "hashes": [
-                "sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b",
-                "sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976"
+                "sha256:24e1a4a9ec5bf6299411369b208c1df2188d9eb8d916302fe6bf03faed227f1e",
+                "sha256:479707fe14d9ec9a0757618b7a100a0ae4c4e236fac5b7f80ca68028141a1a72"
            ],
            "markers": "python_version >= '3.7'",
-            "version": "==8.1.1"
+            "version": "==8.1.2"
        },
        "coverage": {
            "extras": [
@@ -1390,6 +1393,9 @@
            "version": "==2022.1"
        },
        "requests": {
+            "extras": [
+                "socks"
+            ],
            "hashes": [
                "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
                "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
@@ -1473,7 +1479,7 @@
                "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
                "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
            ],
-            "markers": "python_version >= '3.7'",
+            "markers": "python_full_version < '3.11.0'",
            "version": "==2.0.1"
        },
        "typing-extensions": {
--- a/cisticola/scraper/base.py
+++ b/cisticola/scraper/base.py
@@ -235,6 +235,20 @@ class Scraper:
        return archived_url

    def archive_files(self, result: ScraperResult) -> ScraperResult:
+        """Archive files corresponding to ``archived_urls`` dict keys, if the
+        files have not previously been archived.
+
+        Parameters
+        ----------
+        result: ScraperResult
+            Previously scraped ScraperResult run with ``archive_media=False``.
+
+        Returns
+        -------
+        ScraperResult
+            Same ScraperResult as ``result``, but with all URLs in the ``archived_urls`` dict archived.
+        """
+
        for url in result.archived_urls:
            if result.archived_urls[url] is None:
                media_blob, content_type, key = self.url_to_blob(url)
@@ -244,7 +258,6 @@ class Scraper:
        result.media_archived = True
        return result

-
    def can_handle(self, channel: Channel) -> bool:
        """Whether or not the scraper can scrape the specified channel.

@@ -345,7 +358,23 @@ class ScraperController:
            logger.error("No DB session")
            return

+        session = self.session()
+
+        # If any channels are not already in the database, add them
        for channel in channels:
+
+            platform_id = None
+            if channel.platform_id not in (None, ''):
+                platform_id = channel.platform_id
+
+            channel_in_db = session.query(Channel).filter_by(platform_id=platform_id, platform=channel.platform, url=channel.url).first()
+
+            if not channel_in_db:
+                logger.debug(f"{channel} does not exist in database, adding")
+                session.add(channel)
+                session.flush()
+                session.commit()
+
            handled = False

            for scraper in self.scrapers:
@@ -355,7 +384,6 @@ class ScraperController:
                    added = 0

                    # get most recent post
-                    session = self.session()
                    rows = session.query(ScraperResult).where(
                        ScraperResult.channel == channel.id).order_by(
                        ScraperResult.date.desc()).limit(1).all()
--- a/cisticola/scraper/bitchute.py
+++ b/cisticola/scraper/bitchute.py
@@ -45,9 +45,12 @@ class BitchuteScraper(Scraper):

            archived_urls = {}

-            if archive_media:
-                if 'video_url' in post:
-                    url = post['video_url']
+            if 'video_url' in post:
+                url = post['video_url']
+                archived_urls[url] = None 
+
+                if archive_media:
+
                    media_blob, content_type, key = self.url_to_blob(url)
                    archived_url = self.archive_blob(media_blob, content_type, key)
                    archived_urls[url] = archived_url
@@ -114,6 +117,7 @@ class BitchuteScraper(Scraper):
            channel=channel.id,
            raw_data=json.dumps(profile),
            date_archived=datetime.now(timezone.utc))
+            
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

 def strip_tags(html, convert_newlines=True):
--- a/cisticola/scraper/gab.py
+++ b/cisticola/scraper/gab.py
@@ -52,25 +52,24 @@ class GabScraper(Scraper):
            if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
                break

-            media_urls = []
            archived_urls = {}

-            if archive_media:
-
-                for attachment in post.get('media_attachments'):
+            for attachment in post.get('media_attachments'):
+                if attachment.get('type') == 'video':
+                    archived_urls[attachment['source_mp4']] = None
+                else:
+                    archived_urls[attachment['url']] = None
+                    
+            if post.get('reblog') is not None:
+                for attachment in post['reblog'].get('media_attachments'):
                    if attachment.get('type') == 'video':
-                        media_urls.append(attachment['source_mp4'])
+                        archived_urls[attachment['source_mp4']] = None
                    else:
-                        media_urls.append(attachment['url'])
-                        
-                if post.get('reblog') is not None:
-                    for attachment in post['reblog'].get('media_attachments'):
-                        if attachment.get('type') == 'video':
-                            media_urls.append(attachment['source_mp4'])
-                        else:
-                            media_urls.append(attachment['url'])
+                        archived_urls[attachment['url']] = None

-                for url in media_urls:
+            for url in archived_urls.keys():
+
+                if archive_media:
                    media_blob, content_type, key = self.url_to_blob(url)
                    archived_url = self.archive_blob(media_blob, content_type, key)
                    archived_urls[url] = archived_url
--- a/cisticola/scraper/gettr.py
+++ b/cisticola/scraper/gettr.py
@@ -32,26 +32,25 @@ class GettrScraper(Scraper):

            archived_urls = {}

-            if archive_media:
+            if 'imgs' in post:
+                for img in post['imgs']:
+                    url = "https://media.gettr.com/" + img
+                    archived_urls[url] = None

-                if 'imgs' in post:
-                    for img in post['imgs']:
-                        url = "https://media.gettr.com/" + img
-                        media_blob, content_type, key = self.url_to_blob(url)
-                        archived_url = self.archive_blob(media_blob, content_type, key)
-                        archived_urls[img] = archived_url
+            if 'main' in post:
+                url = "https://media.gettr.com/" + post['main']
+                archived_urls[url] = None

-                if 'main' in post:
-                    url = "https://media.gettr.com/" + post['main']
+            if 'ovid' in post:
+                url = "https://media.gettr.com/" + post['ovid']
+                archived_urls[url] = None
+
+            for url in archived_urls.keys():
+
+                if archive_media:
                    media_blob, content_type, key = self.url_to_blob(url)
                    archived_url = self.archive_blob(media_blob, content_type, key)
-                    archived_urls[post['main']] = archived_url
-
-                if 'vid' in post:
-                    url = "https://media.gettr.com/" + post['vid']
-                    media_blob, content_type, key = self.m3u8_url_to_blob(url)
-                    archived_url = self.archive_blob(media_blob, content_type, key)
-                    archived_urls[post['vid']] = archived_url
+                    archived_urls[url] = archived_url

            yield ScraperResult(
                scraper=self.__version__,
@@ -74,7 +73,7 @@ class GettrScraper(Scraper):
        return key 

    def get_profile(self, channel: Channel) -> RawChannelInfo:
-        client = client = PublicClient()
+        client = PublicClient()
        username = self.get_username_from_url(channel.url)
        profile = client.user_info(username)

--- a/cisticola/scraper/instagram.py
+++ b/cisticola/scraper/instagram.py
@@ -1,4 +1,4 @@
-from typing import Generator
+from typing import Generator, List
 from datetime import datetime, timezone
 import os
 import json
@@ -50,28 +50,14 @@ class InstagramScraper(Scraper):

            post_url = f'{BASE_URL}p/{post.shortcode}/'

-            archived_urls = {}
+            archived_urls = get_archived_urls_from_post(post = post)

-            if archive_media:
+            for url in archived_urls.keys():

-                with tempfile.TemporaryDirectory() as temp_dir:
-
-                    loader.download_post(post = post, target = Path(temp_dir))
-
-                    files = os.listdir(temp_dir)
-                    files = [f for f in files if not f.endswith('.txt')]
-
-                    for file in files:
-                        ext = file.split('.')[-1]
-                        content_type = CONTENT_TYPES[ext]
-                        filename = Path(temp_dir, file)
-                        key = f'{post.shortcode}__{file}'
-                    
-                        with open(filename, 'rb') as f:
-                            blob = f.read()
-                
-                        archived_url = self.archive_blob(blob = blob, content_type = content_type, key = key)
-                        archived_urls[post_url] = archived_url
+                if archive_media:
+                    media_blob, content_type, key = self.url_to_blob(url)
+                    archived_url = self.archive_blob(media_blob, content_type, key)
+                    archived_urls[url] = archived_url

            yield ScraperResult(
                scraper=self.__version__,
@@ -99,7 +85,7 @@ class InstagramScraper(Scraper):
                    date_archived=datetime.now(timezone.utc),
                    raw_posts=json.dumps(comment_dict, default=str),
                    archived_urls={},
-                    media_archived=archive_media)
+                    media_archived=True)

    def can_handle(self, channel):
        if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None:
@@ -127,7 +113,20 @@ class InstagramScraper(Scraper):
        profile['followees'] = user_profile.followees

        return RawChannelInfo(scraper=self.__version__,
-                        platform=channel.platform,
-                        channel=channel.id,
-                        raw_data=json.dumps(profile),
-                        date_archived=datetime.now(timezone.utc))
+            platform=channel.platform,
+            channel=channel.id,
+            raw_data=json.dumps(profile),
+            date_archived=datetime.now(timezone.utc))
+
+def get_archived_urls_from_post(post: instaloader.Post) -> dict:
+    """Return ``{media_url: None}`` for each image/video URL of ``post``; ``None`` marks a URL as not yet archived."""
+    typename = post._node['__typename']
+    if typename == 'GraphImage':
+        urls = [post._node['display_url']]
+    elif typename == 'GraphVideo':
+        urls = [post._node['video_url']]
+    elif typename == 'GraphSidecar':
+        urls = [edge['node']['display_url'] for edge in post._node['edge_sidecar_to_children']['edges']]
+    else:
+        raise NotImplementedError(f'post of type {typename} is currently not supported.')
+    return {url : None for url in urls}
--- a/cisticola/scraper/odysee.py
+++ b/cisticola/scraper/odysee.py
@@ -37,10 +37,11 @@ class OdyseeScraper(Scraper):
            if since is not None and datetime.fromtimestamp(video.info['created']) <= since.date:
                break

-            archived_urls = {}
+            url = video.info['streaming_url']
+
+            archived_urls = {url: None}

            if archive_media:
-                url = video.info['streaming_url']

                # Check if file is a video file or an m3u8 file
                r = requests.head(url)
@@ -78,6 +79,21 @@ class OdyseeScraper(Scraper):
                    archived_urls={},
                    media_archived=True)

+    def archive_files(self, result: ScraperResult) -> ScraperResult:
+        for url in result.archived_urls:
+            if result.archived_urls[url] is None:
+                r = requests.head(url)
+                if r.headers['Content-Type'] == 'text/html; charset=utf-8':
+                    media_blob, content_type, key = self.m3u8_url_to_blob(url)
+                else:
+                    media_blob, content_type, key = self.url_to_blob(url)
+
+                archived_url = self.archive_blob(media_blob, content_type, key)
+                result.archived_urls[url] = archived_url
+
+        result.media_archived = True
+        return result
+
    def can_handle(self, channel):
        if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None:
            return True
@@ -95,7 +111,7 @@ class OdyseeScraper(Scraper):
        profile = odysee_channel.info

        return RawChannelInfo(scraper=self.__version__,
-                        platform=channel.platform,
-                        channel=channel.id,
-                        raw_data=json.dumps(profile),
-                        date_archived=datetime.now(timezone.utc))
+            platform=channel.platform,
+            channel=channel.id,
+            raw_data=json.dumps(profile),
+            date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/rumble.py
+++ b/cisticola/scraper/rumble.py
@@ -21,18 +21,18 @@ class RumbleScraper(Scraper):
        scraper = get_channel_videos(channel.url)

        for post in scraper:
-            if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
+            if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
                break

-            archived_urls = {}
+            url = post['media_url']
+
+            archived_urls = {url: None}

            if archive_media:

-                url = post['media_url']
-
                media_blob, content_type, key = self.ytdlp_url_to_blob(url)
                archived_url = self.archive_blob(media_blob, content_type, key)
-                archived_urls[post['media_url']] = archived_url
+                archived_urls[url] = archived_url

            yield ScraperResult(
                scraper=self.__version__,
@@ -50,6 +50,16 @@ class RumbleScraper(Scraper):
        key = urlparse(url).path.split('/')[-2] + ext
        return key 

+    def archive_files(self, result: ScraperResult) -> ScraperResult:
+        for url in result.archived_urls:
+            if result.archived_urls[url] is None:
+                media_blob, content_type, key = self.ytdlp_url_to_blob(url)
+                archived_url = self.archive_blob(media_blob, content_type, key)
+                result.archived_urls[url] = archived_url
+
+        result.media_archived = True
+        return result
+
    def can_handle(self, channel):
        if channel.platform == "Rumble" and channel.url is not None:
            return True
@@ -59,10 +69,10 @@ class RumbleScraper(Scraper):
        profile = get_channel_profile(url = channel.url)

        return RawChannelInfo(scraper=self.__version__,
-                        platform=channel.platform,
-                        channel=channel.id,
-                        raw_data=json.dumps(profile),
-                        date_archived=datetime.now(timezone.utc))
+            platform=channel.platform,
+            channel=channel.id,
+            raw_data=json.dumps(profile),
+            date_archived=datetime.now(timezone.utc))

 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

--- a/cisticola/scraper/telegram_snscrape.py
+++ b/cisticola/scraper/telegram_snscrape.py
@@ -34,8 +34,8 @@ class TelegramSnscrapeScraper(Scraper):
            for image_url in post.images:
                archived_urls[image_url] = None

-            if post.video:
-                archived_urls[post.video] = None
+            for video_url in post.videos:
+                archived_urls[video_url] = None

            if archive_media:
                for url in archived_urls:
--- a/cisticola/scraper/twitter.py
+++ b/cisticola/scraper/twitter.py
@@ -15,7 +15,7 @@ class TwitterScraper(Scraper):
    @logger.catch
    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
        if channel.platform_id:
-            identifier = channel.platform_id
+            identifier = int(channel.platform_id)
        else:
            identifier = channel.screenname

@@ -24,7 +24,7 @@ class TwitterScraper(Scraper):
        first = True

        for tweet in scraper.get_items():
-            if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
+            if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
                # with TwitterProfileScraper, the first tweet could be an old pinned tweet
                if first:
                    first = False
@@ -106,7 +106,7 @@ class TwitterScraper(Scraper):
            raise ChannelDoesNotExistError(channel.url)
        else:   
            return RawChannelInfo(scraper=self.__version__,
-            platform=channel.platform,
-            channel=channel.id,
-            raw_data=json.dumps(entity.__dict__, default=str),
-            date_archived=datetime.now(timezone.utc))
+                platform=channel.platform,
+                channel=channel.id,
+                raw_data=json.dumps(entity.__dict__, default=str),
+                date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/vkontakte.py
+++ b/cisticola/scraper/vkontakte.py
@@ -1,8 +1,12 @@
 from datetime import datetime, timezone
 from typing import Generator
 from urllib.parse import urlparse
+import json 
+import re 
+
 from snscrape.modules.vkontakte import VKontakteUserScraper
 from loguru import logger
+from yt_dlp.extractor.vk import VKIE

 from cisticola.base import Channel, ScraperResult, RawChannelInfo
 from cisticola.scraper.base import Scraper
@@ -25,7 +29,7 @@ class VkontakteScraper(Scraper):
        first = True

        for post in scraper.get_items():
-            if since is not None and datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
+            if since is not None and datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
                # with VKontakteUserScraper, the first tweet could be an old pinned tweet
                if first:
                    first = False
@@ -35,23 +39,26 @@ class VkontakteScraper(Scraper):

            archived_urls = {}

-            if archive_media:
+            if post.photos:

-                if post.photos:
+                for photo in post.photos:
+                    variant = max(
+                        [v for v in photo.variants], key=lambda v: v.width * v.height)
+                    url = variant.url
+                    if url is not None:
+                        archived_urls[url] = None

-                    for photo in post.photos:
-                        variant = max(
-                            [v for v in photo.variants], key=lambda v: v.width * v.height)
-                        url = variant.url
-                
-                        if url is not None:
-                            media_blob, content_type, key = self.url_to_blob(url)
-                            archived_url = self.archive_blob(media_blob, content_type, key)
-                            archived_urls[url] = archived_url
+            if post.video:
+                archived_urls[post.video.url] = None

-                if post.video:
-                    url = post.video.url
-                    media_blob, content_type, key = self.ytdlp_url_to_blob(url)
+            for url in archived_urls.keys():
+
+                if archive_media:
+                    if re.match(VKIE._VALID_URL, url):
+                        # Uses regex from yt_dlp to verify VK video URL
+                        media_blob, content_type, key = self.ytdlp_url_to_blob(url)
+                    else:
+                        media_blob, content_type, key = self.url_to_blob(url)
                    archived_url = self.archive_blob(media_blob, content_type, key)
                    archived_urls[url] = archived_url

@@ -66,6 +73,21 @@ class VkontakteScraper(Scraper):
                archived_urls=archived_urls,
                media_archived=archive_media)

+    def archive_files(self, result: ScraperResult) -> ScraperResult:
+        for url in result.archived_urls:
+            if result.archived_urls[url] is None:
+                if re.match(VKIE._VALID_URL, url):
+                    # Uses regex from yt_dlp to verify VK video URL
+                    media_blob, content_type, key = self.ytdlp_url_to_blob(url)
+                else:
+                    media_blob, content_type, key = self.url_to_blob(url)
+                archived_url = self.archive_blob(media_blob, content_type, key)
+                result.archived_urls[url] = archived_url
+
+        result.media_archived = True
+        return result
+
+
    def can_handle(self, channel):
        if channel.platform == "Vkontakte" and channel.platform_id:
            return True
@@ -88,7 +110,7 @@ class VkontakteScraper(Scraper):
        profile = scraper._get_entity().__dict__

        return RawChannelInfo(scraper=self.__version__,
-                    platform=channel.platform,
-                    channel=channel.id,
-                    raw_data=json.dumps(profile),
-                    date_archived=datetime.now(timezone.utc))
+            platform=channel.platform,
+            channel=channel.id,
+            raw_data=json.dumps(profile),
+            date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/youtube.py
+++ b/cisticola/scraper/youtube.py
@@ -2,6 +2,9 @@ from datetime import datetime, timezone
 import json
 from typing import Generator
 import tempfile
+from pathlib import Path
+import os
+
 import yt_dlp
 from loguru import logger

@@ -48,7 +51,10 @@ class YoutubeScraper(Scraper):
                        
                for video in valid_videos:

-                    archived_urls = {}
+                    url = video['webpage_url']
+
+                    archived_urls = {url: None}
+                    
                    video_id = video["id"]
                    video_ext = video["ext"]

@@ -56,11 +62,8 @@ class YoutubeScraper(Scraper):
                    
                        key = f"{video_id}.{video_ext}"

-                        with open(f"{temp_dir}/{key}", "rb") as f:
+                        with open(Path(temp_dir)/key, "rb") as f:
                            media_blob = f.read()
-                        archived_url = self.archive_blob(media_blob, content_type, key)
-
-                        url = video['webpage_url']

                        archived_url = self.archive_blob(media_blob, content_type, key)
                        archived_urls[url] = archived_url
@@ -80,6 +83,41 @@ class YoutubeScraper(Scraper):
        if channel.platform == "Youtube" and channel.url:
            return True

+    def archive_files(self, result: ScraperResult) -> ScraperResult:
+        for url in result.archived_urls:
+            if result.archived_urls[url] is None:
+
+                media_blob = None
+
+                with tempfile.TemporaryDirectory() as temp_dir:
+
+                    ydl_opts = {
+                        "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
+                        "merge_output_format": "mp4",
+                        "outtmpl": f"{temp_dir}/%(id)s.%(ext)s"}
+
+                    ydl = yt_dlp.YoutubeDL(ydl_opts)
+
+                    try:
+                        ydl.download(url)
+                    except yt_dlp.utils.DownloadError as e:
+                        raise e
+                        
+                    files = os.listdir(temp_dir)
+                    if len(files) != 1:
+                        logger.warning(f'{len(files)} files downloaded for video: {url}')
+                    key = files[0]
+                    with open(Path(temp_dir, key), 'rb') as f:
+                        media_blob = f.read()
+
+                if media_blob is not None:
+                    content_type = 'video/mp4'            
+                    archived_url = self.archive_blob(media_blob, content_type, key)
+                    result.archived_urls[url] = archived_url
+
+        result.media_archived = True
+        return result
+
    def get_profile(self, channel: Channel) -> RawChannelInfo:
        ydl_opts = {}
        ydl = yt_dlp.YoutubeDL(ydl_opts)
@@ -89,12 +127,13 @@ class YoutubeScraper(Scraper):
            meta = ydl.extract_info(
                channel.url,
                process=False)
+            meta.pop('entries')

            return RawChannelInfo(scraper=self.__version__,
-                    platform=channel.platform,
-                    channel=channel.id,
-                    raw_data=json.dumps(meta),
-                    date_archived=datetime.now(timezone.utc))
+                platform=channel.platform,
+                channel=channel.id,
+                raw_data=json.dumps(meta),
+                date_archived=datetime.now(timezone.utc))

        except yt_dlp.utils.DownloadError as e:
            raise e
--- a/pytest.ini
+++ b/pytest.ini
@@ -12,10 +12,9 @@ addopts =
  --html='reports/tests.html'
  --self-contained-html
 markers = 
-    profile: marks tests for only extracting channel metadata (deselect with '-m 
-    "not profile"')
-    media: marks tests for archiving all media attachments (deselect with '-m 
-    "not media"')
+    profile: marks tests for only extracting channel metadata (deselect with '-m "not profile"')
+    media: marks tests for archiving all media attachments (deselect with '-m "not media"')
+    unarchived: marks tests for archiving all unarchived media attachments (deselect with '-m "not unarchived"')
 filterwarnings =
    ignore:the imp module is deprecated:DeprecationWarning
    ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -99,12 +99,12 @@ RUMBLE_CHANNEL_KWARGS = {
    'notes': ''}

 TELEGRAM_CHANNEL_KWARGS = {
-    'name': 'USA Freedom Convoy (test)',
-    'platform_id': -1001799578085,
+    'name': 'South West Ohio Proud Boys (test)',
+    'platform_id': -1001276612436,
    'category': 'test',
    'platform': 'Telegram',
-    'url': 'https://t.me/usafreedomconvoy2022',
-    'screenname': 'usafreedomconvoy2022',
+    'url': 'https://t.me/SouthwestOhioPB',
+    'screenname': 'SouthwestOhioPB',
    'country': 'US',
    'influencer': None,
    'public': True,
--- a/tests/scraper/bitchute.py
+++ b/tests/scraper/bitchute.py
@@ -3,12 +3,19 @@ import pytest
 from cisticola.base import Channel
 from cisticola.scraper import BitchuteScraper

+@pytest.mark.unarchived
 def test_scrape_bitchute_channel_no_media(controller, channel_kwargs):

    channels = [Channel(**channel_kwargs['bitchute'])]
    controller.register_scraper(scraper = BitchuteScraper())
    controller.scrape_channels(channels = channels, archive_media = False)

+@pytest.mark.media
+@pytest.mark.unarchived
+def test_scrape_bitchute_channel_unarchived_media(controller):
+
+    controller.archive_unarchived_media()
+
 @pytest.mark.media
 def test_scrape_bitchute_channel(controller, channel_kwargs):

--- a/tests/scraper/gab.py
+++ b/tests/scraper/gab.py
@@ -3,12 +3,19 @@ import pytest
 from cisticola.base import Channel
 from cisticola.scraper import GabScraper

+@pytest.mark.unarchived
 def test_scrape_gab_channel_no_media(controller, channel_kwargs):

    channels = [Channel(**channel_kwargs['gab'])]
    controller.register_scraper(scraper = GabScraper())
    controller.scrape_channels(channels = channels, archive_media = False)

+@pytest.mark.media
+@pytest.mark.unarchived
+def test_scrape_gab_channel_unarchived_media(controller):
+
+    controller.archive_unarchived_media()
+
 @pytest.mark.media
 def test_scrape_gab_channel(controller, channel_kwargs):
    
--- a/tests/scraper/gettr.py
+++ b/tests/scraper/gettr.py
@@ -3,12 +3,19 @@ import pytest
 from cisticola.base import Channel
 from cisticola.scraper import GettrScraper

+@pytest.mark.unarchived
 def test_scrape_gettr_channel_no_media(controller, channel_kwargs):

    channels = [Channel(**channel_kwargs['gettr'])]
    controller.register_scraper(scraper = GettrScraper())
    controller.scrape_channels(channels = channels, archive_media = False)

+@pytest.mark.media
+@pytest.mark.unarchived
+def test_scrape_gettr_channel_unarchived_media(controller):
+
+    controller.archive_unarchived_media()
+
 @pytest.mark.media
 def test_scrape_gettr_channel(controller, channel_kwargs):

--- a/tests/scraper/instagram.py
+++ b/tests/scraper/instagram.py
@@ -3,12 +3,19 @@ import pytest
 from cisticola.base import Channel
 from cisticola.scraper import InstagramScraper

+@pytest.mark.unarchived
 def test_scrape_instagram_channel_no_media(controller, channel_kwargs):

    channels = [Channel(**channel_kwargs['instagram'])]
    controller.register_scraper(scraper = InstagramScraper())
    controller.scrape_channels(channels = channels, archive_media = False)

+@pytest.mark.media
+@pytest.mark.unarchived
+def test_scrape_instagram_channel_unarchived_media(controller):
+
+    controller.archive_unarchived_media()
+
 @pytest.mark.media
 def test_scrape_instagram_channel(controller, channel_kwargs):

--- a/tests/scraper/odysee.py
+++ b/tests/scraper/odysee.py
@@ -3,12 +3,19 @@ import pytest
 from cisticola.base import Channel
 from cisticola.scraper import OdyseeScraper

+@pytest.mark.unarchived
 def test_scrape_odysee_channel_no_media(controller, channel_kwargs):

    channels = [Channel(**channel_kwargs['odysee'])]
    controller.register_scraper(scraper = OdyseeScraper())
    controller.scrape_channels(channels = channels, archive_media = False)

+@pytest.mark.media
+@pytest.mark.unarchived
+def test_scrape_odysee_channel_unarchived_media(controller):
+
+    controller.archive_unarchived_media()
+
 @pytest.mark.media
 def test_scrape_odysee_channel(controller, channel_kwargs):

--- a/tests/scraper/rumble.py
+++ b/tests/scraper/rumble.py
@@ -3,12 +3,19 @@ import pytest
 from cisticola.base import Channel
 from cisticola.scraper import RumbleScraper

+@pytest.mark.unarchived
 def test_scrape_rumble_channel_no_media(controller, channel_kwargs):

    channels = [Channel(**channel_kwargs['rumble'])]
    controller.register_scraper(scraper = RumbleScraper())
    controller.scrape_channels(channels = channels, archive_media = False)

+@pytest.mark.media
+@pytest.mark.unarchived
+def test_scrape_rumble_channel_unarchived_media(controller):
+
+    controller.archive_unarchived_media()
+
 @pytest.mark.media
 def test_scrape_rumble_channel(controller, channel_kwargs):

--- a/tests/scraper/telegram_snscrape.py
+++ b/tests/scraper/telegram_snscrape.py
@@ -3,12 +3,19 @@ import pytest
 from cisticola.base import Channel
 from cisticola.scraper import TelegramSnscrapeScraper

+@pytest.mark.unarchived
 def test_scrape_telegram_snscrape_channel_no_media(controller, channel_kwargs):

    channels = [Channel(**channel_kwargs['telegram'])]
    controller.register_scraper(scraper = TelegramSnscrapeScraper())
    controller.scrape_channels(channels = channels, archive_media = False)

+@pytest.mark.media
+@pytest.mark.unarchived
+def test_scrape_telegram_snscrape_channel_unarchived_media(controller):
+
+    controller.archive_unarchived_media()
+
 @pytest.mark.media
 def test_scrape_telegram_snscrape_channel(controller, channel_kwargs):

--- a/tests/scraper/telegram_telethon.py
+++ b/tests/scraper/telegram_telethon.py
@@ -3,6 +3,7 @@ import pytest
 from cisticola.base import Channel
 from cisticola.scraper import TelegramTelethonScraper

+@pytest.mark.unarchived
 def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
    controller.remove_all_scrapers()

@@ -10,6 +11,12 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
    controller.register_scraper(scraper = TelegramTelethonScraper())
    controller.scrape_channels(channels = channels, archive_media = False)

+@pytest.mark.media
+@pytest.mark.unarchived
+def test_scrape_telegram_telethon_unarchived_media(controller):
+
+    controller.archive_unarchived_media()
+
 @pytest.mark.media
 def test_scrape_telegram_telethon_channel(controller, channel_kwargs):

--- a/tests/scraper/twitter.py
+++ b/tests/scraper/twitter.py
@@ -3,12 +3,19 @@ import pytest
 from cisticola.base import Channel
 from cisticola.scraper import TwitterScraper

+@pytest.mark.unarchived
 def test_scrape_twitter_channel_no_media(controller, channel_kwargs):

    channels = [Channel(**channel_kwargs['twitter'])]
    controller.register_scraper(scraper = TwitterScraper())
    controller.scrape_channels(channels = channels, archive_media = False)

+@pytest.mark.media
+@pytest.mark.unarchived
+def test_scrape_twitter_channel_unarchived_media(controller):
+
+    controller.archive_unarchived_media()
+
 @pytest.mark.media
 def test_scrape_twitter_channel(controller, channel_kwargs):

--- a/tests/scraper/vkontakte.py
+++ b/tests/scraper/vkontakte.py
@@ -3,12 +3,19 @@ import pytest
 from cisticola.base import Channel
 from cisticola.scraper import VkontakteScraper

+@pytest.mark.unarchived
 def test_scrape_vkontakte_channel_no_media(controller, channel_kwargs):

    channels = [Channel(**channel_kwargs['vkontakte'])]
    controller.register_scraper(scraper = VkontakteScraper())
    controller.scrape_channels(channels = channels, archive_media = False)

+@pytest.mark.media
+@pytest.mark.unarchived
+def test_scrape_vkontakte_channel_unarchived_media(controller):
+
+    controller.archive_unarchived_media()
+
 @pytest.mark.media
 def test_scrape_vkontakte_channel(controller, channel_kwargs):

--- a/tests/scraper/youtube.py
+++ b/tests/scraper/youtube.py
@@ -3,12 +3,19 @@ import pytest
 from cisticola.base import Channel
 from cisticola.scraper import YoutubeScraper

+@pytest.mark.unarchived
 def test_scrape_youtube_channel_no_media(controller, channel_kwargs):

    channels = [Channel(**channel_kwargs['youtube'])]
    controller.register_scraper(scraper = YoutubeScraper())
    controller.scrape_channels(channels = channels, archive_media = False)

+@pytest.mark.media
+@pytest.mark.unarchived
+def test_scrape_youtube_channel_unarchived_media(controller):
+
+    controller.archive_unarchived_media()
+
 @pytest.mark.media
 def test_scrape_youtube_channel(controller, channel_kwargs):