diff --git a/Pipfile.lock b/Pipfile.lock index 44b2115..4629c08 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -26,19 +26,19 @@ }, "boto3": { "hashes": [ - "sha256:ef210f8e85cdb6d26a38ebad1cfe9cefdef2ab269207e5987653555375a7ef6b", - "sha256:f0af8f4ef5fe6353c794cd3cce627d469a618b58ace7ca75a63cfd719df615ce" + "sha256:35f68b60652bff50e7bc926238443cb578f29f120908bb945e5640e90c6dd53e", + "sha256:7f3f93ee97215862ccd1a216f37deb7d64055c71f826b821805904df7b84ee6a" ], "index": "pypi", - "version": "==1.21.30" + "version": "==1.21.31" }, "botocore": { "hashes": [ - "sha256:af4bdc51eeecbe9fdcdadbed9ad58c5c91380ef30f3560022bbc2ee1d78f0ad6", - "sha256:c622751093e3d0bf61343e66d6d06190ef30bf42b1557d5070ca84e9efa06d4b" + "sha256:3bb21e3ee5e4de3ed76bb99b4496a46e9b5c82e7b7fdb62702f11dda1b57b769", + "sha256:424fd94bef86a11f5340dc15eb50602dedec2ecc01c3a25c4fea23a2c8195500" ], "markers": "python_version >= '3.6'", - "version": "==1.24.30" + "version": "==1.24.31" }, "brotli": { "hashes": [ @@ -195,11 +195,11 @@ }, "click": { "hashes": [ - "sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b", - "sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976" + "sha256:24e1a4a9ec5bf6299411369b208c1df2188d9eb8d916302fe6bf03faed227f1e", + "sha256:479707fe14d9ec9a0757618b7a100a0ae4c4e236fac5b7f80ca68028141a1a72" ], "markers": "python_version >= '3.7'", - "version": "==8.1.1" + "version": "==8.1.2" }, "cryptg": { "hashes": [ @@ -872,6 +872,9 @@ "version": "==2022.3.2" }, "requests": { + "extras": [ + "socks" + ], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" @@ -925,45 +928,45 @@ }, "sqlalchemy": { "hashes": [ - "sha256:03b5dc8b6399a51187e220ab351dfa8f81c310ce59442d047b74cd91ac4e859a", - "sha256:0539328f70f0c2bbaa0137be7a0787ceea7eda29e41e3a9d575c52030d4251aa", - "sha256:0d19eed02bf1225889e6f91cd7c2f89618919cb283846e7ed8aae1b0fe5f0403", - "sha256:18a965490cc0f83f3e867078096e78e97d81bf370f45debae6027331904d3348", - "sha256:1a6ae212067856bf2236708cb434554851cbe8099027953e94241a1743afb2b1", - "sha256:1ee7c85f27b1ae59c3fe8fd715ed38e73ba8c885c90f74bcbae609c0fdb0ada3", - "sha256:3081f61a86d0c2b6928fa7f4666bdeba0b9c7cc19619255454a71bfa60b55978", - "sha256:34d2e697115cab4a66d1b8ea60050004ce62f0598c6cf146ee66a4025c7cb7fc", - "sha256:36a3535bf9dd5f42a42d2cb6305f992888bbcb5789c615d35e34368853ae46f5", - "sha256:4443237f1f87d460453f6b33683f25135f3fba9dffbf2a053caae15bf838cfc2", - "sha256:44f0301e246c4d35d84e70192410d01509aea03a99cb963451aa0b652b7529f5", - "sha256:4b7371495c91319bfaa010e257eb8d3cd2d3eae14b256412d3294247ea7f0d78", - "sha256:5a43870fc272cf6dabf6ce7ad297a08e7f31672ea9ccd217a305c5bce8eafa9f", - "sha256:5a5f798fc3f3ca9b5f1bf66b50a58439d558d5df132e12cfed22e2bc167c411c", - "sha256:5b2718cb9e2bcf0053a83a6d0c491476b7f3832c59d82b53ccc78bb869e4027f", - "sha256:5ee1ce25cc21db5553f607511fb41b85b7dc5eade15536c5c238f898996987de", - "sha256:84747d1cc4823285b8253a34513162a664d4989217461e111097446b98803bfc", - "sha256:89d51682716135e0d93584c3ca625c40805a014aaddb3961074a04895793d4d3", - "sha256:8b4cf1effee1ef6db3f6c5185d32c0e6518bcf06212861875779676a22f68370", - "sha256:9473d5dadee7a81d003cf18b1f5266b0fb29a9dff985bc205d71ca8da037e18b", - "sha256:a11d8e12ff761101aa44404ce2df15e32f061d5559b862a847976c2efff014a1", - "sha256:a720cef2bcd4e645ae1fa01a143a31c04e095f26ff925f6090cb1ef7f1859e5d", - "sha256:bda76918f8c6da01278a97365bc17fa97d902be7f6d7596aad2bd7d9b52adbba", - "sha256:c3bbcfbaa33d5998698ed84eba0807a58023c86d0fa540ff4da96637815a4d92", - "sha256:c6545f832267bfea780c8fc6235f7a1fc87778e3e2629ddddeb88a94f9181292", - "sha256:c7c667579800445f390b56c37ee383639465766ebc2041a2d1f1279cda01d4a0", - "sha256:c8f671575256dddf1c334b2052aff88ac1c913e5430300057f1b0f2f12495019", - "sha256:cd4954eff9e9b9a294f632e7bdb0a4c41e23c89609c6b2f742f1321740566477", - "sha256:d258091ba28615ae133bf6a9975a24b9ed0bcc47f48bb1f57fa13cbe2bd4033c", - "sha256:d29a8c0e8c2b77f2f548acdf948543a661f6414282598147e094bab091f37af6", - "sha256:dd51b09d540e757dd6377f02950a80c0dc63aec6188582afdc21bf0db66efa0a", - "sha256:e403e1cfe7789eddba83bc7677dd8ffdaab56fb2f14eb3b6f014037b09cd8096", - "sha256:ea76d3f258c7b221a536fea200b64fc1b9272b48de4d1695bef616b7e5269183", - "sha256:eab54f6ec81c12b6184ebeacccd89567cee8fc94b2f9fba23aec30ca25fc287c", - "sha256:f39fb329a53043c10814fee68e123f02addc6000ed96994aedf24afe6fc30e9b", - "sha256:f409a1a44e3da766445600714e2ca70ddf735898382c11c5c250eb88a7b8b0d1" + "sha256:045d6a26c262929af0b9cb25441aae675ac04db4ea8bd2446b355617cd6b6b7d", + "sha256:07f4dab2deb6d34618a2ccfff3971a85923ad7c3a9a45401818870fc51d3f0cc", + "sha256:08aaad905aba8940f27aeb9f1f851bf63f18ef97b0062ca41f64afc4b64e0e8c", + "sha256:27a42894a2751e438eaed12fc0dcfe741ff2f66c14760d081222c5adc5460064", + "sha256:2a3e4dc7c452ba3c0f3175ad5a8e0ba49c2b0570a8d07272cf50844c8d78e74f", + "sha256:345306707bb0e51e7cd6e7573adafbce018894ee5e3b9c31134545f704936db0", + "sha256:36f08d94670315ca04c8139bd80b3e02b9dd9cc66fc11bcb96fd10ad51a051ab", + "sha256:3ebb97ed96f4506e2f212e1fcf0ec07a103bb194938627660a5acb4d9feae49c", + "sha256:40b995d7aeeb6f88a1927ce6692c0f626b59d8effd3e1d597f125e141707b37c", + "sha256:4414ace6e3a5e39523e55a5d9f3b215699b2ead4ff91fca98f1b659b7ab2d92a", + "sha256:50107d8183da3fbe5715957aa3954cd9d82aed555c5b4d3fd37fac861af422fa", + "sha256:50174e173d03209c34e07e7b57cca48d0082ac2390edf927aafc706c111da11e", + "sha256:5e88912bf192e7b5739c446d2276e1cba74cfa6c1c93eea2b2534404f6be1dbd", + "sha256:621d3f6c0ba2407bb97e82b649be5ca7d5b6c201dcfb964ce13f517bf1cb6305", + "sha256:623bac2d6bdca3f3e61cf1e1c466c5fb9f5cf08735736ee1111187b7a4108891", + "sha256:671f61c3db4595b0e86cc4b30f675a7c0206d9ce99f041b4f6761c7ddd1e0074", + "sha256:67c1c27c48875afc950bee5ee24582794f20b545e64e4f9ca94071a9b514d6ed", + "sha256:6a6cfd468f54d65324fd3847cfd0148b0610efa6a43e5f5fcc89f455696ae9e7", + "sha256:70048a83f0a1ece1fcd7189891c888e20af2c57fbd33eb760d8cece9843b896c", + "sha256:7ee14a7f9f76d1ef9d5e5b760c9252617c839b87eee04d1ce8325ac66ae155c4", + "sha256:804cf491437f3e4ce31247ab4b309b181f06ecc97d309b746d10f09439b4eb85", + "sha256:878c7beaafa365602762c19f638282e1885454fed1aed86f8fae038933c7c671", + "sha256:954ea8c527c4322afb6885944904714893af81fe9167e421273770991bf08a4a", + "sha256:a47bf6b7ca6c28e4f4e262fabcf5be6b907af81be36de77839c9eeda2cdf3bb3", + "sha256:a4fb5c6ee84a6bba4ff6f9f5379f0b3a0ffe9de7ba5a0945659b3da8d519709b", + "sha256:b34bbc683789559f1bc9bb685fc162e0956dbbdfbe2fbd6755a9f5982c113610", + "sha256:c025d45318b73c0601cca451532556cbab532b2742839ebb8cb58f9ebf06811e", + "sha256:c3ad7f5b61ba014f5045912aea15b03c473bb02b1c07fd92c9d2c794fa183276", + "sha256:c9218e3519398129e364121e0d89823e6ba2a2b77c28bfc661face0829c41433", + "sha256:cd5cffd1dd753828f1069f33062f3896e51c990acd957c264f40e051b3e19887", + "sha256:d8efcaa709ea8e7c08c3d3e7639c39b36083f5a995f397f9e6eedf5f5e4e4946", + "sha256:e297a5cc625e3f1367a82deedf2d48ee4d2b2bd263b8b8d2efbaaf5608b5229e", + "sha256:e67278ceb63270cdac0a7b89fc3c29a56f7dac9616a7ee48e7ad6b52e3b631e5", + "sha256:eb6558ba07409dafa18c793c34292b3265be455904966f0724c10198829477e3", + "sha256:f197c66663ed0f9e1178d51141d864688fb244a83f6b17f667d521e482537b2e", + "sha256:f47996b1810894f766c9ee689607077c6c0e0fd6761e04c12ba13efb56d50c1d" ], "index": "pypi", - "version": "==1.4.33" + "version": "==1.4.34" }, "telethon": { "hashes": [ @@ -1138,11 +1141,11 @@ }, "click": { "hashes": [ - "sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b", - "sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976" + "sha256:24e1a4a9ec5bf6299411369b208c1df2188d9eb8d916302fe6bf03faed227f1e", + "sha256:479707fe14d9ec9a0757618b7a100a0ae4c4e236fac5b7f80ca68028141a1a72" ], "markers": "python_version >= '3.7'", - "version": "==8.1.1" + "version": "==8.1.2" }, "coverage": { "extras": [ @@ -1390,6 +1393,9 @@ "version": "==2022.1" }, "requests": { + "extras": [ + "socks" + ], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" @@ -1473,7 +1479,7 @@ "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" ], - "markers": "python_version >= '3.7'", + "markers": "python_full_version < '3.11.0'", "version": "==2.0.1" }, "typing-extensions": { diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 3b24a21..0762c16 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -235,6 +235,20 @@ class Scraper: return archived_url def archive_files(self, result: ScraperResult) -> ScraperResult: + """Archive files corresponding to ``archived_url`` dict keys, if the + files have not previously been archived. + + Parameters + ---------- + result: ScraperResult + Previously scraped ScraperResult run with ``archive_media=False``. + + Returns + ------- + ScraperResult + Same ScraperResult as ``result``, but with all URLs in ``archived_url`` dict archived. + """ + for url in result.archived_urls: if result.archived_urls[url] is None: media_blob, content_type, key = self.url_to_blob(url) @@ -244,7 +258,6 @@ class Scraper: result.media_archived = True return result - def can_handle(self, channel: Channel) -> bool: """Whether or not the scraper can scrape the specified channel. @@ -345,7 +358,23 @@ class ScraperController: logger.error("No DB session") return + session = self.session() + + # If any channels are not already in the database, add them for channel in channels: + + platform_id = None + if channel.platform_id not in (None, ''): + platform_id = channel.platform_id + + channel_in_db = session.query(Channel).filter_by(platform_id=platform_id, platform=channel.platform, url=channel.url).first() + + if not channel_in_db: + logger.debug(f"{channel} does not exist in database, adding") + session.add(channel) + session.flush() + session.commit() + handled = False for scraper in self.scrapers: @@ -355,7 +384,6 @@ class ScraperController: added = 0 # get most recent post - session = self.session() rows = session.query(ScraperResult).where( ScraperResult.channel == channel.id).order_by( ScraperResult.date.desc()).limit(1).all() diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index edac872..5afa4e1 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -45,9 +45,12 @@ class BitchuteScraper(Scraper): archived_urls = {} - if archive_media: - if 'video_url' in post: - url = post['video_url'] + if 'video_url' in post: + url = post['video_url'] + archived_urls[url] = None + + if archive_media: + media_blob, content_type, key = self.url_to_blob(url) archived_url = self.archive_blob(media_blob, content_type, key) archived_urls[url] = archived_url @@ -114,6 +117,7 @@ class BitchuteScraper(Scraper): channel=channel.id, raw_data=json.dumps(profile), date_archived=datetime.now(timezone.utc)) + #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# def strip_tags(html, convert_newlines=True): diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index b38b16f..ab1cdf3 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -52,25 +52,24 @@ class GabScraper(Scraper): if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): break - media_urls = [] archived_urls = {} - if archive_media: - - for attachment in post.get('media_attachments'): + for attachment in post.get('media_attachments'): + if attachment.get('type') == 'video': + archived_urls[attachment['source_mp4']] = None + else: + archived_urls[attachment['url']] = None + + if post.get('reblog') is not None: + for attachment in post['reblog'].get('media_attachments'): if attachment.get('type') == 'video': - media_urls.append(attachment['source_mp4']) + archived_urls[attachment['source_mp4']] = None else: - media_urls.append(attachment['url']) - - if post.get('reblog') is not None: - for attachment in post['reblog'].get('media_attachments'): - if attachment.get('type') == 'video': - media_urls.append(attachment['source_mp4']) - else: - media_urls.append(attachment['url']) + archived_urls[attachment['url']] = None - for url in media_urls: + for url in archived_urls.keys(): + + if archive_media: media_blob, content_type, key = self.url_to_blob(url) archived_url = self.archive_blob(media_blob, content_type, key) archived_urls[url] = archived_url diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index f95170d..89ed35f 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -32,26 +32,25 @@ class GettrScraper(Scraper): archived_urls = {} - if archive_media: + if 'imgs' in post: + for img in post['imgs']: + url = "https://media.gettr.com/" + img + archived_urls[url] = None - if 'imgs' in post: - for img in post['imgs']: - url = "https://media.gettr.com/" + img - media_blob, content_type, key = self.url_to_blob(url) - archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[img] = archived_url + if 'main' in post: + url = "https://media.gettr.com/" + post['main'] + archived_urls[url] = None - if 'main' in post: - url = "https://media.gettr.com/" + post['main'] + if 'ovid' in post: + url = "https://media.gettr.com/" + post['ovid'] + archived_urls[url] = None + + for url in archived_urls.keys(): + + if archive_media: media_blob, content_type, key = self.url_to_blob(url) archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[post['main']] = archived_url - - if 'vid' in post: - url = "https://media.gettr.com/" + post['vid'] - media_blob, content_type, key = self.m3u8_url_to_blob(url) - archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[post['vid']] = archived_url + archived_urls[url] = archived_url yield ScraperResult( scraper=self.__version__, @@ -74,7 +73,7 @@ class GettrScraper(Scraper): return key def get_profile(self, channel: Channel) -> RawChannelInfo: - client = client = PublicClient() + client = PublicClient() username = self.get_username_from_url(channel.url) profile = client.user_info(username) diff --git a/cisticola/scraper/instagram.py b/cisticola/scraper/instagram.py index 2e79d20..4dbc205 100644 --- a/cisticola/scraper/instagram.py +++ b/cisticola/scraper/instagram.py @@ -1,4 +1,4 @@ -from typing import Generator +from typing import Generator, List from datetime import datetime, timezone import os import json @@ -50,28 +50,14 @@ class InstagramScraper(Scraper): post_url = f'{BASE_URL}p/{post.shortcode}/' - archived_urls = {} + archived_urls = get_archived_urls_from_post(post = post) - if archive_media: + for url in archived_urls.keys(): - with tempfile.TemporaryDirectory() as temp_dir: - - loader.download_post(post = post, target = Path(temp_dir)) - - files = os.listdir(temp_dir) - files = [f for f in files if not f.endswith('.txt')] - - for file in files: - ext = file.split('.')[-1] - content_type = CONTENT_TYPES[ext] - filename = Path(temp_dir, file) - key = f'{post.shortcode}__{file}' - - with open(filename, 'rb') as f: - blob = f.read() - - archived_url = self.archive_blob(blob = blob, content_type = content_type, key = key) - archived_urls[post_url] = archived_url + if archive_media: + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_blob(media_blob, content_type, key) + archived_urls[url] = archived_url yield ScraperResult( scraper=self.__version__, @@ -99,7 +85,7 @@ class InstagramScraper(Scraper): date_archived=datetime.now(timezone.utc), raw_posts=json.dumps(comment_dict, default=str), archived_urls={}, - media_archived=archive_media) + media_archived=True) def can_handle(self, channel): if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None: @@ -127,7 +113,20 @@ class InstagramScraper(Scraper): profile['followees'] = user_profile.followees return RawChannelInfo(scraper=self.__version__, - platform=channel.platform, - channel=channel.id, - raw_data=json.dumps(profile), - date_archived=datetime.now(timezone.utc)) + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(profile), + date_archived=datetime.now(timezone.utc)) + +def get_archived_urls_from_post(post: instaloader.Post) -> List[str]: + typename = post._node['__typename'] + if typename == 'GraphImage': + urls = [post._node['display_url']] + elif typename == 'GraphVideo': + urls = [post._node['video_url']] + elif typename == 'GraphSidecar': + urls = [edge['node']['display_url'] for edge in post._node['edge_sidecar_to_children']['edges']] + else: + raise NotImplementedError(f'post of type {typename} is currently not supported.') + + return {url : None for url in urls} \ No newline at end of file diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index c17dd6b..0f7a3fe 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -37,10 +37,11 @@ class OdyseeScraper(Scraper): if since is not None and datetime.fromtimestamp(video.info['created']) <= since.date: break - archived_urls = {} + url = video.info['streaming_url'] + + archived_urls = {url: None} if archive_media: - url = video.info['streaming_url'] # Check if file is a video file or an m3u8 file r = requests.head(url) @@ -78,6 +79,21 @@ class OdyseeScraper(Scraper): archived_urls={}, media_archived=True) + def archive_files(self, result: ScraperResult) -> ScraperResult: + for url in result.archived_urls: + if result.archived_urls[url] is None: + r = requests.head(url) + if r.headers['Content-Type'] == 'text/html; charset=utf-8': + media_blob, content_type, key = self.m3u8_url_to_blob(url) + else: + media_blob, content_type, key = self.url_to_blob(url) + + archived_url = self.archive_blob(media_blob, content_type, key) + result.archived_urls[url] = archived_url + + result.media_archived = True + return result + def can_handle(self, channel): if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None: return True @@ -95,7 +111,7 @@ class OdyseeScraper(Scraper): profile = odysee_channel.info return RawChannelInfo(scraper=self.__version__, - platform=channel.platform, - channel=channel.id, - raw_data=json.dumps(profile), - date_archived=datetime.now(timezone.utc)) \ No newline at end of file + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(profile), + date_archived=datetime.now(timezone.utc)) \ No newline at end of file diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index 1d68fda..737be05 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -21,18 +21,18 @@ class RumbleScraper(Scraper): scraper = get_channel_videos(channel.url) for post in scraper: - if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc): + if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): break - archived_urls = {} + url = post['media_url'] + + archived_urls = {url: None} if archive_media: - url = post['media_url'] - media_blob, content_type, key = self.ytdlp_url_to_blob(url) archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[post['media_url']] = archived_url + archived_urls[url] = archived_url yield ScraperResult( scraper=self.__version__, @@ -50,6 +50,16 @@ class RumbleScraper(Scraper): key = urlparse(url).path.split('/')[-2] + ext return key + def archive_files(self, result: ScraperResult) -> ScraperResult: + for url in result.archived_urls: + if result.archived_urls[url] is None: + media_blob, content_type, key = self.ytdlp_url_to_blob(url) + archived_url = self.archive_blob(media_blob, content_type, key) + result.archived_urls[url] = archived_url + + result.media_archived = True + return result + def can_handle(self, channel): if channel.platform == "Rumble" and channel.url is not None: return True @@ -59,10 +69,10 @@ class RumbleScraper(Scraper): profile = get_channel_profile(url = channel.url) return RawChannelInfo(scraper=self.__version__, - platform=channel.platform, - channel=channel.id, - raw_data=json.dumps(profile), - date_archived=datetime.now(timezone.utc)) + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(profile), + date_archived=datetime.now(timezone.utc)) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index 3678ffc..9b91203 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -34,8 +34,8 @@ class TelegramSnscrapeScraper(Scraper): for image_url in post.images: archived_urls[image_url] = None - if post.video: - archived_urls[post.video] = None + for video_url in post.videos: + archived_urls[video_url] = None if archive_media: for url in archived_urls: diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index a9c8c8c..a361252 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -15,7 +15,7 @@ class TwitterScraper(Scraper): @logger.catch def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: if channel.platform_id: - identifier = channel.platform_id + identifier = int(channel.platform_id) else: identifier = channel.screenname @@ -24,7 +24,7 @@ class TwitterScraper(Scraper): first = True for tweet in scraper.get_items(): - if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc): + if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): # with TwitterProfileScraper, the first tweet could be an old pinned tweet if first: first = False @@ -106,7 +106,7 @@ class TwitterScraper(Scraper): raise ChannelDoesNotExistError(channel.url) else: return RawChannelInfo(scraper=self.__version__, - platform=channel.platform, - channel=channel.id, - raw_data=json.dumps(entity.__dict__, default=str), - date_archived=datetime.now(timezone.utc)) + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(entity.__dict__, default=str), + date_archived=datetime.now(timezone.utc)) diff --git a/cisticola/scraper/vkontakte.py b/cisticola/scraper/vkontakte.py index 3f4130c..f36ac12 100644 --- a/cisticola/scraper/vkontakte.py +++ b/cisticola/scraper/vkontakte.py @@ -1,8 +1,12 @@ from datetime import datetime, timezone from typing import Generator from urllib.parse import urlparse +import json +import re + from snscrape.modules.vkontakte import VKontakteUserScraper from loguru import logger +from yt_dlp.extractor.vk import VKIE from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.scraper.base import Scraper @@ -25,7 +29,7 @@ class VkontakteScraper(Scraper): first = True for post in scraper.get_items(): - if since is not None and datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc): + if since is not None and datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): # with VKontakteUserScraper, the first tweet could be an old pinned tweet if first: first = False @@ -35,23 +39,26 @@ class VkontakteScraper(Scraper): archived_urls = {} - if archive_media: + if post.photos: - if post.photos: + for photo in post.photos: + variant = max( + [v for v in photo.variants], key=lambda v: v.width * v.height) + url = variant.url + if url is not None: + archived_urls[url] = None - for photo in post.photos: - variant = max( - [v for v in photo.variants], key=lambda v: v.width * v.height) - url = variant.url - - if url is not None: - media_blob, content_type, key = self.url_to_blob(url) - archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[url] = archived_url + if post.video: + archived_urls[post.video.url] = None - if post.video: - url = post.video.url - media_blob, content_type, key = self.ytdlp_url_to_blob(url) + for url in archived_urls.keys(): + + if archive_media: + if re.match(VKIE._VALID_URL, url): + # Uses regex from yt_dlp to verify VK video URL + media_blob, content_type, key = self.ytdlp_url_to_blob(url) + else: + media_blob, content_type, key = self.url_to_blob(url) archived_url = self.archive_blob(media_blob, content_type, key) archived_urls[url] = archived_url @@ -66,6 +73,21 @@ class VkontakteScraper(Scraper): archived_urls=archived_urls, media_archived=archive_media) + def archive_files(self, result: ScraperResult) -> ScraperResult: + for url in result.archived_urls: + if result.archived_urls[url] is None: + if re.match(VKIE._VALID_URL, url): + # Uses regex from yt_dlp to verify VK video URL + media_blob, content_type, key = self.ytdlp_url_to_blob(url) + else: + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_blob(media_blob, content_type, key) + result.archived_urls[url] = archived_url + + result.media_archived = True + return result + + def can_handle(self, channel): if channel.platform == "Vkontakte" and channel.platform_id: return True @@ -88,7 +110,7 @@ class VkontakteScraper(Scraper): profile = scraper._get_entity().__dict__ return RawChannelInfo(scraper=self.__version__, - platform=channel.platform, - channel=channel.id, - raw_data=json.dumps(profile), - date_archived=datetime.now(timezone.utc)) + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(profile), + date_archived=datetime.now(timezone.utc)) diff --git a/cisticola/scraper/youtube.py b/cisticola/scraper/youtube.py index a7b85ac..6b14d98 100644 --- a/cisticola/scraper/youtube.py +++ b/cisticola/scraper/youtube.py @@ -2,6 +2,9 @@ from datetime import datetime, timezone import json from typing import Generator import tempfile +from pathlib import Path +import os + import yt_dlp from loguru import logger @@ -48,7 +51,10 @@ class YoutubeScraper(Scraper): for video in valid_videos: - archived_urls = {} + url = video['webpage_url'] + + archived_urls = {url: None} + video_id = video["id"] video_ext = video["ext"] @@ -56,11 +62,8 @@ class YoutubeScraper(Scraper): key = f"{video_id}.{video_ext}" - with open(f"{temp_dir}/{key}", "rb") as f: + with open(Path(temp_dir)/key, "rb") as f: media_blob = f.read() - archived_url = self.archive_blob(media_blob, content_type, key) - - url = video['webpage_url'] archived_url = self.archive_blob(media_blob, content_type, key) archived_urls[url] = archived_url @@ -80,6 +83,41 @@ class YoutubeScraper(Scraper): if channel.platform == "Youtube" and channel.url: return True + def archive_files(self, result: ScraperResult) -> ScraperResult: + for url in result.archived_urls: + if result.archived_urls[url] is None: + + media_blob = None + + with tempfile.TemporaryDirectory() as temp_dir: + + ydl_opts = { + "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", + "merge_output_format": "mp4", + "outtmpl": f"{temp_dir}/%(id)s.%(ext)s"} + + ydl = yt_dlp.YoutubeDL(ydl_opts) + + try: + ydl.download(url) + except yt_dlp.utils.DownloadError as e: + raise e + + files = os.listdir(temp_dir) + if len(files) != 1: + logger.warning(f'{len(files)} files downloaded for video: {url}') + key = files[0] + with open(Path(temp_dir, key), 'rb') as f: + media_blob = f.read() + + if media_blob is not None: + content_type = 'video/mp4' + archived_url = self.archive_blob(media_blob, content_type, key) + result.archived_urls[url] = archived_url + + result.media_archived = True + return result + def get_profile(self, channel: Channel) -> RawChannelInfo: ydl_opts = {} ydl = yt_dlp.YoutubeDL(ydl_opts) @@ -89,12 +127,13 @@ class YoutubeScraper(Scraper): meta = ydl.extract_info( channel.url, process=False) + meta.pop('entries') return RawChannelInfo(scraper=self.__version__, - platform=channel.platform, - channel=channel.id, - raw_data=json.dumps(meta), - date_archived=datetime.now(timezone.utc)) + platform=channel.platform, + channel=channel.id, + raw_data=json.dumps(meta), + date_archived=datetime.now(timezone.utc)) except yt_dlp.utils.DownloadError as e: raise e diff --git a/pytest.ini b/pytest.ini index 744f87d..ae2a8b6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -12,10 +12,9 @@ addopts = --html='reports/tests.html' --self-contained-html markers = - profile: marks tests for only extracting channel metadata (deselect with '-m - "not profile"') - media: marks tests for archiving all media attachments (deselect with '-m - "not media"') + profile: marks tests for only extracting channel metadata (deselect with '-m "not profile"') + media: marks tests for archiving all media attachments (deselect with '-m "not media"') + unarchived: marks tests for archiving all unarchived media attachments (deselect with '-m "not unarchived"') filterwarnings = ignore:the imp module is deprecated:DeprecationWarning ignore:The localize method is no longer necessary, as this time zone supports the fold attribute diff --git a/tests/conftest.py b/tests/conftest.py index 3bccf81..684c15d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -99,12 +99,12 @@ RUMBLE_CHANNEL_KWARGS = { 'notes': ''} TELEGRAM_CHANNEL_KWARGS = { - 'name': 'USA Freedom Convoy (test)', - 'platform_id': -1001799578085, + 'name': 'South West Ohio Proud Boys (test)', + 'platform_id': -1001276612436, 'category': 'test', 'platform': 'Telegram', - 'url': 'https://t.me/usafreedomconvoy2022', - 'screenname': 'usafreedomconvoy2022', + 'url': 'https://t.me/SouthwestOhioPB', + 'screenname': 'SouthwestOhioPB', 'country': 'US', 'influencer': None, 'public': True, diff --git a/tests/scraper/bitchute.py b/tests/scraper/bitchute.py index 94707ec..62b3ffe 100644 --- a/tests/scraper/bitchute.py +++ b/tests/scraper/bitchute.py @@ -3,12 +3,19 @@ import pytest from cisticola.base import Channel from cisticola.scraper import BitchuteScraper +@pytest.mark.unarchived def test_scrape_bitchute_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['bitchute'])] controller.register_scraper(scraper = BitchuteScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_bitchute_channel_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_bitchute_channel(controller, channel_kwargs): diff --git a/tests/scraper/gab.py b/tests/scraper/gab.py index d600429..79ba8d7 100644 --- a/tests/scraper/gab.py +++ b/tests/scraper/gab.py @@ -3,12 +3,19 @@ import pytest from cisticola.base import Channel from cisticola.scraper import GabScraper +@pytest.mark.unarchived def test_scrape_gab_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['gab'])] controller.register_scraper(scraper = GabScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_gab_channel_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_gab_channel(controller, channel_kwargs): diff --git a/tests/scraper/gettr.py b/tests/scraper/gettr.py index 81a8bb8..352e839 100644 --- a/tests/scraper/gettr.py +++ b/tests/scraper/gettr.py @@ -3,12 +3,19 @@ import pytest from cisticola.base import Channel from cisticola.scraper import GettrScraper +@pytest.mark.unarchived def test_scrape_gettr_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['gettr'])] controller.register_scraper(scraper = GettrScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_gettr_channel_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_gettr_channel(controller, channel_kwargs): diff --git a/tests/scraper/instagram.py b/tests/scraper/instagram.py index 98a0684..099ab40 100644 --- a/tests/scraper/instagram.py +++ b/tests/scraper/instagram.py @@ -3,12 +3,19 @@ import pytest from cisticola.base import Channel from cisticola.scraper import InstagramScraper +@pytest.mark.unarchived def test_scrape_instagram_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['instagram'])] controller.register_scraper(scraper = InstagramScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_instagram_channel_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_instagram_channel(controller, channel_kwargs): diff --git a/tests/scraper/odysee.py b/tests/scraper/odysee.py index 84a45f8..9883bdb 100644 --- a/tests/scraper/odysee.py +++ b/tests/scraper/odysee.py @@ -3,12 +3,19 @@ import pytest from cisticola.base import Channel from cisticola.scraper import OdyseeScraper +@pytest.mark.unarchived def test_scrape_odysee_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['odysee'])] controller.register_scraper(scraper = OdyseeScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_odysee_channel_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_odysee_channel(controller, channel_kwargs): diff --git a/tests/scraper/rumble.py b/tests/scraper/rumble.py index 18c8749..5b01f9c 100644 --- a/tests/scraper/rumble.py +++ b/tests/scraper/rumble.py @@ -3,12 +3,19 @@ import pytest from cisticola.base import Channel from cisticola.scraper import RumbleScraper +@pytest.mark.unarchived def test_scrape_rumble_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['rumble'])] controller.register_scraper(scraper = RumbleScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_rumble_channel_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_rumble_channel(controller, channel_kwargs): diff --git a/tests/scraper/telegram_snscrape.py b/tests/scraper/telegram_snscrape.py index dbaed43..5dbe151 100644 --- a/tests/scraper/telegram_snscrape.py +++ b/tests/scraper/telegram_snscrape.py @@ -3,12 +3,19 @@ import pytest from cisticola.base import Channel from cisticola.scraper import TelegramSnscrapeScraper +@pytest.mark.unarchived def test_scrape_telegram_snscrape_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['telegram'])] controller.register_scraper(scraper = TelegramSnscrapeScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_telegram_snscrape_channel_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_telegram_snscrape_channel(controller, channel_kwargs): diff --git a/tests/scraper/telegram_telethon.py b/tests/scraper/telegram_telethon.py index ee994eb..8dbe9ff 100644 --- a/tests/scraper/telegram_telethon.py +++ b/tests/scraper/telegram_telethon.py @@ -3,6 +3,7 @@ import pytest from cisticola.base import Channel from cisticola.scraper import TelegramTelethonScraper +@pytest.mark.unarchived def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs): controller.remove_all_scrapers() @@ -10,6 +11,12 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = TelegramTelethonScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_telegram_telethon_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_telegram_telethon_channel(controller, channel_kwargs): diff --git a/tests/scraper/twitter.py b/tests/scraper/twitter.py index 97765aa..0a4ad86 100644 --- a/tests/scraper/twitter.py +++ b/tests/scraper/twitter.py @@ -3,12 +3,19 @@ import pytest from cisticola.base import Channel from cisticola.scraper import TwitterScraper +@pytest.mark.unarchived def test_scrape_twitter_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['twitter'])] controller.register_scraper(scraper = TwitterScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_twitter_channel_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_twitter_channel(controller, channel_kwargs): diff --git a/tests/scraper/vkontakte.py b/tests/scraper/vkontakte.py index 4209c30..12ff12c 100644 --- a/tests/scraper/vkontakte.py +++ b/tests/scraper/vkontakte.py @@ -3,12 +3,19 @@ import pytest from cisticola.base import Channel from cisticola.scraper import VkontakteScraper +@pytest.mark.unarchived def test_scrape_vkontakte_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['vkontakte'])] controller.register_scraper(scraper = VkontakteScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_vkontakte_channel_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_vkontakte_channel(controller, channel_kwargs): diff --git a/tests/scraper/youtube.py b/tests/scraper/youtube.py index 1750b08..79ba7c7 100644 --- a/tests/scraper/youtube.py +++ b/tests/scraper/youtube.py @@ -3,12 +3,19 @@ import pytest from cisticola.base import Channel from cisticola.scraper import YoutubeScraper +@pytest.mark.unarchived def test_scrape_youtube_channel_no_media(controller, channel_kwargs): channels = [Channel(**channel_kwargs['youtube'])] controller.register_scraper(scraper = YoutubeScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media +@pytest.mark.unarchived +def test_scrape_youtube_channel_unarchived_media(controller): + + controller.archive_unarchived_media() + @pytest.mark.media def test_scrape_youtube_channel(controller, channel_kwargs):