Merge pull request #26 from bellingcat/deferred-media-archiving

Implemented deferred media archiving for all scrapers
This commit is contained in:
Logan Williams
2022-04-02 14:15:35 +02:00
committed by GitHub
25 changed files with 369 additions and 171 deletions

106
Pipfile.lock generated
View File

@@ -26,19 +26,19 @@
},
"boto3": {
"hashes": [
"sha256:ef210f8e85cdb6d26a38ebad1cfe9cefdef2ab269207e5987653555375a7ef6b",
"sha256:f0af8f4ef5fe6353c794cd3cce627d469a618b58ace7ca75a63cfd719df615ce"
"sha256:35f68b60652bff50e7bc926238443cb578f29f120908bb945e5640e90c6dd53e",
"sha256:7f3f93ee97215862ccd1a216f37deb7d64055c71f826b821805904df7b84ee6a"
],
"index": "pypi",
"version": "==1.21.30"
"version": "==1.21.31"
},
"botocore": {
"hashes": [
"sha256:af4bdc51eeecbe9fdcdadbed9ad58c5c91380ef30f3560022bbc2ee1d78f0ad6",
"sha256:c622751093e3d0bf61343e66d6d06190ef30bf42b1557d5070ca84e9efa06d4b"
"sha256:3bb21e3ee5e4de3ed76bb99b4496a46e9b5c82e7b7fdb62702f11dda1b57b769",
"sha256:424fd94bef86a11f5340dc15eb50602dedec2ecc01c3a25c4fea23a2c8195500"
],
"markers": "python_version >= '3.6'",
"version": "==1.24.30"
"version": "==1.24.31"
},
"brotli": {
"hashes": [
@@ -195,11 +195,11 @@
},
"click": {
"hashes": [
"sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b",
"sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976"
"sha256:24e1a4a9ec5bf6299411369b208c1df2188d9eb8d916302fe6bf03faed227f1e",
"sha256:479707fe14d9ec9a0757618b7a100a0ae4c4e236fac5b7f80ca68028141a1a72"
],
"markers": "python_version >= '3.7'",
"version": "==8.1.1"
"version": "==8.1.2"
},
"cryptg": {
"hashes": [
@@ -872,6 +872,9 @@
"version": "==2022.3.2"
},
"requests": {
"extras": [
"socks"
],
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
@@ -925,45 +928,45 @@
},
"sqlalchemy": {
"hashes": [
"sha256:03b5dc8b6399a51187e220ab351dfa8f81c310ce59442d047b74cd91ac4e859a",
"sha256:0539328f70f0c2bbaa0137be7a0787ceea7eda29e41e3a9d575c52030d4251aa",
"sha256:0d19eed02bf1225889e6f91cd7c2f89618919cb283846e7ed8aae1b0fe5f0403",
"sha256:18a965490cc0f83f3e867078096e78e97d81bf370f45debae6027331904d3348",
"sha256:1a6ae212067856bf2236708cb434554851cbe8099027953e94241a1743afb2b1",
"sha256:1ee7c85f27b1ae59c3fe8fd715ed38e73ba8c885c90f74bcbae609c0fdb0ada3",
"sha256:3081f61a86d0c2b6928fa7f4666bdeba0b9c7cc19619255454a71bfa60b55978",
"sha256:34d2e697115cab4a66d1b8ea60050004ce62f0598c6cf146ee66a4025c7cb7fc",
"sha256:36a3535bf9dd5f42a42d2cb6305f992888bbcb5789c615d35e34368853ae46f5",
"sha256:4443237f1f87d460453f6b33683f25135f3fba9dffbf2a053caae15bf838cfc2",
"sha256:44f0301e246c4d35d84e70192410d01509aea03a99cb963451aa0b652b7529f5",
"sha256:4b7371495c91319bfaa010e257eb8d3cd2d3eae14b256412d3294247ea7f0d78",
"sha256:5a43870fc272cf6dabf6ce7ad297a08e7f31672ea9ccd217a305c5bce8eafa9f",
"sha256:5a5f798fc3f3ca9b5f1bf66b50a58439d558d5df132e12cfed22e2bc167c411c",
"sha256:5b2718cb9e2bcf0053a83a6d0c491476b7f3832c59d82b53ccc78bb869e4027f",
"sha256:5ee1ce25cc21db5553f607511fb41b85b7dc5eade15536c5c238f898996987de",
"sha256:84747d1cc4823285b8253a34513162a664d4989217461e111097446b98803bfc",
"sha256:89d51682716135e0d93584c3ca625c40805a014aaddb3961074a04895793d4d3",
"sha256:8b4cf1effee1ef6db3f6c5185d32c0e6518bcf06212861875779676a22f68370",
"sha256:9473d5dadee7a81d003cf18b1f5266b0fb29a9dff985bc205d71ca8da037e18b",
"sha256:a11d8e12ff761101aa44404ce2df15e32f061d5559b862a847976c2efff014a1",
"sha256:a720cef2bcd4e645ae1fa01a143a31c04e095f26ff925f6090cb1ef7f1859e5d",
"sha256:bda76918f8c6da01278a97365bc17fa97d902be7f6d7596aad2bd7d9b52adbba",
"sha256:c3bbcfbaa33d5998698ed84eba0807a58023c86d0fa540ff4da96637815a4d92",
"sha256:c6545f832267bfea780c8fc6235f7a1fc87778e3e2629ddddeb88a94f9181292",
"sha256:c7c667579800445f390b56c37ee383639465766ebc2041a2d1f1279cda01d4a0",
"sha256:c8f671575256dddf1c334b2052aff88ac1c913e5430300057f1b0f2f12495019",
"sha256:cd4954eff9e9b9a294f632e7bdb0a4c41e23c89609c6b2f742f1321740566477",
"sha256:d258091ba28615ae133bf6a9975a24b9ed0bcc47f48bb1f57fa13cbe2bd4033c",
"sha256:d29a8c0e8c2b77f2f548acdf948543a661f6414282598147e094bab091f37af6",
"sha256:dd51b09d540e757dd6377f02950a80c0dc63aec6188582afdc21bf0db66efa0a",
"sha256:e403e1cfe7789eddba83bc7677dd8ffdaab56fb2f14eb3b6f014037b09cd8096",
"sha256:ea76d3f258c7b221a536fea200b64fc1b9272b48de4d1695bef616b7e5269183",
"sha256:eab54f6ec81c12b6184ebeacccd89567cee8fc94b2f9fba23aec30ca25fc287c",
"sha256:f39fb329a53043c10814fee68e123f02addc6000ed96994aedf24afe6fc30e9b",
"sha256:f409a1a44e3da766445600714e2ca70ddf735898382c11c5c250eb88a7b8b0d1"
"sha256:045d6a26c262929af0b9cb25441aae675ac04db4ea8bd2446b355617cd6b6b7d",
"sha256:07f4dab2deb6d34618a2ccfff3971a85923ad7c3a9a45401818870fc51d3f0cc",
"sha256:08aaad905aba8940f27aeb9f1f851bf63f18ef97b0062ca41f64afc4b64e0e8c",
"sha256:27a42894a2751e438eaed12fc0dcfe741ff2f66c14760d081222c5adc5460064",
"sha256:2a3e4dc7c452ba3c0f3175ad5a8e0ba49c2b0570a8d07272cf50844c8d78e74f",
"sha256:345306707bb0e51e7cd6e7573adafbce018894ee5e3b9c31134545f704936db0",
"sha256:36f08d94670315ca04c8139bd80b3e02b9dd9cc66fc11bcb96fd10ad51a051ab",
"sha256:3ebb97ed96f4506e2f212e1fcf0ec07a103bb194938627660a5acb4d9feae49c",
"sha256:40b995d7aeeb6f88a1927ce6692c0f626b59d8effd3e1d597f125e141707b37c",
"sha256:4414ace6e3a5e39523e55a5d9f3b215699b2ead4ff91fca98f1b659b7ab2d92a",
"sha256:50107d8183da3fbe5715957aa3954cd9d82aed555c5b4d3fd37fac861af422fa",
"sha256:50174e173d03209c34e07e7b57cca48d0082ac2390edf927aafc706c111da11e",
"sha256:5e88912bf192e7b5739c446d2276e1cba74cfa6c1c93eea2b2534404f6be1dbd",
"sha256:621d3f6c0ba2407bb97e82b649be5ca7d5b6c201dcfb964ce13f517bf1cb6305",
"sha256:623bac2d6bdca3f3e61cf1e1c466c5fb9f5cf08735736ee1111187b7a4108891",
"sha256:671f61c3db4595b0e86cc4b30f675a7c0206d9ce99f041b4f6761c7ddd1e0074",
"sha256:67c1c27c48875afc950bee5ee24582794f20b545e64e4f9ca94071a9b514d6ed",
"sha256:6a6cfd468f54d65324fd3847cfd0148b0610efa6a43e5f5fcc89f455696ae9e7",
"sha256:70048a83f0a1ece1fcd7189891c888e20af2c57fbd33eb760d8cece9843b896c",
"sha256:7ee14a7f9f76d1ef9d5e5b760c9252617c839b87eee04d1ce8325ac66ae155c4",
"sha256:804cf491437f3e4ce31247ab4b309b181f06ecc97d309b746d10f09439b4eb85",
"sha256:878c7beaafa365602762c19f638282e1885454fed1aed86f8fae038933c7c671",
"sha256:954ea8c527c4322afb6885944904714893af81fe9167e421273770991bf08a4a",
"sha256:a47bf6b7ca6c28e4f4e262fabcf5be6b907af81be36de77839c9eeda2cdf3bb3",
"sha256:a4fb5c6ee84a6bba4ff6f9f5379f0b3a0ffe9de7ba5a0945659b3da8d519709b",
"sha256:b34bbc683789559f1bc9bb685fc162e0956dbbdfbe2fbd6755a9f5982c113610",
"sha256:c025d45318b73c0601cca451532556cbab532b2742839ebb8cb58f9ebf06811e",
"sha256:c3ad7f5b61ba014f5045912aea15b03c473bb02b1c07fd92c9d2c794fa183276",
"sha256:c9218e3519398129e364121e0d89823e6ba2a2b77c28bfc661face0829c41433",
"sha256:cd5cffd1dd753828f1069f33062f3896e51c990acd957c264f40e051b3e19887",
"sha256:d8efcaa709ea8e7c08c3d3e7639c39b36083f5a995f397f9e6eedf5f5e4e4946",
"sha256:e297a5cc625e3f1367a82deedf2d48ee4d2b2bd263b8b8d2efbaaf5608b5229e",
"sha256:e67278ceb63270cdac0a7b89fc3c29a56f7dac9616a7ee48e7ad6b52e3b631e5",
"sha256:eb6558ba07409dafa18c793c34292b3265be455904966f0724c10198829477e3",
"sha256:f197c66663ed0f9e1178d51141d864688fb244a83f6b17f667d521e482537b2e",
"sha256:f47996b1810894f766c9ee689607077c6c0e0fd6761e04c12ba13efb56d50c1d"
],
"index": "pypi",
"version": "==1.4.33"
"version": "==1.4.34"
},
"telethon": {
"hashes": [
@@ -1138,11 +1141,11 @@
},
"click": {
"hashes": [
"sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b",
"sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976"
"sha256:24e1a4a9ec5bf6299411369b208c1df2188d9eb8d916302fe6bf03faed227f1e",
"sha256:479707fe14d9ec9a0757618b7a100a0ae4c4e236fac5b7f80ca68028141a1a72"
],
"markers": "python_version >= '3.7'",
"version": "==8.1.1"
"version": "==8.1.2"
},
"coverage": {
"extras": [
@@ -1390,6 +1393,9 @@
"version": "==2022.1"
},
"requests": {
"extras": [
"socks"
],
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
@@ -1473,7 +1479,7 @@
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
"sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
],
"markers": "python_version >= '3.7'",
"markers": "python_full_version < '3.11.0'",
"version": "==2.0.1"
},
"typing-extensions": {

View File

@@ -235,6 +235,20 @@ class Scraper:
return archived_url
def archive_files(self, result: ScraperResult) -> ScraperResult:
"""Archive files corresponding to ``archived_url`` dict keys, if the
files have not previously been archived.
Parameters
----------
result: ScraperResult
Previously scraped ScraperResult run with ``archive_media=False``.
Returns
-------
ScraperResult
Same ScraperResult as ``result``, but with all URLs in ``archived_url`` dict archived.
"""
for url in result.archived_urls:
if result.archived_urls[url] is None:
media_blob, content_type, key = self.url_to_blob(url)
@@ -244,7 +258,6 @@ class Scraper:
result.media_archived = True
return result
def can_handle(self, channel: Channel) -> bool:
"""Whether or not the scraper can scrape the specified channel.
@@ -345,7 +358,23 @@ class ScraperController:
logger.error("No DB session")
return
session = self.session()
# If any channels are not already in the database, add them
for channel in channels:
platform_id = None
if channel.platform_id not in (None, ''):
platform_id = channel.platform_id
channel_in_db = session.query(Channel).filter_by(platform_id=platform_id, platform=channel.platform, url=channel.url).first()
if not channel_in_db:
logger.debug(f"{channel} does not exist in database, adding")
session.add(channel)
session.flush()
session.commit()
handled = False
for scraper in self.scrapers:
@@ -355,7 +384,6 @@ class ScraperController:
added = 0
# get most recent post
session = self.session()
rows = session.query(ScraperResult).where(
ScraperResult.channel == channel.id).order_by(
ScraperResult.date.desc()).limit(1).all()

View File

@@ -45,9 +45,12 @@ class BitchuteScraper(Scraper):
archived_urls = {}
if archive_media:
if 'video_url' in post:
url = post['video_url']
if 'video_url' in post:
url = post['video_url']
archived_urls[url] = None
if archive_media:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
@@ -114,6 +117,7 @@ class BitchuteScraper(Scraper):
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def strip_tags(html, convert_newlines=True):

View File

@@ -52,25 +52,24 @@ class GabScraper(Scraper):
if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
break
media_urls = []
archived_urls = {}
if archive_media:
for attachment in post.get('media_attachments'):
for attachment in post.get('media_attachments'):
if attachment.get('type') == 'video':
archived_urls[attachment['source_mp4']] = None
else:
archived_urls[attachment['url']] = None
if post.get('reblog') is not None:
for attachment in post['reblog'].get('media_attachments'):
if attachment.get('type') == 'video':
media_urls.append(attachment['source_mp4'])
archived_urls[attachment['source_mp4']] = None
else:
media_urls.append(attachment['url'])
if post.get('reblog') is not None:
for attachment in post['reblog'].get('media_attachments'):
if attachment.get('type') == 'video':
media_urls.append(attachment['source_mp4'])
else:
media_urls.append(attachment['url'])
archived_urls[attachment['url']] = None
for url in media_urls:
for url in archived_urls.keys():
if archive_media:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url

View File

@@ -32,26 +32,25 @@ class GettrScraper(Scraper):
archived_urls = {}
if archive_media:
if 'imgs' in post:
for img in post['imgs']:
url = "https://media.gettr.com/" + img
archived_urls[url] = None
if 'imgs' in post:
for img in post['imgs']:
url = "https://media.gettr.com/" + img
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[img] = archived_url
if 'main' in post:
url = "https://media.gettr.com/" + post['main']
archived_urls[url] = None
if 'main' in post:
url = "https://media.gettr.com/" + post['main']
if 'ovid' in post:
url = "https://media.gettr.com/" + post['ovid']
archived_urls[url] = None
for url in archived_urls.keys():
if archive_media:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[post['main']] = archived_url
if 'vid' in post:
url = "https://media.gettr.com/" + post['vid']
media_blob, content_type, key = self.m3u8_url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[post['vid']] = archived_url
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
@@ -74,7 +73,7 @@ class GettrScraper(Scraper):
return key
def get_profile(self, channel: Channel) -> RawChannelInfo:
client = client = PublicClient()
client = PublicClient()
username = self.get_username_from_url(channel.url)
profile = client.user_info(username)

View File

@@ -1,4 +1,4 @@
from typing import Generator
from typing import Generator, List
from datetime import datetime, timezone
import os
import json
@@ -50,28 +50,14 @@ class InstagramScraper(Scraper):
post_url = f'{BASE_URL}p/{post.shortcode}/'
archived_urls = {}
archived_urls = get_archived_urls_from_post(post = post)
if archive_media:
for url in archived_urls.keys():
with tempfile.TemporaryDirectory() as temp_dir:
loader.download_post(post = post, target = Path(temp_dir))
files = os.listdir(temp_dir)
files = [f for f in files if not f.endswith('.txt')]
for file in files:
ext = file.split('.')[-1]
content_type = CONTENT_TYPES[ext]
filename = Path(temp_dir, file)
key = f'{post.shortcode}__{file}'
with open(filename, 'rb') as f:
blob = f.read()
archived_url = self.archive_blob(blob = blob, content_type = content_type, key = key)
archived_urls[post_url] = archived_url
if archive_media:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
@@ -99,7 +85,7 @@ class InstagramScraper(Scraper):
date_archived=datetime.now(timezone.utc),
raw_posts=json.dumps(comment_dict, default=str),
archived_urls={},
media_archived=archive_media)
media_archived=True)
def can_handle(self, channel):
if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None:
@@ -127,7 +113,20 @@ class InstagramScraper(Scraper):
profile['followees'] = user_profile.followees
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
def get_archived_urls_from_post(post: instaloader.Post) -> dict:
    """Build the initial ``archived_urls`` mapping for an Instagram post.

    Parameters
    ----------
    post: instaloader.Post
        Post whose raw ``_node`` metadata is inspected for media URLs.

    Returns
    -------
    dict
        Mapping of every media URL found in the post to ``None``, i.e. a
        dict of URLs that have not been archived yet. (The previous
        ``List[str]`` annotation did not match the returned value.)

    Raises
    ------
    NotImplementedError
        If the post's ``__typename`` is not ``GraphImage``, ``GraphVideo``
        or ``GraphSidecar``.
    """
    node = post._node
    typename = node['__typename']

    if typename == 'GraphImage':
        urls = [node['display_url']]
    elif typename == 'GraphVideo':
        urls = [node['video_url']]
    elif typename == 'GraphSidecar':
        # NOTE(review): only ``display_url`` is collected for each sidecar
        # child; whether video children need a different key is not visible
        # from here -- confirm against the node schema.
        urls = [edge['node']['display_url']
                for edge in node['edge_sidecar_to_children']['edges']]
    else:
        raise NotImplementedError(f'post of type {typename} is currently not supported.')

    return {url: None for url in urls}

View File

@@ -37,10 +37,11 @@ class OdyseeScraper(Scraper):
if since is not None and datetime.fromtimestamp(video.info['created']) <= since.date:
break
archived_urls = {}
url = video.info['streaming_url']
archived_urls = {url: None}
if archive_media:
url = video.info['streaming_url']
# Check if file is a video file or an m3u8 file
r = requests.head(url)
@@ -78,6 +79,21 @@ class OdyseeScraper(Scraper):
archived_urls={},
media_archived=True)
def archive_files(self, result: ScraperResult) -> ScraperResult:
    """Archive every not-yet-archived URL of a previously scraped result.

    Parameters
    ----------
    result: ScraperResult
        Result whose ``archived_urls`` dict maps media URLs to their
        archived URL, or to ``None`` when the media is still unarchived.

    Returns
    -------
    ScraperResult
        The same ``result`` object, with each ``None`` entry of
        ``archived_urls`` replaced by its archived URL and
        ``media_archived`` set to ``True``.
    """
    for url in result.archived_urls:
        if result.archived_urls[url] is not None:
            # Already archived on an earlier run.
            continue

        # Check if file is a video file or an m3u8 file
        r = requests.head(url)

        # ``.get`` instead of indexing: a response without a Content-Type
        # header falls through to the plain download rather than raising
        # KeyError and aborting the remaining URLs.
        if r.headers.get('Content-Type') == 'text/html; charset=utf-8':
            media_blob, content_type, key = self.m3u8_url_to_blob(url)
        else:
            media_blob, content_type, key = self.url_to_blob(url)

        archived_url = self.archive_blob(media_blob, content_type, key)
        result.archived_urls[url] = archived_url

    result.media_archived = True
    return result
def can_handle(self, channel):
if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None:
return True
@@ -95,7 +111,7 @@ class OdyseeScraper(Scraper):
profile = odysee_channel.info
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -21,18 +21,18 @@ class RumbleScraper(Scraper):
scraper = get_channel_videos(channel.url)
for post in scraper:
if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
break
archived_urls = {}
url = post['media_url']
archived_urls = {url: None}
if archive_media:
url = post['media_url']
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[post['media_url']] = archived_url
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
@@ -50,6 +50,16 @@ class RumbleScraper(Scraper):
key = urlparse(url).path.split('/')[-2] + ext
return key
def archive_files(self, result: ScraperResult) -> ScraperResult:
    """Archive the media behind every unarchived URL of ``result``.

    Parameters
    ----------
    result: ScraperResult
        Result whose ``archived_urls`` dict maps media URLs to their
        archived URL, or to ``None`` when the media is still unarchived.

    Returns
    -------
    ScraperResult
        The same ``result`` object, with each ``None`` entry replaced by
        the URL returned from ``archive_blob`` and ``media_archived`` set
        to ``True``.
    """
    for media_url, stored_url in result.archived_urls.items():
        if stored_url is not None:
            continue

        # Media is fetched through the yt-dlp based helper.
        blob, mime_type, blob_key = self.ytdlp_url_to_blob(media_url)
        result.archived_urls[media_url] = self.archive_blob(blob, mime_type, blob_key)

    result.media_archived = True
    return result
def can_handle(self, channel):
if channel.platform == "Rumble" and channel.url is not None:
return True
@@ -59,10 +69,10 @@ class RumbleScraper(Scraper):
profile = get_channel_profile(url = channel.url)
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -34,8 +34,8 @@ class TelegramSnscrapeScraper(Scraper):
for image_url in post.images:
archived_urls[image_url] = None
if post.video:
archived_urls[post.video] = None
for video_url in post.videos:
archived_urls[video_url] = None
if archive_media:
for url in archived_urls:

View File

@@ -15,7 +15,7 @@ class TwitterScraper(Scraper):
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
if channel.platform_id:
identifier = channel.platform_id
identifier = int(channel.platform_id)
else:
identifier = channel.screenname
@@ -24,7 +24,7 @@ class TwitterScraper(Scraper):
first = True
for tweet in scraper.get_items():
if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
# with TwitterProfileScraper, the first tweet could be an old pinned tweet
if first:
first = False
@@ -106,7 +106,7 @@ class TwitterScraper(Scraper):
raise ChannelDoesNotExistError(channel.url)
else:
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(entity.__dict__, default=str),
date_archived=datetime.now(timezone.utc))
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(entity.__dict__, default=str),
date_archived=datetime.now(timezone.utc))

View File

@@ -1,8 +1,12 @@
from datetime import datetime, timezone
from typing import Generator
from urllib.parse import urlparse
import json
import re
from snscrape.modules.vkontakte import VKontakteUserScraper
from loguru import logger
from yt_dlp.extractor.vk import VKIE
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
@@ -25,7 +29,7 @@ class VkontakteScraper(Scraper):
first = True
for post in scraper.get_items():
if since is not None and datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
if since is not None and datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
# with VKontakteUserScraper, the first tweet could be an old pinned tweet
if first:
first = False
@@ -35,23 +39,26 @@ class VkontakteScraper(Scraper):
archived_urls = {}
if archive_media:
if post.photos:
if post.photos:
for photo in post.photos:
variant = max(
[v for v in photo.variants], key=lambda v: v.width * v.height)
url = variant.url
if url is not None:
archived_urls[url] = None
for photo in post.photos:
variant = max(
[v for v in photo.variants], key=lambda v: v.width * v.height)
url = variant.url
if url is not None:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
if post.video:
archived_urls[post.video.url] = None
if post.video:
url = post.video.url
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
for url in archived_urls.keys():
if archive_media:
if re.match(VKIE._VALID_URL, url):
# Uses regex from yt_dlp to verify VK video URL
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
else:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
@@ -66,6 +73,21 @@ class VkontakteScraper(Scraper):
archived_urls=archived_urls,
media_archived=archive_media)
def archive_files(self, result: ScraperResult) -> ScraperResult:
    """Archive the media behind every unarchived URL of ``result``.

    Parameters
    ----------
    result: ScraperResult
        Result whose ``archived_urls`` dict maps media URLs to their
        archived URL, or to ``None`` when the media is still unarchived.

    Returns
    -------
    ScraperResult
        The same ``result`` object, with each ``None`` entry replaced by
        the URL returned from ``archive_blob`` and ``media_archived`` set
        to ``True``.
    """
    for media_url, stored_url in result.archived_urls.items():
        if stored_url is not None:
            continue

        # Uses regex from yt_dlp to verify VK video URL
        is_vk_video = re.match(VKIE._VALID_URL, media_url)
        fetch_blob = self.ytdlp_url_to_blob if is_vk_video else self.url_to_blob

        blob, mime_type, blob_key = fetch_blob(media_url)
        result.archived_urls[media_url] = self.archive_blob(blob, mime_type, blob_key)

    result.media_archived = True
    return result
def can_handle(self, channel):
if channel.platform == "Vkontakte" and channel.platform_id:
return True
@@ -88,7 +110,7 @@ class VkontakteScraper(Scraper):
profile = scraper._get_entity().__dict__
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -2,6 +2,9 @@ from datetime import datetime, timezone
import json
from typing import Generator
import tempfile
from pathlib import Path
import os
import yt_dlp
from loguru import logger
@@ -48,7 +51,10 @@ class YoutubeScraper(Scraper):
for video in valid_videos:
archived_urls = {}
url = video['webpage_url']
archived_urls = {url: None}
video_id = video["id"]
video_ext = video["ext"]
@@ -56,11 +62,8 @@ class YoutubeScraper(Scraper):
key = f"{video_id}.{video_ext}"
with open(f"{temp_dir}/{key}", "rb") as f:
with open(Path(temp_dir)/key, "rb") as f:
media_blob = f.read()
archived_url = self.archive_blob(media_blob, content_type, key)
url = video['webpage_url']
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
@@ -80,6 +83,41 @@ class YoutubeScraper(Scraper):
if channel.platform == "Youtube" and channel.url:
return True
def archive_files(self, result: ScraperResult) -> ScraperResult:
    """Download and archive every unarchived video URL of ``result``.

    Parameters
    ----------
    result: ScraperResult
        Result whose ``archived_urls`` dict maps video page URLs to their
        archived URL, or to ``None`` when the video is still unarchived.

    Returns
    -------
    ScraperResult
        The same ``result`` object, with each successfully downloaded
        ``None`` entry replaced by its archived URL. ``media_archived`` is
        ``True`` only when no entry is left as ``None``.

    Raises
    ------
    yt_dlp.utils.DownloadError
        Propagated unchanged when yt-dlp fails to download a video.
    """
    for url in result.archived_urls:
        if result.archived_urls[url] is not None:
            # Already archived on an earlier run.
            continue

        with tempfile.TemporaryDirectory() as temp_dir:
            ydl_opts = {
                "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
                "merge_output_format": "mp4",
                "outtmpl": f"{temp_dir}/%(id)s.%(ext)s"}

            # ``download`` expects a list of URLs, so wrap the single URL.
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])

            # Sorted so the chosen file is deterministic if several exist.
            files = sorted(os.listdir(temp_dir))
            if len(files) != 1:
                logger.warning(f'{len(files)} files downloaded for video: {url}')
            if not files:
                # Nothing was written: leave the entry as ``None`` instead
                # of failing with IndexError on ``files[0]``.
                continue

            key = files[0]
            # Read inside the ``with`` block, before the directory is removed.
            with open(Path(temp_dir, key), 'rb') as f:
                media_blob = f.read()

        content_type = 'video/mp4'
        archived_url = self.archive_blob(media_blob, content_type, key)
        result.archived_urls[url] = archived_url

    # Only report the result as archived when no URL was left unarchived.
    result.media_archived = all(
        archived is not None for archived in result.archived_urls.values())
    return result
def get_profile(self, channel: Channel) -> RawChannelInfo:
ydl_opts = {}
ydl = yt_dlp.YoutubeDL(ydl_opts)
@@ -89,12 +127,13 @@ class YoutubeScraper(Scraper):
meta = ydl.extract_info(
channel.url,
process=False)
meta.pop('entries')
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(meta),
date_archived=datetime.now(timezone.utc))
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(meta),
date_archived=datetime.now(timezone.utc))
except yt_dlp.utils.DownloadError as e:
raise e

View File

@@ -12,10 +12,9 @@ addopts =
--html='reports/tests.html'
--self-contained-html
markers =
profile: marks tests for only extracting channel metadata (deselect with '-m
"not profile"')
media: marks tests for archiving all media attachments (deselect with '-m
"not media"')
profile: marks tests for only extracting channel metadata (deselect with '-m "not profile"')
media: marks tests for archiving all media attachments (deselect with '-m "not media"')
unarchived: marks tests for archiving all unarchived media attachments (deselect with '-m "not unarchived"')
filterwarnings =
ignore:the imp module is deprecated:DeprecationWarning
ignore:The localize method is no longer necessary, as this time zone supports the fold attribute

View File

@@ -99,12 +99,12 @@ RUMBLE_CHANNEL_KWARGS = {
'notes': ''}
TELEGRAM_CHANNEL_KWARGS = {
'name': 'USA Freedom Convoy (test)',
'platform_id': -1001799578085,
'name': 'South West Ohio Proud Boys (test)',
'platform_id': -1001276612436,
'category': 'test',
'platform': 'Telegram',
'url': 'https://t.me/usafreedomconvoy2022',
'screenname': 'usafreedomconvoy2022',
'url': 'https://t.me/SouthwestOhioPB',
'screenname': 'SouthwestOhioPB',
'country': 'US',
'influencer': None,
'public': True,

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import BitchuteScraper
@pytest.mark.unarchived
def test_scrape_bitchute_channel_no_media(controller, channel_kwargs):
    """Scrape the Bitchute test channel with ``archive_media=False``."""
    bitchute_channel = Channel(**channel_kwargs['bitchute'])
    controller.register_scraper(scraper=BitchuteScraper())
    controller.scrape_channels(channels=[bitchute_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_bitchute_channel_unarchived_media(controller):
    """Run the controller's deferred media archiving.

    Registers no scraper and scrapes nothing itself.
    NOTE(review): presumably relies on results stored by an earlier
    ``archive_media=False`` scrape -- confirm against the fixture scope.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_bitchute_channel(controller, channel_kwargs):

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import GabScraper
@pytest.mark.unarchived
def test_scrape_gab_channel_no_media(controller, channel_kwargs):
    """Scrape the Gab test channel with ``archive_media=False``."""
    gab_channel = Channel(**channel_kwargs['gab'])
    controller.register_scraper(scraper=GabScraper())
    controller.scrape_channels(channels=[gab_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_gab_channel_unarchived_media(controller):
    """Run the controller's deferred media archiving.

    Registers no scraper and scrapes nothing itself.
    NOTE(review): presumably relies on results stored by an earlier
    ``archive_media=False`` scrape -- confirm against the fixture scope.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_gab_channel(controller, channel_kwargs):

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import GettrScraper
@pytest.mark.unarchived
def test_scrape_gettr_channel_no_media(controller, channel_kwargs):
    """Scrape the Gettr test channel with ``archive_media=False``."""
    gettr_channel = Channel(**channel_kwargs['gettr'])
    controller.register_scraper(scraper=GettrScraper())
    controller.scrape_channels(channels=[gettr_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_gettr_channel_unarchived_media(controller):
    """Run the controller's deferred media archiving.

    Registers no scraper and scrapes nothing itself.
    NOTE(review): presumably relies on results stored by an earlier
    ``archive_media=False`` scrape -- confirm against the fixture scope.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_gettr_channel(controller, channel_kwargs):

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import InstagramScraper
@pytest.mark.unarchived
def test_scrape_instagram_channel_no_media(controller, channel_kwargs):
    """Scrape the Instagram test channel with ``archive_media=False``."""
    instagram_channel = Channel(**channel_kwargs['instagram'])
    controller.register_scraper(scraper=InstagramScraper())
    controller.scrape_channels(channels=[instagram_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_instagram_channel_unarchived_media(controller):
    """Run the controller's deferred media archiving.

    Registers no scraper and scrapes nothing itself.
    NOTE(review): presumably relies on results stored by an earlier
    ``archive_media=False`` scrape -- confirm against the fixture scope.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_instagram_channel(controller, channel_kwargs):

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import OdyseeScraper
@pytest.mark.unarchived
def test_scrape_odysee_channel_no_media(controller, channel_kwargs):
    """Scrape the Odysee test channel with ``archive_media=False``."""
    odysee_channel = Channel(**channel_kwargs['odysee'])
    controller.register_scraper(scraper=OdyseeScraper())
    controller.scrape_channels(channels=[odysee_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_odysee_channel_unarchived_media(controller):
    """Run the controller's deferred media archiving.

    Registers no scraper and scrapes nothing itself.
    NOTE(review): presumably relies on results stored by an earlier
    ``archive_media=False`` scrape -- confirm against the fixture scope.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_odysee_channel(controller, channel_kwargs):

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import RumbleScraper
@pytest.mark.unarchived
def test_scrape_rumble_channel_no_media(controller, channel_kwargs):
    """Scrape the Rumble test channel with ``archive_media=False``."""
    rumble_channel = Channel(**channel_kwargs['rumble'])
    controller.register_scraper(scraper=RumbleScraper())
    controller.scrape_channels(channels=[rumble_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_rumble_channel_unarchived_media(controller):
    """Run the controller's deferred media archiving.

    Registers no scraper and scrapes nothing itself.
    NOTE(review): presumably relies on results stored by an earlier
    ``archive_media=False`` scrape -- confirm against the fixture scope.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_rumble_channel(controller, channel_kwargs):

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import TelegramSnscrapeScraper
@pytest.mark.unarchived
def test_scrape_telegram_snscrape_channel_no_media(controller, channel_kwargs):
    """Scrape the Telegram test channel via snscrape with ``archive_media=False``."""
    telegram_channel = Channel(**channel_kwargs['telegram'])
    controller.register_scraper(scraper=TelegramSnscrapeScraper())
    controller.scrape_channels(channels=[telegram_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_telegram_snscrape_channel_unarchived_media(controller):
    """Run the controller's deferred media archiving.

    Registers no scraper and scrapes nothing itself.
    NOTE(review): presumably relies on results stored by an earlier
    ``archive_media=False`` scrape -- confirm against the fixture scope.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_telegram_snscrape_channel(controller, channel_kwargs):

View File

@@ -3,6 +3,7 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import TelegramTelethonScraper
@pytest.mark.unarchived
def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
controller.remove_all_scrapers()
@@ -10,6 +11,12 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
controller.register_scraper(scraper = TelegramTelethonScraper())
controller.scrape_channels(channels = channels, archive_media = False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_telegram_telethon_unarchived_media(controller):
    """Run the controller's deferred media archiving.

    Registers no scraper and scrapes nothing itself.
    NOTE(review): presumably relies on results stored by an earlier
    ``archive_media=False`` scrape -- confirm against the fixture scope.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_telegram_telethon_channel(controller, channel_kwargs):

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import TwitterScraper
@pytest.mark.unarchived
def test_scrape_twitter_channel_no_media(controller, channel_kwargs):
    """Scrape the Twitter test channel with ``archive_media=False``."""
    twitter_channel = Channel(**channel_kwargs['twitter'])
    controller.register_scraper(scraper=TwitterScraper())
    controller.scrape_channels(channels=[twitter_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_twitter_channel_unarchived_media(controller):
    """Run the controller's deferred media archiving.

    Registers no scraper and scrapes nothing itself.
    NOTE(review): presumably relies on results stored by an earlier
    ``archive_media=False`` scrape -- confirm against the fixture scope.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_twitter_channel(controller, channel_kwargs):

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import VkontakteScraper
@pytest.mark.unarchived
def test_scrape_vkontakte_channel_no_media(controller, channel_kwargs):
    """Scrape the Vkontakte test channel with ``archive_media=False``."""
    vkontakte_channel = Channel(**channel_kwargs['vkontakte'])
    controller.register_scraper(scraper=VkontakteScraper())
    controller.scrape_channels(channels=[vkontakte_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_vkontakte_channel_unarchived_media(controller):
    """Run the controller's deferred media archiving.

    Registers no scraper and scrapes nothing itself.
    NOTE(review): presumably relies on results stored by an earlier
    ``archive_media=False`` scrape -- confirm against the fixture scope.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_vkontakte_channel(controller, channel_kwargs):

View File

@@ -3,12 +3,19 @@ import pytest
from cisticola.base import Channel
from cisticola.scraper import YoutubeScraper
@pytest.mark.unarchived
def test_scrape_youtube_channel_no_media(controller, channel_kwargs):
    """Scrape the Youtube test channel with ``archive_media=False``."""
    youtube_channel = Channel(**channel_kwargs['youtube'])
    controller.register_scraper(scraper=YoutubeScraper())
    controller.scrape_channels(channels=[youtube_channel], archive_media=False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_youtube_channel_unarchived_media(controller):
    """Run the controller's deferred media archiving.

    Registers no scraper and scrapes nothing itself.
    NOTE(review): presumably relies on results stored by an earlier
    ``archive_media=False`` scrape -- confirm against the fixture scope.
    """
    controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_youtube_channel(controller, channel_kwargs):