mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
Merge pull request #26 from bellingcat/deferred-media-archiving
Implemented deferred media archiving for all scrapers
This commit is contained in:
106
Pipfile.lock
generated
106
Pipfile.lock
generated
@@ -26,19 +26,19 @@
|
||||
},
|
||||
"boto3": {
|
||||
"hashes": [
|
||||
"sha256:ef210f8e85cdb6d26a38ebad1cfe9cefdef2ab269207e5987653555375a7ef6b",
|
||||
"sha256:f0af8f4ef5fe6353c794cd3cce627d469a618b58ace7ca75a63cfd719df615ce"
|
||||
"sha256:35f68b60652bff50e7bc926238443cb578f29f120908bb945e5640e90c6dd53e",
|
||||
"sha256:7f3f93ee97215862ccd1a216f37deb7d64055c71f826b821805904df7b84ee6a"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.21.30"
|
||||
"version": "==1.21.31"
|
||||
},
|
||||
"botocore": {
|
||||
"hashes": [
|
||||
"sha256:af4bdc51eeecbe9fdcdadbed9ad58c5c91380ef30f3560022bbc2ee1d78f0ad6",
|
||||
"sha256:c622751093e3d0bf61343e66d6d06190ef30bf42b1557d5070ca84e9efa06d4b"
|
||||
"sha256:3bb21e3ee5e4de3ed76bb99b4496a46e9b5c82e7b7fdb62702f11dda1b57b769",
|
||||
"sha256:424fd94bef86a11f5340dc15eb50602dedec2ecc01c3a25c4fea23a2c8195500"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.24.30"
|
||||
"version": "==1.24.31"
|
||||
},
|
||||
"brotli": {
|
||||
"hashes": [
|
||||
@@ -195,11 +195,11 @@
|
||||
},
|
||||
"click": {
|
||||
"hashes": [
|
||||
"sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b",
|
||||
"sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976"
|
||||
"sha256:24e1a4a9ec5bf6299411369b208c1df2188d9eb8d916302fe6bf03faed227f1e",
|
||||
"sha256:479707fe14d9ec9a0757618b7a100a0ae4c4e236fac5b7f80ca68028141a1a72"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==8.1.1"
|
||||
"version": "==8.1.2"
|
||||
},
|
||||
"cryptg": {
|
||||
"hashes": [
|
||||
@@ -872,6 +872,9 @@
|
||||
"version": "==2022.3.2"
|
||||
},
|
||||
"requests": {
|
||||
"extras": [
|
||||
"socks"
|
||||
],
|
||||
"hashes": [
|
||||
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
|
||||
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
|
||||
@@ -925,45 +928,45 @@
|
||||
},
|
||||
"sqlalchemy": {
|
||||
"hashes": [
|
||||
"sha256:03b5dc8b6399a51187e220ab351dfa8f81c310ce59442d047b74cd91ac4e859a",
|
||||
"sha256:0539328f70f0c2bbaa0137be7a0787ceea7eda29e41e3a9d575c52030d4251aa",
|
||||
"sha256:0d19eed02bf1225889e6f91cd7c2f89618919cb283846e7ed8aae1b0fe5f0403",
|
||||
"sha256:18a965490cc0f83f3e867078096e78e97d81bf370f45debae6027331904d3348",
|
||||
"sha256:1a6ae212067856bf2236708cb434554851cbe8099027953e94241a1743afb2b1",
|
||||
"sha256:1ee7c85f27b1ae59c3fe8fd715ed38e73ba8c885c90f74bcbae609c0fdb0ada3",
|
||||
"sha256:3081f61a86d0c2b6928fa7f4666bdeba0b9c7cc19619255454a71bfa60b55978",
|
||||
"sha256:34d2e697115cab4a66d1b8ea60050004ce62f0598c6cf146ee66a4025c7cb7fc",
|
||||
"sha256:36a3535bf9dd5f42a42d2cb6305f992888bbcb5789c615d35e34368853ae46f5",
|
||||
"sha256:4443237f1f87d460453f6b33683f25135f3fba9dffbf2a053caae15bf838cfc2",
|
||||
"sha256:44f0301e246c4d35d84e70192410d01509aea03a99cb963451aa0b652b7529f5",
|
||||
"sha256:4b7371495c91319bfaa010e257eb8d3cd2d3eae14b256412d3294247ea7f0d78",
|
||||
"sha256:5a43870fc272cf6dabf6ce7ad297a08e7f31672ea9ccd217a305c5bce8eafa9f",
|
||||
"sha256:5a5f798fc3f3ca9b5f1bf66b50a58439d558d5df132e12cfed22e2bc167c411c",
|
||||
"sha256:5b2718cb9e2bcf0053a83a6d0c491476b7f3832c59d82b53ccc78bb869e4027f",
|
||||
"sha256:5ee1ce25cc21db5553f607511fb41b85b7dc5eade15536c5c238f898996987de",
|
||||
"sha256:84747d1cc4823285b8253a34513162a664d4989217461e111097446b98803bfc",
|
||||
"sha256:89d51682716135e0d93584c3ca625c40805a014aaddb3961074a04895793d4d3",
|
||||
"sha256:8b4cf1effee1ef6db3f6c5185d32c0e6518bcf06212861875779676a22f68370",
|
||||
"sha256:9473d5dadee7a81d003cf18b1f5266b0fb29a9dff985bc205d71ca8da037e18b",
|
||||
"sha256:a11d8e12ff761101aa44404ce2df15e32f061d5559b862a847976c2efff014a1",
|
||||
"sha256:a720cef2bcd4e645ae1fa01a143a31c04e095f26ff925f6090cb1ef7f1859e5d",
|
||||
"sha256:bda76918f8c6da01278a97365bc17fa97d902be7f6d7596aad2bd7d9b52adbba",
|
||||
"sha256:c3bbcfbaa33d5998698ed84eba0807a58023c86d0fa540ff4da96637815a4d92",
|
||||
"sha256:c6545f832267bfea780c8fc6235f7a1fc87778e3e2629ddddeb88a94f9181292",
|
||||
"sha256:c7c667579800445f390b56c37ee383639465766ebc2041a2d1f1279cda01d4a0",
|
||||
"sha256:c8f671575256dddf1c334b2052aff88ac1c913e5430300057f1b0f2f12495019",
|
||||
"sha256:cd4954eff9e9b9a294f632e7bdb0a4c41e23c89609c6b2f742f1321740566477",
|
||||
"sha256:d258091ba28615ae133bf6a9975a24b9ed0bcc47f48bb1f57fa13cbe2bd4033c",
|
||||
"sha256:d29a8c0e8c2b77f2f548acdf948543a661f6414282598147e094bab091f37af6",
|
||||
"sha256:dd51b09d540e757dd6377f02950a80c0dc63aec6188582afdc21bf0db66efa0a",
|
||||
"sha256:e403e1cfe7789eddba83bc7677dd8ffdaab56fb2f14eb3b6f014037b09cd8096",
|
||||
"sha256:ea76d3f258c7b221a536fea200b64fc1b9272b48de4d1695bef616b7e5269183",
|
||||
"sha256:eab54f6ec81c12b6184ebeacccd89567cee8fc94b2f9fba23aec30ca25fc287c",
|
||||
"sha256:f39fb329a53043c10814fee68e123f02addc6000ed96994aedf24afe6fc30e9b",
|
||||
"sha256:f409a1a44e3da766445600714e2ca70ddf735898382c11c5c250eb88a7b8b0d1"
|
||||
"sha256:045d6a26c262929af0b9cb25441aae675ac04db4ea8bd2446b355617cd6b6b7d",
|
||||
"sha256:07f4dab2deb6d34618a2ccfff3971a85923ad7c3a9a45401818870fc51d3f0cc",
|
||||
"sha256:08aaad905aba8940f27aeb9f1f851bf63f18ef97b0062ca41f64afc4b64e0e8c",
|
||||
"sha256:27a42894a2751e438eaed12fc0dcfe741ff2f66c14760d081222c5adc5460064",
|
||||
"sha256:2a3e4dc7c452ba3c0f3175ad5a8e0ba49c2b0570a8d07272cf50844c8d78e74f",
|
||||
"sha256:345306707bb0e51e7cd6e7573adafbce018894ee5e3b9c31134545f704936db0",
|
||||
"sha256:36f08d94670315ca04c8139bd80b3e02b9dd9cc66fc11bcb96fd10ad51a051ab",
|
||||
"sha256:3ebb97ed96f4506e2f212e1fcf0ec07a103bb194938627660a5acb4d9feae49c",
|
||||
"sha256:40b995d7aeeb6f88a1927ce6692c0f626b59d8effd3e1d597f125e141707b37c",
|
||||
"sha256:4414ace6e3a5e39523e55a5d9f3b215699b2ead4ff91fca98f1b659b7ab2d92a",
|
||||
"sha256:50107d8183da3fbe5715957aa3954cd9d82aed555c5b4d3fd37fac861af422fa",
|
||||
"sha256:50174e173d03209c34e07e7b57cca48d0082ac2390edf927aafc706c111da11e",
|
||||
"sha256:5e88912bf192e7b5739c446d2276e1cba74cfa6c1c93eea2b2534404f6be1dbd",
|
||||
"sha256:621d3f6c0ba2407bb97e82b649be5ca7d5b6c201dcfb964ce13f517bf1cb6305",
|
||||
"sha256:623bac2d6bdca3f3e61cf1e1c466c5fb9f5cf08735736ee1111187b7a4108891",
|
||||
"sha256:671f61c3db4595b0e86cc4b30f675a7c0206d9ce99f041b4f6761c7ddd1e0074",
|
||||
"sha256:67c1c27c48875afc950bee5ee24582794f20b545e64e4f9ca94071a9b514d6ed",
|
||||
"sha256:6a6cfd468f54d65324fd3847cfd0148b0610efa6a43e5f5fcc89f455696ae9e7",
|
||||
"sha256:70048a83f0a1ece1fcd7189891c888e20af2c57fbd33eb760d8cece9843b896c",
|
||||
"sha256:7ee14a7f9f76d1ef9d5e5b760c9252617c839b87eee04d1ce8325ac66ae155c4",
|
||||
"sha256:804cf491437f3e4ce31247ab4b309b181f06ecc97d309b746d10f09439b4eb85",
|
||||
"sha256:878c7beaafa365602762c19f638282e1885454fed1aed86f8fae038933c7c671",
|
||||
"sha256:954ea8c527c4322afb6885944904714893af81fe9167e421273770991bf08a4a",
|
||||
"sha256:a47bf6b7ca6c28e4f4e262fabcf5be6b907af81be36de77839c9eeda2cdf3bb3",
|
||||
"sha256:a4fb5c6ee84a6bba4ff6f9f5379f0b3a0ffe9de7ba5a0945659b3da8d519709b",
|
||||
"sha256:b34bbc683789559f1bc9bb685fc162e0956dbbdfbe2fbd6755a9f5982c113610",
|
||||
"sha256:c025d45318b73c0601cca451532556cbab532b2742839ebb8cb58f9ebf06811e",
|
||||
"sha256:c3ad7f5b61ba014f5045912aea15b03c473bb02b1c07fd92c9d2c794fa183276",
|
||||
"sha256:c9218e3519398129e364121e0d89823e6ba2a2b77c28bfc661face0829c41433",
|
||||
"sha256:cd5cffd1dd753828f1069f33062f3896e51c990acd957c264f40e051b3e19887",
|
||||
"sha256:d8efcaa709ea8e7c08c3d3e7639c39b36083f5a995f397f9e6eedf5f5e4e4946",
|
||||
"sha256:e297a5cc625e3f1367a82deedf2d48ee4d2b2bd263b8b8d2efbaaf5608b5229e",
|
||||
"sha256:e67278ceb63270cdac0a7b89fc3c29a56f7dac9616a7ee48e7ad6b52e3b631e5",
|
||||
"sha256:eb6558ba07409dafa18c793c34292b3265be455904966f0724c10198829477e3",
|
||||
"sha256:f197c66663ed0f9e1178d51141d864688fb244a83f6b17f667d521e482537b2e",
|
||||
"sha256:f47996b1810894f766c9ee689607077c6c0e0fd6761e04c12ba13efb56d50c1d"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.4.33"
|
||||
"version": "==1.4.34"
|
||||
},
|
||||
"telethon": {
|
||||
"hashes": [
|
||||
@@ -1138,11 +1141,11 @@
|
||||
},
|
||||
"click": {
|
||||
"hashes": [
|
||||
"sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b",
|
||||
"sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976"
|
||||
"sha256:24e1a4a9ec5bf6299411369b208c1df2188d9eb8d916302fe6bf03faed227f1e",
|
||||
"sha256:479707fe14d9ec9a0757618b7a100a0ae4c4e236fac5b7f80ca68028141a1a72"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==8.1.1"
|
||||
"version": "==8.1.2"
|
||||
},
|
||||
"coverage": {
|
||||
"extras": [
|
||||
@@ -1390,6 +1393,9 @@
|
||||
"version": "==2022.1"
|
||||
},
|
||||
"requests": {
|
||||
"extras": [
|
||||
"socks"
|
||||
],
|
||||
"hashes": [
|
||||
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
|
||||
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
|
||||
@@ -1473,7 +1479,7 @@
|
||||
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
|
||||
"sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"markers": "python_full_version < '3.11.0'",
|
||||
"version": "==2.0.1"
|
||||
},
|
||||
"typing-extensions": {
|
||||
|
||||
@@ -235,6 +235,20 @@ class Scraper:
|
||||
return archived_url
|
||||
|
||||
def archive_files(self, result: ScraperResult) -> ScraperResult:
|
||||
"""Archive files corresponding to ``archived_url`` dict keys, if the
|
||||
files have not previously been archived.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
result: ScraperResult
|
||||
Previously scraped ScraperResult run with ``archive_media=False``.
|
||||
|
||||
Returns
|
||||
-------
|
||||
ScraperResult
|
||||
Same ScraperResult as ``result``, but with all URLs in the ``archived_urls`` dict archived.
|
||||
"""
|
||||
|
||||
for url in result.archived_urls:
|
||||
if result.archived_urls[url] is None:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
@@ -244,7 +258,6 @@ class Scraper:
|
||||
result.media_archived = True
|
||||
return result
|
||||
|
||||
|
||||
def can_handle(self, channel: Channel) -> bool:
|
||||
"""Whether or not the scraper can scrape the specified channel.
|
||||
|
||||
@@ -345,7 +358,23 @@ class ScraperController:
|
||||
logger.error("No DB session")
|
||||
return
|
||||
|
||||
session = self.session()
|
||||
|
||||
# If any channels are not already in the database, add them
|
||||
for channel in channels:
|
||||
|
||||
platform_id = None
|
||||
if channel.platform_id not in (None, ''):
|
||||
platform_id = channel.platform_id
|
||||
|
||||
channel_in_db = session.query(Channel).filter_by(platform_id=platform_id, platform=channel.platform, url=channel.url).first()
|
||||
|
||||
if not channel_in_db:
|
||||
logger.debug(f"{channel} does not exist in database, adding")
|
||||
session.add(channel)
|
||||
session.flush()
|
||||
session.commit()
|
||||
|
||||
handled = False
|
||||
|
||||
for scraper in self.scrapers:
|
||||
@@ -355,7 +384,6 @@ class ScraperController:
|
||||
added = 0
|
||||
|
||||
# get most recent post
|
||||
session = self.session()
|
||||
rows = session.query(ScraperResult).where(
|
||||
ScraperResult.channel == channel.id).order_by(
|
||||
ScraperResult.date.desc()).limit(1).all()
|
||||
|
||||
@@ -45,9 +45,12 @@ class BitchuteScraper(Scraper):
|
||||
|
||||
archived_urls = {}
|
||||
|
||||
if archive_media:
|
||||
if 'video_url' in post:
|
||||
url = post['video_url']
|
||||
if 'video_url' in post:
|
||||
url = post['video_url']
|
||||
archived_urls[url] = None
|
||||
|
||||
if archive_media:
|
||||
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
@@ -114,6 +117,7 @@ class BitchuteScraper(Scraper):
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
def strip_tags(html, convert_newlines=True):
|
||||
|
||||
@@ -52,25 +52,24 @@ class GabScraper(Scraper):
|
||||
if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
|
||||
break
|
||||
|
||||
media_urls = []
|
||||
archived_urls = {}
|
||||
|
||||
if archive_media:
|
||||
|
||||
for attachment in post.get('media_attachments'):
|
||||
for attachment in post.get('media_attachments'):
|
||||
if attachment.get('type') == 'video':
|
||||
archived_urls[attachment['source_mp4']] = None
|
||||
else:
|
||||
archived_urls[attachment['url']] = None
|
||||
|
||||
if post.get('reblog') is not None:
|
||||
for attachment in post['reblog'].get('media_attachments'):
|
||||
if attachment.get('type') == 'video':
|
||||
media_urls.append(attachment['source_mp4'])
|
||||
archived_urls[attachment['source_mp4']] = None
|
||||
else:
|
||||
media_urls.append(attachment['url'])
|
||||
|
||||
if post.get('reblog') is not None:
|
||||
for attachment in post['reblog'].get('media_attachments'):
|
||||
if attachment.get('type') == 'video':
|
||||
media_urls.append(attachment['source_mp4'])
|
||||
else:
|
||||
media_urls.append(attachment['url'])
|
||||
archived_urls[attachment['url']] = None
|
||||
|
||||
for url in media_urls:
|
||||
for url in archived_urls.keys():
|
||||
|
||||
if archive_media:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
@@ -32,26 +32,25 @@ class GettrScraper(Scraper):
|
||||
|
||||
archived_urls = {}
|
||||
|
||||
if archive_media:
|
||||
if 'imgs' in post:
|
||||
for img in post['imgs']:
|
||||
url = "https://media.gettr.com/" + img
|
||||
archived_urls[url] = None
|
||||
|
||||
if 'imgs' in post:
|
||||
for img in post['imgs']:
|
||||
url = "https://media.gettr.com/" + img
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[img] = archived_url
|
||||
if 'main' in post:
|
||||
url = "https://media.gettr.com/" + post['main']
|
||||
archived_urls[url] = None
|
||||
|
||||
if 'main' in post:
|
||||
url = "https://media.gettr.com/" + post['main']
|
||||
if 'ovid' in post:
|
||||
url = "https://media.gettr.com/" + post['ovid']
|
||||
archived_urls[url] = None
|
||||
|
||||
for url in archived_urls.keys():
|
||||
|
||||
if archive_media:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[post['main']] = archived_url
|
||||
|
||||
if 'vid' in post:
|
||||
url = "https://media.gettr.com/" + post['vid']
|
||||
media_blob, content_type, key = self.m3u8_url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[post['vid']] = archived_url
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
@@ -74,7 +73,7 @@ class GettrScraper(Scraper):
|
||||
return key
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
client = client = PublicClient()
|
||||
client = PublicClient()
|
||||
username = self.get_username_from_url(channel.url)
|
||||
profile = client.user_info(username)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from typing import Generator
|
||||
from typing import Generator, List
|
||||
from datetime import datetime, timezone
|
||||
import os
|
||||
import json
|
||||
@@ -50,28 +50,14 @@ class InstagramScraper(Scraper):
|
||||
|
||||
post_url = f'{BASE_URL}p/{post.shortcode}/'
|
||||
|
||||
archived_urls = {}
|
||||
archived_urls = get_archived_urls_from_post(post = post)
|
||||
|
||||
if archive_media:
|
||||
for url in archived_urls.keys():
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
|
||||
loader.download_post(post = post, target = Path(temp_dir))
|
||||
|
||||
files = os.listdir(temp_dir)
|
||||
files = [f for f in files if not f.endswith('.txt')]
|
||||
|
||||
for file in files:
|
||||
ext = file.split('.')[-1]
|
||||
content_type = CONTENT_TYPES[ext]
|
||||
filename = Path(temp_dir, file)
|
||||
key = f'{post.shortcode}__{file}'
|
||||
|
||||
with open(filename, 'rb') as f:
|
||||
blob = f.read()
|
||||
|
||||
archived_url = self.archive_blob(blob = blob, content_type = content_type, key = key)
|
||||
archived_urls[post_url] = archived_url
|
||||
if archive_media:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
@@ -99,7 +85,7 @@ class InstagramScraper(Scraper):
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_posts=json.dumps(comment_dict, default=str),
|
||||
archived_urls={},
|
||||
media_archived=archive_media)
|
||||
media_archived=True)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None:
|
||||
@@ -127,7 +113,20 @@ class InstagramScraper(Scraper):
|
||||
profile['followees'] = user_profile.followees
|
||||
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
def get_archived_urls_from_post(post: instaloader.Post) -> dict:
    """Collect the media URLs of an Instagram post as an ``archived_urls`` dict.

    Parameters
    ----------
    post: instaloader.Post
        Post whose raw ``_node`` metadata is inspected.

    Returns
    -------
    dict
        Mapping of each media URL to ``None`` (i.e. "not archived yet").
        Insertion order follows the order of the media in the post.

    Raises
    ------
    NotImplementedError
        If the post's ``__typename`` is not ``GraphImage``, ``GraphVideo`` or
        ``GraphSidecar``.
    """
    node = post._node
    typename = node['__typename']

    if typename == 'GraphImage':
        urls = [node['display_url']]
    elif typename == 'GraphVideo':
        urls = [node['video_url']]
    elif typename == 'GraphSidecar':
        urls = []
        for edge in node['edge_sidecar_to_children']['edges']:
            child = edge['node']
            # For a video child ``display_url`` is only the still image, so
            # prefer the actual video when the metadata provides one.
            if child.get('is_video') and child.get('video_url'):
                urls.append(child['video_url'])
            else:
                urls.append(child['display_url'])
    else:
        raise NotImplementedError(f'post of type {typename} is currently not supported.')

    return dict.fromkeys(urls)
|
||||
@@ -37,10 +37,11 @@ class OdyseeScraper(Scraper):
|
||||
if since is not None and datetime.fromtimestamp(video.info['created']) <= since.date:
|
||||
break
|
||||
|
||||
archived_urls = {}
|
||||
url = video.info['streaming_url']
|
||||
|
||||
archived_urls = {url: None}
|
||||
|
||||
if archive_media:
|
||||
url = video.info['streaming_url']
|
||||
|
||||
# Check if file is a video file or an m3u8 file
|
||||
r = requests.head(url)
|
||||
@@ -78,6 +79,21 @@ class OdyseeScraper(Scraper):
|
||||
archived_urls={},
|
||||
media_archived=True)
|
||||
|
||||
def archive_files(self, result: ScraperResult) -> ScraperResult:
    """Archive every URL of ``result.archived_urls`` that still maps to ``None``.

    Parameters
    ----------
    result: ScraperResult
        Result whose ``archived_urls`` dict may contain unarchived (``None``)
        entries.

    Returns
    -------
    ScraperResult
        The same object, with the pending entries filled in and
        ``media_archived`` set to ``True``.
    """
    for media_url in result.archived_urls:
        if result.archived_urls[media_url] is not None:
            continue

        # The HEAD response's content type decides between the m3u8
        # downloader and the plain file downloader.
        head = requests.head(media_url)
        is_playlist = head.headers['Content-Type'] == 'text/html; charset=utf-8'
        if is_playlist:
            fetched = self.m3u8_url_to_blob(media_url)
        else:
            fetched = self.url_to_blob(media_url)

        media_blob, content_type, key = fetched
        result.archived_urls[media_url] = self.archive_blob(media_blob, content_type, key)

    result.media_archived = True
    return result
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
@@ -95,7 +111,7 @@ class OdyseeScraper(Scraper):
|
||||
profile = odysee_channel.info
|
||||
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
@@ -21,18 +21,18 @@ class RumbleScraper(Scraper):
|
||||
scraper = get_channel_videos(channel.url)
|
||||
|
||||
for post in scraper:
|
||||
if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
|
||||
if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
|
||||
break
|
||||
|
||||
archived_urls = {}
|
||||
url = post['media_url']
|
||||
|
||||
archived_urls = {url: None}
|
||||
|
||||
if archive_media:
|
||||
|
||||
url = post['media_url']
|
||||
|
||||
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[post['media_url']] = archived_url
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
@@ -50,6 +50,16 @@ class RumbleScraper(Scraper):
|
||||
key = urlparse(url).path.split('/')[-2] + ext
|
||||
return key
|
||||
|
||||
def archive_files(self, result: ScraperResult) -> ScraperResult:
    """Archive every URL of ``result.archived_urls`` that still maps to ``None``.

    Each pending URL is downloaded with ``ytdlp_url_to_blob`` and stored with
    ``archive_blob``.

    Parameters
    ----------
    result: ScraperResult
        Result whose ``archived_urls`` dict may contain unarchived (``None``)
        entries.

    Returns
    -------
    ScraperResult
        The same object, with the pending entries filled in and
        ``media_archived`` set to ``True``.
    """
    for media_url in result.archived_urls:
        if result.archived_urls[media_url] is not None:
            continue

        media_blob, content_type, key = self.ytdlp_url_to_blob(media_url)
        result.archived_urls[media_url] = self.archive_blob(media_blob, content_type, key)

    result.media_archived = True
    return result
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Rumble" and channel.url is not None:
|
||||
return True
|
||||
@@ -59,10 +69,10 @@ class RumbleScraper(Scraper):
|
||||
profile = get_channel_profile(url = channel.url)
|
||||
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
|
||||
@@ -34,8 +34,8 @@ class TelegramSnscrapeScraper(Scraper):
|
||||
for image_url in post.images:
|
||||
archived_urls[image_url] = None
|
||||
|
||||
if post.video:
|
||||
archived_urls[post.video] = None
|
||||
for video_url in post.videos:
|
||||
archived_urls[video_url] = None
|
||||
|
||||
if archive_media:
|
||||
for url in archived_urls:
|
||||
|
||||
@@ -15,7 +15,7 @@ class TwitterScraper(Scraper):
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
if channel.platform_id:
|
||||
identifier = channel.platform_id
|
||||
identifier = int(channel.platform_id)
|
||||
else:
|
||||
identifier = channel.screenname
|
||||
|
||||
@@ -24,7 +24,7 @@ class TwitterScraper(Scraper):
|
||||
first = True
|
||||
|
||||
for tweet in scraper.get_items():
|
||||
if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
|
||||
if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
|
||||
# with TwitterProfileScraper, the first tweet could be an old pinned tweet
|
||||
if first:
|
||||
first = False
|
||||
@@ -106,7 +106,7 @@ class TwitterScraper(Scraper):
|
||||
raise ChannelDoesNotExistError(channel.url)
|
||||
else:
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(entity.__dict__, default=str),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(entity.__dict__, default=str),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
from datetime import datetime, timezone
|
||||
from typing import Generator
|
||||
from urllib.parse import urlparse
|
||||
import json
|
||||
import re
|
||||
|
||||
from snscrape.modules.vkontakte import VKontakteUserScraper
|
||||
from loguru import logger
|
||||
from yt_dlp.extractor.vk import VKIE
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
@@ -25,7 +29,7 @@ class VkontakteScraper(Scraper):
|
||||
first = True
|
||||
|
||||
for post in scraper.get_items():
|
||||
if since is not None and datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
|
||||
if since is not None and datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
|
||||
# with VKontakteUserScraper, the first post could be an old pinned post
|
||||
if first:
|
||||
first = False
|
||||
@@ -35,23 +39,26 @@ class VkontakteScraper(Scraper):
|
||||
|
||||
archived_urls = {}
|
||||
|
||||
if archive_media:
|
||||
if post.photos:
|
||||
|
||||
if post.photos:
|
||||
for photo in post.photos:
|
||||
variant = max(
|
||||
[v for v in photo.variants], key=lambda v: v.width * v.height)
|
||||
url = variant.url
|
||||
if url is not None:
|
||||
archived_urls[url] = None
|
||||
|
||||
for photo in post.photos:
|
||||
variant = max(
|
||||
[v for v in photo.variants], key=lambda v: v.width * v.height)
|
||||
url = variant.url
|
||||
|
||||
if url is not None:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
if post.video:
|
||||
archived_urls[post.video.url] = None
|
||||
|
||||
if post.video:
|
||||
url = post.video.url
|
||||
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
|
||||
for url in archived_urls.keys():
|
||||
|
||||
if archive_media:
|
||||
if re.match(VKIE._VALID_URL, url):
|
||||
# Uses regex from yt_dlp to verify VK video URL
|
||||
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
|
||||
else:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
@@ -66,6 +73,21 @@ class VkontakteScraper(Scraper):
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
def archive_files(self, result: ScraperResult) -> ScraperResult:
    """Archive every URL of ``result.archived_urls`` that still maps to ``None``.

    Parameters
    ----------
    result: ScraperResult
        Result whose ``archived_urls`` dict may contain unarchived (``None``)
        entries.

    Returns
    -------
    ScraperResult
        The same object, with the pending entries filled in and
        ``media_archived`` set to ``True``.
    """
    for media_url in result.archived_urls:
        if result.archived_urls[media_url] is not None:
            continue

        # Uses regex from yt_dlp to verify VK video URL
        if re.match(VKIE._VALID_URL, media_url):
            fetched = self.ytdlp_url_to_blob(media_url)
        else:
            fetched = self.url_to_blob(media_url)

        media_blob, content_type, key = fetched
        result.archived_urls[media_url] = self.archive_blob(media_blob, content_type, key)

    result.media_archived = True
    return result
|
||||
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Vkontakte" and channel.platform_id:
|
||||
return True
|
||||
@@ -88,7 +110,7 @@ class VkontakteScraper(Scraper):
|
||||
profile = scraper._get_entity().__dict__
|
||||
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -2,6 +2,9 @@ from datetime import datetime, timezone
|
||||
import json
|
||||
from typing import Generator
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
import yt_dlp
|
||||
from loguru import logger
|
||||
|
||||
@@ -48,7 +51,10 @@ class YoutubeScraper(Scraper):
|
||||
|
||||
for video in valid_videos:
|
||||
|
||||
archived_urls = {}
|
||||
url = video['webpage_url']
|
||||
|
||||
archived_urls = {url: None}
|
||||
|
||||
video_id = video["id"]
|
||||
video_ext = video["ext"]
|
||||
|
||||
@@ -56,11 +62,8 @@ class YoutubeScraper(Scraper):
|
||||
|
||||
key = f"{video_id}.{video_ext}"
|
||||
|
||||
with open(f"{temp_dir}/{key}", "rb") as f:
|
||||
with open(Path(temp_dir)/key, "rb") as f:
|
||||
media_blob = f.read()
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
|
||||
url = video['webpage_url']
|
||||
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
@@ -80,6 +83,41 @@ class YoutubeScraper(Scraper):
|
||||
if channel.platform == "Youtube" and channel.url:
|
||||
return True
|
||||
|
||||
def archive_files(self, result: ScraperResult) -> ScraperResult:
    """Archive every URL of ``result.archived_urls`` that still maps to ``None``.

    Each pending URL is downloaded with yt-dlp into a temporary directory and
    the resulting file is stored with ``archive_blob``.

    Parameters
    ----------
    result: ScraperResult
        Result whose ``archived_urls`` dict may contain unarchived (``None``)
        entries.

    Returns
    -------
    ScraperResult
        The same object with ``media_archived`` set to ``True``. A URL for
        which yt-dlp produced no file is logged and left as ``None``.

    Raises
    ------
    yt_dlp.utils.DownloadError
        Propagated unchanged when yt-dlp fails to download a URL.
    """
    for url in result.archived_urls:
        if result.archived_urls[url] is not None:
            continue

        with tempfile.TemporaryDirectory() as temp_dir:
            ydl_opts = {
                "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
                "merge_output_format": "mp4",
                "outtmpl": f"{temp_dir}/%(id)s.%(ext)s"}

            # ``download`` is documented to take a list of URLs.
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])

            # Sorted so the file picked is deterministic if several exist.
            files = sorted(os.listdir(temp_dir))
            if len(files) != 1:
                logger.warning(f'{len(files)} files downloaded for video: {url}')
            if not files:
                # Nothing to archive; avoid an IndexError on ``files[0]``.
                continue

            key = files[0]
            with open(Path(temp_dir, key), 'rb') as f:
                media_blob = f.read()

        content_type = 'video/mp4'
        archived_url = self.archive_blob(media_blob, content_type, key)
        result.archived_urls[url] = archived_url

    result.media_archived = True
    return result
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
ydl_opts = {}
|
||||
ydl = yt_dlp.YoutubeDL(ydl_opts)
|
||||
@@ -89,12 +127,13 @@ class YoutubeScraper(Scraper):
|
||||
meta = ydl.extract_info(
|
||||
channel.url,
|
||||
process=False)
|
||||
meta.pop('entries')
|
||||
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(meta),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(meta),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
except yt_dlp.utils.DownloadError as e:
|
||||
raise e
|
||||
|
||||
@@ -12,10 +12,9 @@ addopts =
|
||||
--html='reports/tests.html'
|
||||
--self-contained-html
|
||||
markers =
|
||||
profile: marks tests for only extracting channel metadata (deselect with '-m
|
||||
"not profile"')
|
||||
media: marks tests for archiving all media attachments (deselect with '-m
|
||||
"not media"')
|
||||
profile: marks tests for only extracting channel metadata (deselect with '-m "not profile"')
|
||||
media: marks tests for archiving all media attachments (deselect with '-m "not media"')
|
||||
unarchived: marks tests for archiving all unarchived media attachments (deselect with '-m "not unarchived"')
|
||||
filterwarnings =
|
||||
ignore:the imp module is deprecated:DeprecationWarning
|
||||
ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
|
||||
|
||||
@@ -99,12 +99,12 @@ RUMBLE_CHANNEL_KWARGS = {
|
||||
'notes': ''}
|
||||
|
||||
TELEGRAM_CHANNEL_KWARGS = {
|
||||
'name': 'USA Freedom Convoy (test)',
|
||||
'platform_id': -1001799578085,
|
||||
'name': 'South West Ohio Proud Boys (test)',
|
||||
'platform_id': -1001276612436,
|
||||
'category': 'test',
|
||||
'platform': 'Telegram',
|
||||
'url': 'https://t.me/usafreedomconvoy2022',
|
||||
'screenname': 'usafreedomconvoy2022',
|
||||
'url': 'https://t.me/SouthwestOhioPB',
|
||||
'screenname': 'SouthwestOhioPB',
|
||||
'country': 'US',
|
||||
'influencer': None,
|
||||
'public': True,
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import BitchuteScraper
|
||||
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_bitchute_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['bitchute'])]
|
||||
controller.register_scraper(scraper = BitchuteScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = False)
|
||||
|
||||
@pytest.mark.media
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_bitchute_channel_unarchived_media(controller):
|
||||
|
||||
controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_bitchute_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import GabScraper
|
||||
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_gab_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['gab'])]
|
||||
controller.register_scraper(scraper = GabScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = False)
|
||||
|
||||
@pytest.mark.media
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_gab_channel_unarchived_media(controller):
|
||||
|
||||
controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_gab_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import GettrScraper
|
||||
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_gettr_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['gettr'])]
|
||||
controller.register_scraper(scraper = GettrScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = False)
|
||||
|
||||
@pytest.mark.media
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_gettr_channel_unarchived_media(controller):
|
||||
|
||||
controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_gettr_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import InstagramScraper
|
||||
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_instagram_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['instagram'])]
|
||||
controller.register_scraper(scraper = InstagramScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = False)
|
||||
|
||||
@pytest.mark.media
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_instagram_channel_unarchived_media(controller):
|
||||
|
||||
controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_instagram_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import OdyseeScraper
|
||||
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_odysee_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['odysee'])]
|
||||
controller.register_scraper(scraper = OdyseeScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = False)
|
||||
|
||||
@pytest.mark.media
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_odysee_channel_unarchived_media(controller):
|
||||
|
||||
controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_odysee_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import RumbleScraper
|
||||
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_rumble_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['rumble'])]
|
||||
controller.register_scraper(scraper = RumbleScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = False)
|
||||
|
||||
@pytest.mark.media
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_rumble_channel_unarchived_media(controller):
|
||||
|
||||
controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_rumble_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import TelegramSnscrapeScraper
|
||||
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_telegram_snscrape_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['telegram'])]
|
||||
controller.register_scraper(scraper = TelegramSnscrapeScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = False)
|
||||
|
||||
@pytest.mark.media
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_telegram_snscrape_channel_unarchived_media(controller):
|
||||
|
||||
controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_telegram_snscrape_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import TelegramTelethonScraper
|
||||
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
|
||||
controller.remove_all_scrapers()
|
||||
|
||||
@@ -10,6 +11,12 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
|
||||
controller.register_scraper(scraper = TelegramTelethonScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = False)
|
||||
|
||||
@pytest.mark.media
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_telegram_telethon_unarchived_media(controller):
|
||||
|
||||
controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import TwitterScraper
|
||||
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_twitter_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['twitter'])]
|
||||
controller.register_scraper(scraper = TwitterScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = False)
|
||||
|
||||
@pytest.mark.media
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_twitter_channel_unarchived_media(controller):
|
||||
|
||||
controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_twitter_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import VkontakteScraper
|
||||
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_vkontakte_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['vkontakte'])]
|
||||
controller.register_scraper(scraper = VkontakteScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = False)
|
||||
|
||||
@pytest.mark.media
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_vkontakte_channel_unarchived_media(controller):
|
||||
|
||||
controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_vkontakte_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
@@ -3,12 +3,19 @@ import pytest
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import YoutubeScraper
|
||||
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_youtube_channel_no_media(controller, channel_kwargs):
|
||||
|
||||
channels = [Channel(**channel_kwargs['youtube'])]
|
||||
controller.register_scraper(scraper = YoutubeScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = False)
|
||||
|
||||
@pytest.mark.media
|
||||
@pytest.mark.unarchived
|
||||
def test_scrape_youtube_channel_unarchived_media(controller):
|
||||
|
||||
controller.archive_unarchived_media()
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_youtube_channel(controller, channel_kwargs):
|
||||
|
||||
|
||||
Reference in New Issue
Block a user