Renamed the `archive_media` method to `archive_blob` and the `media` parameter to `archive_media` to avoid a name collision; changed the scope of the `controller` test fixture to 'function' so that the DB is fresh for each executed test

This commit is contained in:
Tristan Lee
2022-03-09 13:19:35 -06:00
parent 739e1d8484
commit 6cf3b8842d
20 changed files with 130 additions and 125 deletions

View File

@@ -69,7 +69,7 @@ class Scraper:
return blob, content_type, key
def archive_media(self, blob: bytes, content_type: str, key: str) -> str:
def archive_blob(self, blob: bytes, content_type: str, key: str) -> str:
filename = self.__version__.replace(' ', '_') + '/' + key
@@ -83,7 +83,7 @@ class Scraper:
def can_handle(self, channel: Channel) -> bool:
raise NotImplementedError
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
raise NotImplementedError
@@ -103,7 +103,7 @@ class ScraperController:
self.scrapers.extend(scraper)
@logger.catch
def scrape_channels(self, channels: List[Channel], media: bool = True):
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
if self.session is None:
logger.error("No DB session")
return
@@ -128,7 +128,7 @@ class ScraperController:
else:
since = None
posts = scraper.get_posts(channel, since=since, media=media)
posts = scraper.get_posts(channel, since=since, archive_media=archive_media)
for post in posts:
session.add(post)

View File

@@ -1,4 +1,4 @@
from datetime import datetime
from datetime import datetime, timezone
import time
import re
from html.parser import HTMLParser
@@ -22,7 +22,7 @@ class BitchuteScraper(Scraper):
return username
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
session = requests.Session()
session.headers.update(self.headers)
@@ -43,11 +43,11 @@ class BitchuteScraper(Scraper):
archived_urls = {}
if media:
if archive_media:
if 'video_url' in post:
url = post['video_url']
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
@@ -56,7 +56,7 @@ class BitchuteScraper(Scraper):
channel=channel.id,
platform_id=post['id'],
date=datetime.fromtimestamp(post['timestamp']),
date_archived=datetime.now(),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
archived_urls=archived_urls)

View File

@@ -1,4 +1,4 @@
from datetime import datetime
from datetime import datetime, timezone
import json
from typing import Generator
@@ -16,7 +16,7 @@ class GabScraper(Scraper):
return username
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
client = Garc(profile = 'main')
username = GabScraper.get_username_from_url(channel.url)
@@ -29,7 +29,7 @@ class GabScraper(Scraper):
media_urls = []
archived_urls = {}
if media:
if archive_media:
media_urls.extend([p['url'] for p in post['media_attachments']])
@@ -38,7 +38,7 @@ class GabScraper(Scraper):
for url in media_urls:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
@@ -46,8 +46,8 @@ class GabScraper(Scraper):
platform="Gab",
channel=channel.id,
platform_id=post['id'],
date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo = None),
date_archived=datetime.now(),
date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
archived_urls=archived_urls)

View File

@@ -1,6 +1,6 @@
from datetime import datetime
from datetime import datetime, timezone
import json
from typing import Generator, Tuple
from typing import Generator
from urllib.parse import urlparse
from gogettr import PublicClient
@@ -19,7 +19,7 @@ class GettrScraper(Scraper):
return username
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
client = PublicClient()
username = GettrScraper.get_username_from_url(channel.url)
scraper = client.user_activity(username=username, type="posts")
@@ -30,25 +30,25 @@ class GettrScraper(Scraper):
archived_urls = {}
if media:
if archive_media:
if 'imgs' in post:
for img in post['imgs']:
url = "https://media.gettr.com/" + img
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[img] = archived_url
if 'main' in post:
url = "https://media.gettr.com/" + post['main']
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[post['main']] = archived_url
if 'vid' in post:
url = "https://media.gettr.com/" + post['vid']
media_blob, content_type, key = self.m3u8_url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[post['vid']] = archived_url
yield ScraperResult(
@@ -57,7 +57,7 @@ class GettrScraper(Scraper):
channel=channel.id,
platform_id=post['_id'],
date=datetime.fromtimestamp(post['cdate']/1000.),
date_archived=datetime.now(),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
archived_urls=archived_urls)

View File

@@ -1,4 +1,4 @@
from datetime import datetime
from datetime import datetime, timezone
import json
from typing import Generator
from urllib.parse import urlparse
@@ -19,7 +19,7 @@ class OdyseeScraper(Scraper):
return username
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = OdyseeScraper.get_username_from_url(channel.url)
odysee_channel = OdyseeChannel(channel_name = username)
@@ -32,7 +32,7 @@ class OdyseeScraper(Scraper):
archived_urls = {}
if media:
if archive_media:
url = video.info['streaming_url']
# Check if file is a video file or an m3u8 file
@@ -42,7 +42,7 @@ class OdyseeScraper(Scraper):
else:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
all_comments = video.get_all_comments()
@@ -53,7 +53,7 @@ class OdyseeScraper(Scraper):
channel=channel.id,
platform_id=video.info['claim_id'],
date=datetime.fromtimestamp(video.info['created']),
date_archived=datetime.now(),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(video.info),
archived_urls=archived_urls)

View File

@@ -1,4 +1,4 @@
from datetime import datetime
from datetime import datetime, timezone
import json
from typing import Generator, Tuple
import tempfile
@@ -22,7 +22,7 @@ class RumbleScraper(Scraper):
return username
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = RumbleScraper.get_username_from_url(channel.url)
scraper = get_channel_videos(username)
@@ -33,12 +33,12 @@ class RumbleScraper(Scraper):
archived_urls = {}
if media:
if archive_media:
url = post['media_url']
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[post['media_url']] = archived_url
yield ScraperResult(
@@ -46,8 +46,8 @@ class RumbleScraper(Scraper):
platform="Rumble",
channel=channel.id,
platform_id=post['media_url'].split('/')[-2],
date=datetime.fromisoformat(post['datetime']).replace(tzinfo=None),
date_archived=datetime.now(),
date=datetime.fromisoformat(post['datetime']).replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
archived_urls=archived_urls)

View File

@@ -14,7 +14,7 @@ class TelegramSnscrapeScraper(Scraper):
if channel.platform == "Telegram" and channel.public and not channel.chat:
return True
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
scr = snscrape.modules.telegram.TelegramChannelScraper(
channel.screenname)
@@ -29,18 +29,18 @@ class TelegramSnscrapeScraper(Scraper):
archived_urls = {}
if media:
if archive_media:
for image_url in post.images:
logger.debug(f'Archiving image: {image_url}')
media_blob, content_type, key = self.url_to_blob(image_url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[image_url] = archived_url
if post.video:
logger.debug(f'Archiving video: {post.video}')
media_blob, content_type, key = self.url_to_blob(post.video)
archived_url = self.archive_media(media_blob, content_type, key)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[post.video] = archived_url
yield ScraperResult(

View File

@@ -26,7 +26,7 @@ class TelegramTelethonScraper(Scraper):
if channel.platform == "Telegram" and channel.public and not channel.chat:
return True
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = self.get_username_from_url(channel.url)
@@ -47,7 +47,7 @@ class TelegramTelethonScraper(Scraper):
archived_urls = {}
if media:
if archive_media:
if post.media is not None:
with tempfile.TemporaryDirectory() as temp_dir:
@@ -61,7 +61,7 @@ class TelegramTelethonScraper(Scraper):
blob = f.read()
# TODO specify Content-Type
archived_url = self.archive_media(blob = blob, content_type = '', key = output_file_with_ext)
archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext)
archived_urls[post_url] = archived_url
yield ScraperResult(

View File

@@ -12,7 +12,7 @@ class TwitterScraper(Scraper):
"""An implementation of a Scraper for Twitter, using snscrape library"""
__version__ = "TwitterScraper 0.0.1"
def get_posts(self, channel: Channel, since: ScraperResult = None, media: bool = True) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
scraper = TwitterProfileScraper(channel.platform_id)
first = True
@@ -28,24 +28,26 @@ class TwitterScraper(Scraper):
archived_urls = {}
if tweet.media:
for media in tweet.media:
if type(media) == Video:
variant = max(
[v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
url = variant.url
elif type(media) == Gif:
url = media.variants[0].url
elif type(media) == Photo:
url = media.fullUrl
else:
logger.warning(f"Could not get media URL of {media}")
url = None
if archive_media:
if url is not None:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[url] = archived_url
if tweet.media:
for media in tweet.media:
if type(media) == Video:
variant = max(
[v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
url = variant.url
elif type(media) == Gif:
url = media.variants[0].url
elif type(media) == Photo:
url = media.fullUrl
else:
logger.warning(f"Could not get media URL of {media}")
url = None
if url is not None:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
@@ -53,7 +55,7 @@ class TwitterScraper(Scraper):
channel=channel.id,
platform_id=tweet.id,
date=tweet.date,
date_archived=datetime.now(),
date_archived=datetime.now(timezone.utc),
raw_data=tweet.json(),
archived_urls=archived_urls)

View File

@@ -13,48 +13,48 @@ logger.add(sys.stderr, level="INFO")
logger.add("../russian_telegram_ingest.log")
test_channels = [
# Channel(
# id=0,
# name="QAnon Россия",
# platform_id=-1001319637748,
# category="Qanon",
# followers=94048,
# platform="Telegram",
# url="https://t.me/qanonrus",
# screenname="qanonrus",
# country="RU",
# influencer=None,
# public=True,
# chat=False,
# notes=""),
# Channel(
# id=1,
# name="The Great Awakening | Q",
# platform_id=-1001325597521,
# category="Qanon",
# followers=5715,
# platform="Telegram",
# url="https://t.me/greatawakin",
# screenname="greatawakin",
# country="RU",
# influencer=None,
# public=True,
# chat=False,
# notes=""),
# Channel(
# id=2,
# name="Великое Пробуждение",
# platform_id=-1001285898079,
# category="Qanon",
# followers=5861,
# platform="Telegram",
# url="https://t.me/greatawakeningrus",
# screenname="greatawakeningrus",
# country="RU",
# influencer=None,
# public=True,
# chat=False,
# notes=""),
Channel(
id=0,
name="QAnon Россия",
platform_id=-1001319637748,
category="Qanon",
followers=94048,
platform="Telegram",
url="https://t.me/qanonrus",
screenname="qanonrus",
country="RU",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=1,
name="The Great Awakening | Q",
platform_id=-1001325597521,
category="Qanon",
followers=5715,
platform="Telegram",
url="https://t.me/greatawakin",
screenname="greatawakin",
country="RU",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=2,
name="Великое Пробуждение",
platform_id=-1001285898079,
category="Qanon",
followers=5861,
platform="Telegram",
url="https://t.me/greatawakeningrus",
screenname="greatawakeningrus",
country="RU",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=3,
name="T🕊Редакция Президент Гордон🕊",
@@ -134,5 +134,5 @@ controller.register_scraper(telegram)
engine = create_engine('sqlite:///russian_telegram.db')
controller.connect_to_db(engine)
controller.scrape_channels(test_channels)
controller.scrape_channels(test_channels, archive_media = False)

View File

@@ -1,4 +1,5 @@
from sqlalchemy import create_engine
from loguru import logger
from cisticola.base import Channel
from cisticola.scraper import (
@@ -12,6 +13,8 @@ from cisticola.scraper import (
TelegramTelethonScraper,
TwitterScraper)
logger.add("../test.log")
test_channels = [
Channel(
id=0,
@@ -118,12 +121,12 @@ scrapers = [
OdyseeScraper(),
RumbleScraper(),
TelegramSnscrapeScraper(),
TwitterScraper()
TelegramTelethonScraper()]
TelegramTelethonScraper(),
TwitterScraper()]
controller.register_scrapers(scrapers)
engine = create_engine('sqlite:///test3.db')
controller.connect_to_db(engine)
controller.scrape_channels(test_channels, media = True)
controller.scrape_channels(test_channels, archive_media = False)

View File

@@ -113,7 +113,7 @@ TWITTER_CHANNEL_KWARGS = {
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
@pytest.fixture(scope='package')
@pytest.fixture(scope='function')
def controller(tmpdir_factory):
"""Initialize ScraperController and SQLite database file to be used for all

View File

@@ -5,10 +5,10 @@ def test_scrape_bitchute_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['bitchute'])]
controller.register_scraper(scraper = BitchuteScraper())
controller.scrape_channels(channels = channels, media = False)
controller.scrape_channels(channels = channels, archive_media = False)
def test_scrape_bitchute_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['bitchute'])]
controller.register_scraper(scraper = BitchuteScraper())
controller.scrape_channels(channels = channels, media = True)
controller.scrape_channels(channels = channels, archive_media = True)

View File

@@ -5,10 +5,10 @@ def test_scrape_gab_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['gab'])]
controller.register_scraper(scraper = GabScraper())
controller.scrape_channels(channels = channels, media = False)
controller.scrape_channels(channels = channels, archive_media = False)
def test_scrape_gab_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['gab'])]
controller.register_scraper(scraper = GabScraper())
controller.scrape_channels(channels = channels, media = True)
controller.scrape_channels(channels = channels, archive_media = True)

View File

@@ -5,10 +5,10 @@ def test_scrape_gettr_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['gettr'])]
controller.register_scraper(scraper = GettrScraper())
controller.scrape_channels(channels = channels, media = False)
controller.scrape_channels(channels = channels, archive_media = False)
def test_scrape_gettr_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['gettr'])]
controller.register_scraper(scraper = GettrScraper())
controller.scrape_channels(channels = channels, media = True)
controller.scrape_channels(channels = channels, archive_media = True)

View File

@@ -5,10 +5,10 @@ def test_scrape_odysee_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['odysee'])]
controller.register_scraper(scraper = OdyseeScraper())
controller.scrape_channels(channels = channels, media = False)
controller.scrape_channels(channels = channels, archive_media = False)
def test_scrape_odysee_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['odysee'])]
controller.register_scraper(scraper = OdyseeScraper())
controller.scrape_channels(channels = channels, media = True)
controller.scrape_channels(channels = channels, archive_media = True)

View File

@@ -5,10 +5,10 @@ def test_scrape_rumble_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['rumble'])]
controller.register_scraper(scraper = RumbleScraper())
controller.scrape_channels(channels = channels, media = False)
controller.scrape_channels(channels = channels, archive_media = False)
def test_scrape_rumble_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['rumble'])]
controller.register_scraper(scraper = RumbleScraper())
controller.scrape_channels(channels = channels, media = True)
controller.scrape_channels(channels = channels, archive_media = True)

View File

@@ -5,10 +5,10 @@ def test_scrape_telegram_snscrape_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['telegram'])]
controller.register_scraper(scraper = TelegramSnscrapeScraper())
controller.scrape_channels(channels = channels, media = False)
controller.scrape_channels(channels = channels, archive_media = False)
def test_scrape_telegram_snscrape_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['telegram'])]
controller.register_scraper(scraper = TelegramSnscrapeScraper())
controller.scrape_channels(channels = channels, media = True)
controller.scrape_channels(channels = channels, archive_media = True)

View File

@@ -5,10 +5,10 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['telegram'])]
controller.register_scraper(scraper = TelegramTelethonScraper())
controller.scrape_channels(channels = channels, media = False)
controller.scrape_channels(channels = channels, archive_media = False)
def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['telegram'])]
controller.register_scraper(scraper = TelegramTelethonScraper())
controller.scrape_channels(channels = channels, media = True)
controller.scrape_channels(channels = channels, archive_media = True)

View File

@@ -5,10 +5,10 @@ def test_scrape_twitter_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['twitter'])]
controller.register_scraper(scraper = TwitterScraper())
controller.scrape_channels(channels = channels, media = False)
controller.scrape_channels(channels = channels, archive_media = False)
def test_scrape_twitter_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['twitter'])]
controller.register_scraper(scraper = TwitterScraper())
controller.scrape_channels(channels = channels, media = True)
controller.scrape_channels(channels = channels, archive_media = True)