Merge branch 'main' into channel-db

This commit is contained in:
Logan Williams
2022-03-22 11:49:07 +01:00
committed by GitHub
41 changed files with 970 additions and 313 deletions

View File

@@ -14,29 +14,91 @@ from cisticola.base import Channel, ScraperResult, mapper_registry
from cisticola.utils import make_request
class Scraper:
"""Base class for defining platform-specific scrapers for scraping all posts
from a given channel on that specific platform.
"""
__version__ = "Scraper 0.0.0"
def __init__(self):
    """Create the storage-archive client and the default request headers.

    Reads the ``DO_SPACES_REGION``, ``DO_SPACES_KEY`` and
    ``DO_SPACES_SECRET`` environment variables.

    Raises
    ------
    KeyError
        If any of the required environment variables is not set.
    """
    region = os.environ['DO_SPACES_REGION']

    # Initialize client to transfer files to the storage archive.
    # The client is built exactly once; the region is used both as the
    # region name and as the subdomain of the endpoint URL.
    self.s3_client = boto3.client(
        service_name='s3',
        region_name=region,
        endpoint_url=f'https://{region}.digitaloceanspaces.com',
        aws_access_key_id=os.environ['DO_SPACES_KEY'],
        aws_secret_access_key=os.environ['DO_SPACES_SECRET'])

    # Define request headers (necessary to bypass scraping protection
    # for several platform scrapers)
    self.headers = {
        'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}
def __str__(self):
return self.__version__
def get_username_from_url(self, url: str) -> str:
    """Return the username embedded in a channel URL.

    The base class provides no implementation; subclasses are expected
    to override this method.

    Parameters
    ----------
    url: str
        Address of the channel on the platform,
        e.g. ``"https://twitter.com/EliotHiggins"``

    Returns
    -------
    username: str
        The channel's username, e.g. ``"EliotHiggins"``

    Raises
    ------
    NotImplementedError
        Always, when called on the base class.
    """
    raise NotImplementedError()
def url_to_key(self, url: str, content_type: str) -> str:
    """Generate an identifier for media from a specified post.

    The key is the last non-empty segment of the URL path. Query strings
    and fragments are ignored, and trailing slashes are stripped so that
    a URL such as ``"https://rumble.com/embed/vgt7gh/"`` yields
    ``"vgt7gh"`` rather than an empty key.

    Parameters
    ---------
    url: str
        URL of original post.
        e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
    content_type: str
        Content-Type of media, e.g. ``"image/jpeg"``. Not used by this
        base implementation; it is part of the signature so overriding
        implementations can take it into account.

    Returns
    -------
    key: str
        Identifier for the media file derived from the URL path. Empty
        string if the URL has no path segments.
    """
    path = urlparse(url).path
    # Strip trailing slashes first, otherwise the last segment is ''.
    key = path.rstrip('/').split('/')[-1]
    return key
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
"""Download media file from a specified media file URL.
Parameters
---------
url: str
URL of media file from original post.
e.g. ``"https://pbs.twimg.com/media/FN0j0dYWUAcQxfK?format=png&name=medium"``
key: str or None
Pre-defined unique identifier for the media file.
Returns
-------
blob: bytes
Raw bytes of the downloaded media file.
content_type: str
Content-Type of media.
e.g. ``"image/jpeg"``.
key: str
Unique identifier for the media file.
"""
r = make_request(url, headers = self.headers)
@@ -49,6 +111,27 @@ class Scraper:
return blob, content_type, key
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
"""Download media file from a specified media URL, where the media file
is formatted as an m3u8 playlist, which is then decoded to an mp4 file.
Parameters
---------
url: str
URL of m3u8 playlist file from original post.
e.g. ``"https://media.gettr.com/group47/origin/2022/03/15/01/cbc436c1-1a1a-4b97-671d-c42109f3ec9b/out.m3u8"``
key: str or None
Pre-defined unique identifier for the media file.
Returns
-------
blob: bytes
Raw bytes of the downloaded media file.
content_type: str
Content-Type of media.
e.g. ``"video/mp4"``.
key: str
Unique identifier for the media file.
"""
content_type = 'video/mp4'
ext = '.' + content_type.split('/')[-1]
@@ -71,7 +154,28 @@ class Scraper:
return blob, content_type, key
def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
"""Download media file from a specified media URL, using a fork of
youtube-dl that enables faster downloading.
Parameters
---------
url: str
URL of media file from original post.
e.g. ``"https://rumble.com/embed/vgt7gh/"``
key: str or None
Pre-defined unique identifier for the media file.
Returns
-------
blob: bytes
Raw bytes of the downloaded media file.
content_type: str
Content-Type of media.
e.g. ``"video/mp4"``.
key: str
Unique identifier for the media file.
"""
content_type = 'video/mp4'
with tempfile.TemporaryDirectory() as temp_dir:
@@ -103,6 +207,23 @@ class Scraper:
return blob, content_type, key
def archive_blob(self, blob: bytes, content_type: str, key: str) -> str:
"""Upload raw bytes of a media file to the storage archive.
Parameters
----------
blob: bytes
Raw bytes of the media file to be archived.
content_type: str
Content-Type of media.
e.g. ``"video/mp4"``.
key: str
Unique identifier for the media file.
Returns
-------
archived_url: str
URL specifying the file on the storage archive.
"""
filename = self.__version__.replace(' ', '_') + '/' + key
@@ -114,9 +235,42 @@ class Scraper:
return archived_url
def can_handle(self, channel: Channel) -> bool:
    """Tell whether this scraper is able to scrape the given channel.

    The base class provides no implementation; subclasses are expected
    to override this method.

    Parameters
    ----------
    channel: Channel
        The channel that is a candidate for scraping.

    Returns
    -------
    bool
        ``True`` when the scraper can scrape ``channel``, otherwise
        ``False``.

    Raises
    ------
    NotImplementedError
        Always, when called on the base class.
    """
    raise NotImplementedError()
def get_posts(
        self,
        channel: Channel,
        since: ScraperResult = None,
        archive_media: bool = True) -> Generator[ScraperResult, None, None]:
    """Scrape every post of the given Channel.

    The base class provides no implementation; subclasses are expected
    to override this method.

    Parameters
    ----------
    channel: Channel
        The channel whose posts are scraped.
    since: ScraperResult or None
        The most recent ScraperResult of an earlier scrape, or ``None``
        when the scraper has not run before.
    archive_media: bool
        ``True`` to archive media files (images, video, etc.) found in
        posts, ``False`` to leave them unarchived.

    Yields
    ------
    ScraperResult
        One result per post/comment of the given Channel.

    Raises
    ------
    NotImplementedError
        Always, when called on the base class (raised at call time, as
        this base method is not itself a generator).
    """
    raise NotImplementedError()
@@ -129,9 +283,13 @@ class ScraperController:
self.session = None
def register_scraper(self, scraper: Scraper):
    """Add one Scraper instance to the end of the controller's scraper list.

    Parameters
    ----------
    scraper: Scraper
        Scraper instance to register.
    """
    registered = self.scrapers
    registered.append(scraper)
def register_scrapers(self, scraper: List[Scraper]):
    """Add several Scraper instances to the controller's scraper list.

    Parameters
    ----------
    scraper: list<Scraper>
        Scraper instances to register; they are appended in order.
    """
    registered = self.scrapers
    registered.extend(scraper)
def scrape_all_channels(self, archive_media: bool = True):
@@ -147,6 +305,17 @@ class ScraperController:
@logger.catch(reraise = True)
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
"""Scrape all posts for all specified channels.
Parameters
----------
channels: list<Channel>
List of Channel instances to be scraped
archive_media: bool
If ``True``, any media files (images, video, etc.) from posts are archived.
If ``False``, media files are not archived.
"""
if self.session is None:
logger.error("No DB session")
return
@@ -185,6 +354,9 @@ class ScraperController:
logger.warning(f"No handler found for Channel {channel}")
def connect_to_db(self, engine):
"""Connect the specified SQLAlchemy engine to the controller.
"""
# create tables
mapper_registry.metadata.create_all(bind=engine)
@@ -193,8 +365,8 @@ class ScraperController:
self.session.configure(bind=self.engine)
def reset_db(self):
    """Drop all data from the connected SQLAlchemy database.

    Drops every table known to ``mapper_registry`` on the controller's
    engine, then reconnects to that same engine through
    ``connect_to_db``.
    """
    mapper_registry.metadata.drop_all(bind=self.engine)
    # Reconnect exactly once; a second identical call adds nothing.
    self.connect_to_db(self.engine)

View File

@@ -17,7 +17,7 @@ class BitchuteScraper(Scraper):
library"""
__version__ = "BitchuteScraper 0.0.1"
def get_username_from_url(self, url):
    """Extract a Bitchute channel's username from its URL.

    Parameters
    ----------
    url: str
        URL of the channel, expected to contain
        ``"bitchute.com/channel/"`` followed by the username.

    Returns
    -------
    username: str
        Text after the last ``"bitchute.com/channel/"`` marker with
        leading/trailing slashes removed. If the marker is absent the
        whole URL (with surrounding slashes stripped) is returned.
    """
    # NOTE(review): can_handle compares this result against None, but this
    # method never returns None -- confirm whether unmatched URLs should.
    username = url.split('bitchute.com/channel/')[-1].strip('/')
    return username
@@ -33,7 +33,7 @@ class BitchuteScraper(Scraper):
detail = 'comments'
username = BitchuteScraper.get_username_from_url(channel.url)
username = self.get_username_from_url(channel.url)
scraper = get_videos_user(session, username, csrftoken, detail)
for post in scraper:
@@ -61,7 +61,7 @@ class BitchuteScraper(Scraper):
archived_urls=archived_urls)
def can_handle(self, channel):
    """Whether or not this scraper can scrape the specified channel.

    Parameters
    ----------
    channel: Channel
        Channel to be scraped.

    Returns
    -------
    bool
        ``True`` if ``channel.platform`` is ``"Bitchute"`` and
        ``get_username_from_url`` returns something other than ``None``
        for ``channel.url``; ``False`` otherwise.
    """
    # The platform is checked first so the URL is only parsed for
    # channels that belong to this platform.
    return (channel.platform == "Bitchute"
            and self.get_username_from_url(channel.url) is not None)
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -11,14 +11,14 @@ class GabScraper(Scraper):
"""An implementation of a Scraper for Gab, using GARC library"""
__version__ = "GabScraper 0.0.1"
def get_username_from_url(self, url):
    """Extract a Gab channel's username from its URL.

    Parameters
    ----------
    url: str
        URL of the channel, expected to start with ``"https://gab.com/"``.

    Returns
    -------
    username: str
        Text after the last ``"https://gab.com/"`` marker. If the marker
        is absent the URL is returned unchanged.
    """
    # NOTE(review): can_handle compares this result against None, but this
    # method never returns None -- confirm whether unmatched URLs should.
    username = url.split('https://gab.com/')[-1]
    return username
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
client = Garc(profile = 'main')
username = GabScraper.get_username_from_url(channel.url)
username = self.get_username_from_url(channel.url)
scraper = client.userposts(username)
@@ -52,5 +52,5 @@ class GabScraper(Scraper):
archived_urls=archived_urls)
def can_handle(self, channel):
    """Whether or not this scraper can scrape the specified channel.

    Parameters
    ----------
    channel: Channel
        Channel to be scraped.

    Returns
    -------
    bool
        ``True`` if ``channel.platform`` is ``"Gab"`` and
        ``get_username_from_url`` returns something other than ``None``
        for ``channel.url``; ``False`` otherwise.
    """
    # The platform is checked first so the URL is only parsed for
    # channels that belong to this platform.
    return (channel.platform == "Gab"
            and self.get_username_from_url(channel.url) is not None)

View File

@@ -12,7 +12,7 @@ class GettrScraper(Scraper):
"""An implementation of a Scraper for Gettr, using gogettr library"""
__version__ = "GettrScraper 0.0.1"
def get_username_from_url(url):
def get_username_from_url(self, url):
username = url.split("gettr.com/user/")[1]
if len(username.split("/")) > 1:
return None
@@ -21,7 +21,7 @@ class GettrScraper(Scraper):
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
client = PublicClient()
username = GettrScraper.get_username_from_url(channel.url)
username = self.get_username_from_url(channel.url)
scraper = client.user_activity(username=username, type="posts")
for post in scraper:
@@ -62,7 +62,7 @@ class GettrScraper(Scraper):
archived_urls=archived_urls)
def can_handle(self, channel):
    """Whether or not this scraper can scrape the specified channel.

    Parameters
    ----------
    channel: Channel
        Channel to be scraped.

    Returns
    -------
    bool
        ``True`` if ``channel.platform`` is ``"Gettr"`` and
        ``get_username_from_url`` returns something other than ``None``
        for ``channel.url``; ``False`` otherwise.
    """
    # The platform is checked first so the URL is only parsed for
    # channels that belong to this platform.
    return (channel.platform == "Gettr"
            and self.get_username_from_url(channel.url) is not None)
def url_to_key(self, url: str, content_type: str) -> str:

View File

@@ -18,6 +18,7 @@ CONTENT_TYPES = {
'mp4' : 'video/mp4'}
class InstagramScraper(Scraper):
"""An implementation of a Scraper for Instagram, using instaloader library"""
__version__ = "InstagramScraper 0.0.1"
def get_username_from_url(self, url):

View File

@@ -13,7 +13,7 @@ class OdyseeScraper(Scraper):
"""An implementation of a Scraper for Odysee, using polyphemus library"""
__version__ = "OdyseeScraper 0.0.1"
def get_username_from_url(url):
def get_username_from_url(self, url):
username = url.split('odysee.com/')[-1].strip('@').split(':')[0]
@@ -21,7 +21,7 @@ class OdyseeScraper(Scraper):
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = OdyseeScraper.get_username_from_url(channel.url)
username = self.get_username_from_url(channel.url)
odysee_channel = OdyseeChannel(channel_name = username)
all_videos = odysee_channel.get_all_videos()
@@ -70,7 +70,7 @@ class OdyseeScraper(Scraper):
archived_urls={})
def can_handle(self, channel):
    """Whether or not this scraper can scrape the specified channel.

    Parameters
    ----------
    channel: Channel
        Channel to be scraped.

    Returns
    -------
    bool
        ``True`` if ``channel.platform`` is ``"Odysee"`` and
        ``get_username_from_url`` returns something other than ``None``
        for ``channel.url``; ``False`` otherwise.
    """
    # The platform is checked first so the URL is only parsed for
    # channels that belong to this platform.
    return (channel.platform == "Odysee"
            and self.get_username_from_url(channel.url) is not None)
def url_to_key(self, url: str, content_type: str) -> str:

View File

@@ -14,14 +14,14 @@ class RumbleScraper(Scraper):
"""An implementation of a Scraper for Rumble, using custom functions"""
__version__ = "RumbleScraper 0.0.1"
def get_username_from_url(self, url):
    """Extract a Rumble channel's username from its URL.

    Parameters
    ----------
    url: str
        URL of the channel, expected to start with
        ``"https://rumble.com/c/"``.

    Returns
    -------
    username: str or None
        Text following the first ``"https://rumble.com/c/"`` marker, or
        ``None`` if the URL does not contain that marker.
    """
    parts = url.split('https://rumble.com/c/')
    # Without the marker there is no second piece; report "no username"
    # instead of raising IndexError, since can_handle tests for None.
    if len(parts) < 2:
        return None
    return parts[1]
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = RumbleScraper.get_username_from_url(channel.url)
username = self.get_username_from_url(channel.url)
scraper = get_channel_videos(username)
for post in scraper:
@@ -54,7 +54,7 @@ class RumbleScraper(Scraper):
return key
def can_handle(self, channel):
    """Whether or not this scraper can scrape the specified channel.

    Parameters
    ----------
    channel: Channel
        Channel to be scraped.

    Returns
    -------
    bool
        ``True`` if ``channel.platform`` is ``"Rumble"`` and
        ``get_username_from_url`` returns something other than ``None``
        for ``channel.url``; ``False`` otherwise.
    """
    # The platform is checked first so the URL is only parsed for
    # channels that belong to this platform.
    return (channel.platform == "Rumble"
            and self.get_username_from_url(channel.url) is not None)
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -8,6 +8,7 @@ from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
class TelegramSnscrapeScraper(Scraper):
"""An implementation of a Scraper for Telegram, using snscrape library"""
__version__ = "TelegramSnscrapeScraper 0.0.1"
def can_handle(self, channel):

View File

@@ -14,6 +14,7 @@ from cisticola.scraper.base import Scraper
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
class TelegramTelethonScraper(Scraper):
"""An implementation of a Scraper for Telegram, using Telethon library"""
__version__ = "TelegramTelethonScraper 0.0.1"
def get_username_from_url(self, url):
@@ -30,9 +31,9 @@ class TelegramTelethonScraper(Scraper):
username = self.get_username_from_url(channel.url)
api_id = os.environ['TELEGRAM_API_ID_1']
api_hash = os.environ['TELEGRAM_API_HASH_1']
phone = os.environ['TELEGRAM_PHONE_1']
api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH']
phone = os.environ['TELEGRAM_PHONE']
with TelegramClient(phone, api_id, api_hash) as client: