mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-12 13:28:34 +03:00
added docstrings, improved Sphinx docs
This commit is contained in:
@@ -13,29 +13,73 @@ from cisticola.base import Channel, ScraperResult, mapper_registry
|
||||
from cisticola.scraper import make_request
|
||||
|
||||
class Scraper:
|
||||
"""Base class for defining platform-specific scrapers for scraping all posts
|
||||
from a given channel on that specific platform.
|
||||
"""
|
||||
|
||||
__version__ = "Scraper 0.0.0"
|
||||
|
||||
def __init__(self):
|
||||
self.s3_client = boto3.client('s3',
|
||||
region_name=os.environ['DO_SPACES_REGION'],
|
||||
endpoint_url='https://{}.digitaloceanspaces.com'.format(
|
||||
os.environ['DO_SPACES_REGION']),
|
||||
aws_access_key_id=os.environ['DO_SPACES_KEY'],
|
||||
aws_secret_access_key=os.environ['DO_SPACES_SECRET'])
|
||||
|
||||
# Initialize client to transfer files to the storage archive
|
||||
self.s3_client = boto3.client(
|
||||
service_name='s3',
|
||||
region_name=os.environ['DO_SPACES_REGION'],
|
||||
endpoint_url=f'https://{os.environ["DO_SPACES_REGION"]}.digitaloceanspaces.com',
|
||||
aws_access_key_id=os.environ['DO_SPACES_KEY'],
|
||||
aws_secret_access_key=os.environ['DO_SPACES_SECRET'])
|
||||
|
||||
# Define request headers (necessary to bypass scraping protection
|
||||
# for several platform scrapers)
|
||||
self.headers = {
|
||||
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}
|
||||
|
||||
pass
|
||||
|
||||
def __str__(self):
|
||||
return self.__version__
|
||||
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
|
||||
"""Generate a unique identifier for media from a specified post.
|
||||
|
||||
Parameters
|
||||
---------
|
||||
url: str
|
||||
URL of original post.
|
||||
e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
|
||||
content_type: str
|
||||
Content-Type of media.
|
||||
e.g. ``"image/jpeg"``
|
||||
|
||||
Returns
|
||||
-------
|
||||
key: str
|
||||
Unique identifier for the media file from a specified post based on
|
||||
the original post URL and the media's Content-Type.
|
||||
"""
|
||||
|
||||
key = urlparse(url).path.split('/')[-1]
|
||||
return key
|
||||
|
||||
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||
"""Download media file from a specified post URL.
|
||||
|
||||
Parameters
|
||||
---------
|
||||
url: str
|
||||
URL of original post.
|
||||
e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
|
||||
key: str or None
|
||||
Pre-defined unique identifier for the media file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
blob: bytes
|
||||
Raw bytes of the downloaded media file.
|
||||
content_type: str
|
||||
Content-Type of media.
|
||||
e.g. ``"image/jpeg"``.
|
||||
key: str
|
||||
Unique identifier for the media file.
|
||||
"""
|
||||
|
||||
r = make_request(url, headers = self.headers)
|
||||
|
||||
@@ -48,6 +92,27 @@ class Scraper:
|
||||
return blob, content_type, key
|
||||
|
||||
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||
"""Download media file from a specified post URL, where the media file
|
||||
is formatted as an m3u8 playlist, which is then decoded to an mp4 file.
|
||||
|
||||
Parameters
|
||||
---------
|
||||
url: str
|
||||
URL of original post.
|
||||
e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
|
||||
key: str or None
|
||||
Pre-defined unique identifier for the media file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
blob: bytes
|
||||
Raw bytes of the downloaded media file.
|
||||
content_type: str
|
||||
Content-Type of media.
|
||||
e.g. ``"video/mp4"``.
|
||||
key: str
|
||||
Unique identifier for the media file.
|
||||
"""
|
||||
|
||||
content_type = 'video/mp4'
|
||||
ext = '.' + content_type.split('/')[-1]
|
||||
@@ -70,6 +135,23 @@ class Scraper:
|
||||
return blob, content_type, key
|
||||
|
||||
def archive_blob(self, blob: bytes, content_type: str, key: str) -> str:
|
||||
"""Upload raw bytes of a media file to the storage archive.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
blob: bytes
|
||||
Raw bytes of the media file to be archived.
|
||||
content_type: str
|
||||
Content-Type of media.
|
||||
e.g. ``"video/mp4"``.
|
||||
key: str
|
||||
Unique identifier for the media file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
archived_url: str
|
||||
URL specifying the file on the storage archive.
|
||||
"""
|
||||
|
||||
filename = self.__version__.replace(' ', '_') + '/' + key
|
||||
|
||||
@@ -81,9 +163,37 @@ class Scraper:
|
||||
return archived_url
|
||||
|
||||
def can_handle(self, channel: Channel) -> bool:
|
||||
"""Whether or not the scraper can scrape the specified channel.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
channel: Channel
|
||||
Channel to be scraped.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
``True`` if the scraper is capable of scraping ``channel``,
|
||||
``False`` if not.
|
||||
"""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
"""Scrape all posts from the specified Channel.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
channel: Channel
|
||||
Channel to be scraped.
|
||||
since: ScraperResult or None
|
||||
Most recently scraped ScraperResult from a previous scrape, or
|
||||
``None`` if scraper has not run before.
|
||||
archive_media: bool
|
||||
If ``True``, any media files (images, video, etc.) from posts are archived.
|
||||
If ``False``, media files are not archived.
|
||||
"""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@@ -97,13 +207,28 @@ class ScraperController:
|
||||
self.mapper_registry = None
|
||||
|
||||
def register_scraper(self, scraper: Scraper):
|
||||
"""Register a single Scraper instance to the controller.
|
||||
"""
|
||||
self.scrapers.append(scraper)
|
||||
|
||||
def register_scrapers(self, scraper: List[Scraper]):
|
||||
"""Register a list of Scraper instances to the controller.
|
||||
"""
|
||||
self.scrapers.extend(scraper)
|
||||
|
||||
@logger.catch
|
||||
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
|
||||
"""Scrape all posts for all specified channels.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
channels: list<Channel>
|
||||
List of Channel instances to be scraped
|
||||
archive_media: bool
|
||||
If ``True``, any media files (images, video, etc.) from posts are archived.
|
||||
If ``False``, media files are not archived.
|
||||
"""
|
||||
|
||||
if self.session is None:
|
||||
logger.error("No DB session")
|
||||
return
|
||||
@@ -143,15 +268,11 @@ class ScraperController:
|
||||
logger.warning(f"No handler found for Channel {channel}")
|
||||
|
||||
def connect_to_db(self, engine):
|
||||
"""Connect the specified SQLAlchemy engine to the controller.
|
||||
"""
|
||||
|
||||
# create tables
|
||||
mapper_registry.metadata.create_all(bind=engine)
|
||||
|
||||
self.session = sessionmaker()
|
||||
self.session.configure(bind=engine)
|
||||
|
||||
|
||||
class ETLController:
|
||||
"""This class will transform the raw_data tables into a format more conducive to analysis."""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
self.session.configure(bind=engine)
|
||||
Reference in New Issue
Block a user