mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-13 05:48:33 +03:00
added and made more consistent docstrings, wrote script that makes minor edits to Sphinx apidocs to improve documentation clarity
This commit is contained in:
@@ -24,7 +24,7 @@ PIL.Image.MAX_IMAGE_PIXELS = 1024 * 1024 * 256
|
||||
|
||||
@dataclass
|
||||
class ScraperResult:
|
||||
"""A minimally processed result from a scraper
|
||||
"""Minimally processed set of information from a scraper about one post
|
||||
"""
|
||||
|
||||
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
|
||||
@@ -100,7 +100,7 @@ class Channel:
|
||||
|
||||
@dataclass
|
||||
class RawChannelInfo:
|
||||
"""A minimally processed result from a scraper
|
||||
"""Minimally processed set of information from a scraper about one channel
|
||||
"""
|
||||
|
||||
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
|
||||
|
||||
@@ -323,19 +323,43 @@ class ScraperController:
|
||||
self.session = None
|
||||
|
||||
def register_scraper(self, scraper: Scraper):
|
||||
"""Register a single Scraper instance to the controller.
|
||||
"""Add a single Scraper instance to the list of available Scrapers.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
scraper: cisticola.scraper.Scraper
|
||||
Instance of platform-specific scraper to be controlled by the ScraperController
|
||||
"""
|
||||
self.scrapers.append(scraper)
|
||||
|
||||
def register_scrapers(self, scraper: List[Scraper]):
|
||||
"""Register a list of Scraper instances to the controller.
|
||||
def register_scrapers(self, scrapers: List[Scraper]):
|
||||
"""Add a list of Scraper instances to the list of available Scrapers.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
scrapers: list<cisticola.scraper.Scraper>
|
||||
List of instances of platform-specific scrapers to be controlled by the ScraperController
|
||||
|
||||
"""
|
||||
self.scrapers.extend(scraper)
|
||||
self.scrapers.extend(scrapers)
|
||||
|
||||
def remove_all_scrapers(self):
|
||||
"""Reset the ScraperController so that it doesn't control any scrapers
|
||||
"""
|
||||
self.scrapers = []
|
||||
|
||||
def scrape_all_channels(self, archive_media: bool = True, fetch_old: bool = False):
|
||||
"""Scrape posts from all channels in the database that satisfy researcher-specified criteria
|
||||
|
||||
Parameters
|
||||
----------
|
||||
archive_media: bool
|
||||
If ``True``, any media files (images, video, etc.) from posts are archived.
|
||||
If ``False``, media files are not archived.
|
||||
fetch_old: bool
|
||||
If ``True``, scrape all posts from channels, regardless of when channel was last scraped.
|
||||
If ``False``, scrape only posts that are more recent than the previous scrape of each channel.
|
||||
"""
|
||||
if self.session is None:
|
||||
logger.error("No DB session")
|
||||
return
|
||||
@@ -350,6 +374,8 @@ class ScraperController:
|
||||
return self.scrape_channels(channels, archive_media=archive_media, fetch_old=fetch_old)
|
||||
|
||||
def scrape_all_channel_info(self):
|
||||
"""Scrape profile information from all channels in the database.
|
||||
"""
|
||||
if self.session is None:
|
||||
logger.error("No DB session")
|
||||
return
|
||||
@@ -368,7 +394,7 @@ class ScraperController:
|
||||
return self.scrape_channel_info(channels)
|
||||
|
||||
def scrape_channels(self, channels: List[Channel], archive_media: bool = True, fetch_old: bool = False):
|
||||
"""Scrape all posts for all specified channels.
|
||||
"""Scrape all posts from a specified list of channels.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -376,7 +402,10 @@ class ScraperController:
|
||||
List of Channel instances to be scraped
|
||||
archive_media: bool
|
||||
If ``True``, any media files (images, video, etc.) from posts are archived.
|
||||
If ``False``, media files are not archived.
|
||||
If ``False``, media files are not archived.
|
||||
fetch_old: bool
|
||||
If ``True``, scrape all posts from channels, regardless of when channel was last scraped.
|
||||
If ``False``, scrape only posts that are more recent than the previous scrape of each channel.
|
||||
"""
|
||||
|
||||
if self.session is None:
|
||||
@@ -455,6 +484,16 @@ class ScraperController:
|
||||
session.close()
|
||||
|
||||
def archive_unarchived_media_batch(self, session = None, chronological=False):
|
||||
"""Archive previously unarchived media URLs from a batch of raw_post rows.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
session: sqlalchemy.orm.Session or None
|
||||
SQLAlchemy Session that interfaces with the database
|
||||
chronological: bool
|
||||
If ``True``, media attachments are archived starting with the oldest post
|
||||
If ``False``, media attachments are archived in random order
|
||||
"""
|
||||
if session is None:
|
||||
session = self.session()
|
||||
if chronological:
|
||||
@@ -489,6 +528,14 @@ class ScraperController:
|
||||
|
||||
@logger.catch(reraise = True)
|
||||
def archive_unarchived_media(self, chronological=False):
|
||||
"""Archive previously unarchived media URLs from all raw_post rows.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
chronological: bool
|
||||
If ``True``, media attachments are archived starting with the oldest post
|
||||
If ``False``, media attachments are archived in random order
|
||||
"""
|
||||
if self.session is None:
|
||||
logger.error("No DB session")
|
||||
return
|
||||
@@ -498,9 +545,6 @@ class ScraperController:
|
||||
while True:
|
||||
self.archive_unarchived_media_batch(self, session=session, chronological=chronological)
|
||||
|
||||
|
||||
session.close()
|
||||
|
||||
@logger.catch(reraise = True)
|
||||
def scrape_channel_info(self, channels: List[Channel]):
|
||||
"""Scrape channel info for specified channels.
|
||||
@@ -509,9 +553,6 @@ class ScraperController:
|
||||
----------
|
||||
channels: list<Channel>
|
||||
List of Channel instances to be scraped
|
||||
archive_media: bool
|
||||
If ``True``, any media files (images, video, etc.) from posts are archived.
|
||||
If ``False``, media files are not archived.
|
||||
"""
|
||||
|
||||
if self.session is None:
|
||||
@@ -551,6 +592,11 @@ class ScraperController:
|
||||
|
||||
def connect_to_db(self, engine):
|
||||
"""Connect the specified SQLAlchemy engine to the controller.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
engine: sqlalchemy.engine.Engine
|
||||
Instance of SQLAlchemy engine to connect to
|
||||
"""
|
||||
|
||||
# create tables
|
||||
|
||||
@@ -30,7 +30,7 @@ class Transformer:
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
True if it can be handled by this Transformer, false otherwise.
|
||||
``True`` if it can be handled by this Transformer, ``False`` otherwise.
|
||||
"""
|
||||
|
||||
pass
|
||||
@@ -52,22 +52,36 @@ class Transformer:
|
||||
pass
|
||||
|
||||
def transform_media(self, data: ScraperResult, transformed: Post, insert: Callable):
|
||||
'''Transform media'''
|
||||
"""Transform a post's media attachment to standard form and insert into database.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data: cisticola.base.ScraperResult
|
||||
Raw post data of post that media file was attached to
|
||||
transformed: cisticola.base.Post
|
||||
Transformed post data of post that media file was attached to
|
||||
insert: Callable
|
||||
A function that either inserts the object into a database or finds an object with the
|
||||
relevant unique constraints if applicable.
|
||||
"""
|
||||
for k in data.archived_urls:
|
||||
if data.archived_urls[k]:
|
||||
archived_url = data.archived_urls[k]
|
||||
filename = archived_url.split('/')[-1]
|
||||
ext = None if '.' not in filename else filename.split('.')[-1].lower()
|
||||
|
||||
if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
|
||||
insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform))
|
||||
elif ext == 'oga' or ext == 'mp3' or ext == "wav" or ext == 'aif' or ext == 'aiff' or ext == 'aac':
|
||||
insert(Audio(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform))
|
||||
elif ext == 'jpg' or ext == 'jpeg' or ext == 'png' or ext == 'gif' or ext == 'bmp' or ext == 'heic' or ext == 'tiff':
|
||||
insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform))
|
||||
media_kwargs = dict(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform)
|
||||
|
||||
if ext in ('mp4', 'mov', 'avi', 'mkv'):
|
||||
media_class = Video
|
||||
elif ext in ('oga', 'mp3', "wav", 'aif', 'aiff', 'aac'):
|
||||
media_class = Audio
|
||||
elif ext in ('jpg', 'jpeg', 'png', 'gif', 'bmp', 'heic', 'tiff'):
|
||||
media_class = Image
|
||||
else:
|
||||
logger.warning(f"Unknown file extension {ext}")
|
||||
insert(Media(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform))
|
||||
media_class = Media
|
||||
insert(media_class(**media_kwargs))
|
||||
|
||||
|
||||
class ETLController:
|
||||
@@ -81,27 +95,35 @@ class ETLController:
|
||||
self.transformers = []
|
||||
|
||||
def register_transformer(self, transformer: Transformer):
|
||||
"""Adds a Transformer to the list of available Transformers.
|
||||
"""Add a single Transformer instance to the list of available Transformers.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
transformer : Transformer
|
||||
The Transformer to register
|
||||
Instance of platform-specific Transformer to be controlled by the ETLController
|
||||
"""
|
||||
|
||||
self.transformers.append(transformer)
|
||||
|
||||
def register_transformers(self, transformers):
|
||||
"""Add a list of Transformer instances to the list of available Transformers.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
transformers: list<Transformer>
|
||||
List of instances of platform-specific Transformers to be controlled by the ETLController
|
||||
|
||||
"""
|
||||
for t in transformers:
|
||||
self.register_transformer(t)
|
||||
|
||||
def connect_to_db(self, engine: Engine):
|
||||
"""Connects the ETLController to a SQLAlchemy engine.
|
||||
"""Connect the ETLController to a SQLAlchemy engine.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
engine : Engine
|
||||
SQLAlchemy Engine object
|
||||
engine : sqlalchemy.engine.Engine
|
||||
Instance of SQLAlchemy Engine object to connect to
|
||||
"""
|
||||
# create tables
|
||||
mapper_registry.metadata.create_all(bind=engine)
|
||||
@@ -111,11 +133,36 @@ class ETLController:
|
||||
|
||||
# MAY4 can try adding some new functions for batching post inserts
|
||||
def flush_posts(self, session):
|
||||
"""Save all outstanding posts to the database. For efficiency, instead of saving posts one at a time, the ETLController maintains a list of posts (``posts_to_insert``) and saves them in bulk.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
session: sqlalchemy.orm.Session
|
||||
SQLAlchemy Session that interfaces with the database
|
||||
"""
|
||||
session.bulk_save_objects(self.posts_to_insert)
|
||||
# logger.info(f"Bulk saved {len(self.posts_to_insert)} posts")
|
||||
self.posts_to_insert = []
|
||||
|
||||
def insert_post(self, obj, session, hydrate: bool = True, flush: bool = False):
|
||||
"""Insert an object into the connected database.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
obj:
|
||||
Instance of ORM-mapped class in the ``cisticola.base`` module to be inserted into the database
|
||||
session: sqlalchemy.orm.Session
|
||||
SQLAlchemy Session that interfaces with the database
|
||||
hydrate: bool
|
||||
If ``True``, additional data fields are extracted from the object and populated in the given database table
|
||||
flush: bool
|
||||
If ``True``, the object is returned with additional populated data fields (such as a primary key ID).
|
||||
If ``False``, the object is added to ``posts_to_insert`` and nothing is returned
|
||||
|
||||
Returns
|
||||
-------
|
||||
None, or instance of ORM-mapped class from ``cisticola.base`` that has been inserted into the database, with additional data fields if ``flush`` argument is ``True``.
|
||||
"""
|
||||
if hydrate and type(obj) != Video:
|
||||
obj.hydrate()
|
||||
|
||||
@@ -133,8 +180,23 @@ class ETLController:
|
||||
return None
|
||||
|
||||
def insert_or_select(self, obj, session, hydrate: bool = True):
|
||||
"""Inserts an object into the database or returns an existing object from the database.
|
||||
Regardless, the resulting object has an `id` attribute that can be referenced later."""
|
||||
"""Insert an object into the database or return an existing object from the database.
|
||||
Regardless, the resulting object has an `id` attribute that can be referenced later.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
obj:
|
||||
Instance of ORM-mapped class in the ``cisticola.base`` module to be inserted into the database
|
||||
session: sqlalchemy.orm.Session
|
||||
SQLAlchemy Session that interfaces with the database
|
||||
hydrate: bool
|
||||
If ``True``, additional data fields are extracted from the object and populated in the given database table
|
||||
|
||||
Returns
|
||||
-------
|
||||
Object that has been inserted into the database, or existing object in the database, or None.
|
||||
|
||||
"""
|
||||
|
||||
instance = None
|
||||
|
||||
@@ -209,7 +271,7 @@ class ETLController:
|
||||
|
||||
@logger.catch(reraise=True)
|
||||
def transform_results(self, results: List[ScraperResult], hydrate: bool = True):
|
||||
"""Transforms raw ScraperResults objects into Post objects and
|
||||
"""Transform raw ScraperResults objects into Post objects and
|
||||
Media objects. Then, adds them to the database.
|
||||
|
||||
Parameters
|
||||
@@ -254,6 +316,8 @@ class ETLController:
|
||||
----------
|
||||
hydrate : bool
|
||||
Whether or not to fully hydrate transformed media. Default True.
|
||||
min_date: datetime.datetime
|
||||
Posts made before this date are not transformed.
|
||||
"""
|
||||
|
||||
if self.session is None:
|
||||
@@ -263,7 +327,6 @@ class ETLController:
|
||||
session = self.session()
|
||||
|
||||
BATCH_SIZE = 5000
|
||||
offset = 0
|
||||
batch = []
|
||||
|
||||
logger.info(f"Fetching first untransformed post batch of {BATCH_SIZE}")
|
||||
@@ -294,9 +357,15 @@ class ETLController:
|
||||
).all()
|
||||
|
||||
|
||||
|
||||
@logger.catch(reraise=True)
|
||||
def transform_info(self, results: List[ChannelInfo]):
|
||||
"""Transform raw RawChannelInfo objects into ChannelInfo objects.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
results : List[ChannelInfo]
|
||||
A list of ChannelInfo objects to be transformed
|
||||
"""
|
||||
if self.session is None:
|
||||
logger.error("No DB session")
|
||||
return
|
||||
@@ -325,6 +394,9 @@ class ETLController:
|
||||
|
||||
@logger.catch(reraise=True)
|
||||
def transform_all_untransformed_info(self):
|
||||
"""Transform all RawChannelInfo objects in the database that do not have an
|
||||
equivalent ChannelInfo object stored.
|
||||
"""
|
||||
if self.session is None:
|
||||
logger.error("No DB session")
|
||||
return
|
||||
@@ -355,15 +427,15 @@ class ETLController:
|
||||
|
||||
@logger.catch(reraise=True)
|
||||
def transform_media(self, results: List, hydrate: bool = True):
|
||||
"""Transforms raw ScraperResults objects into Post objects and
|
||||
Media objects. Then, adds them to the database.
|
||||
"""Transform raw ScraperResults objects into Post objects and
|
||||
Media objects, then add them to the database.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
results : List[ScraperResult]
|
||||
A list of ScraperResult objects to be transformed
|
||||
hydrate : bool
|
||||
Whether or not to fully hydrate transformed media. Default True.
|
||||
Whether or not to fully hydrate transformed media. Default ``True``.
|
||||
"""
|
||||
if self.session is None:
|
||||
logger.error("No DB session")
|
||||
|
||||
Reference in New Issue
Block a user