added and made more consistent docstrings, wrote script that makes minor edits to Sphinx apidocs to improve documentation clarity

This commit is contained in:
Tristan Lee
2023-08-03 17:27:33 -05:00
parent b8ddc400f3
commit edd772eb94
7 changed files with 183 additions and 39 deletions

View File

@@ -24,7 +24,7 @@ PIL.Image.MAX_IMAGE_PIXELS = 1024 * 1024 * 256
@dataclass
class ScraperResult:
"""A minimally processed result from a scraper
"""Minimally processed set of information from a scraper about one post
"""
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
@@ -100,7 +100,7 @@ class Channel:
@dataclass
class RawChannelInfo:
"""A minimally processed result from a scraper
"""Minimally processed set of information from a scraper about one channel
"""
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.

View File

@@ -323,19 +323,43 @@ class ScraperController:
self.session = None
def register_scraper(self, scraper: Scraper):
"""Register a single Scraper instance to the controller.
"""Add a single Scraper instance to the list of available Scrapers.
Parameters
----------
scraper: cisticola.scraper.Scraper
Instance of platform-specific scraper to be controlled by the ScraperController
"""
self.scrapers.append(scraper)
def register_scrapers(self, scraper: List[Scraper]):
"""Register a list of Scraper instances to the controller.
def register_scrapers(self, scrapers: List[Scraper]):
"""Add a list of Scraper instances to the list of available Scrapers.
Parameters
----------
scrapers: list<cisticola.scraper.Scraper>
List of instances of platform-specific scrapers to be controlled by the ScraperController
"""
self.scrapers.extend(scraper)
self.scrapers.extend(scrapers)
def remove_all_scrapers(self):
"""Reset the ScraperController so that it doesn't control any scrapers
"""
self.scrapers = []
def scrape_all_channels(self, archive_media: bool = True, fetch_old: bool = False):
"""Scrape posts from all channels in the database that satisfy researcher-specified criteria
Parameters
----------
archive_media: bool
If ``True``, any media files (images, video, etc.) from posts are archived.
If ``False``, media files are not archived.
fetch_old: bool
If ``True``, scrape all posts from channels, regardless of when channel was last scraped.
If ``False``, scrape only posts that are more recent than the previous scrape of each channel.
"""
if self.session is None:
logger.error("No DB session")
return
@@ -350,6 +374,8 @@ class ScraperController:
return self.scrape_channels(channels, archive_media=archive_media, fetch_old=fetch_old)
def scrape_all_channel_info(self):
"""Scrape profile information from all channels in the database.
"""
if self.session is None:
logger.error("No DB session")
return
@@ -368,7 +394,7 @@ class ScraperController:
return self.scrape_channel_info(channels)
def scrape_channels(self, channels: List[Channel], archive_media: bool = True, fetch_old: bool = False):
"""Scrape all posts for all specified channels.
"""Scrape all posts from a specified list of channels.
Parameters
----------
@@ -376,7 +402,10 @@ class ScraperController:
List of Channel instances to be scraped
archive_media: bool
If ``True``, any media files (images, video, etc.) from posts are archived.
If ``False``, media files are not archived.
If ``False``, media files are not archived.
fetch_old: bool
If ``True``, scrape all posts from channels, regardless of when channel was last scraped.
If ``False``, scrape only posts that are more recent than the previous scrape of each channel.
"""
if self.session is None:
@@ -455,6 +484,16 @@ class ScraperController:
session.close()
def archive_unarchived_media_batch(self, session = None, chronological=False):
"""Archive previously unarchived media URLs from a batch of raw_post rows.
Parameters
----------
session: sqlalchemy.orm.Session or None
SQLAlchemy Session that interfaces with the database
chronological: bool
If ``True``, media attachments are archived starting with the oldest post
If ``False``, media attachments are archived in random order
"""
if session is None:
session = self.session()
if chronological:
@@ -489,6 +528,14 @@ class ScraperController:
@logger.catch(reraise = True)
def archive_unarchived_media(self, chronological=False):
"""Archive previously unarchived media URLs from all raw_post rows.
Parameters
----------
chronological: bool
If ``True``, media attachments are archived starting with the oldest post
If ``False``, media attachments are archived in random order
"""
if self.session is None:
logger.error("No DB session")
return
@@ -498,9 +545,6 @@ class ScraperController:
while True:
self.archive_unarchived_media_batch(self, session=session, chronological=chronological)
session.close()
@logger.catch(reraise = True)
def scrape_channel_info(self, channels: List[Channel]):
"""Scrape channel info for specified channels.
@@ -509,9 +553,6 @@ class ScraperController:
----------
channels: list<Channel>
List of Channel instances to be scraped
archive_media: bool
If ``True``, any media files (images, video, etc.) from posts are archived.
If ``False``, media files are not archived.
"""
if self.session is None:
@@ -551,6 +592,11 @@ class ScraperController:
def connect_to_db(self, engine):
"""Connect the specified SQLAlchemy engine to the controller.
Parameters
----------
engine: sqlalchemy.engine.Engine
Instance of SQLAlchemy engine to connect to
"""
# create tables

View File

@@ -30,7 +30,7 @@ class Transformer:
Returns
-------
bool
True if it can be handled by this Transformer, false otherwise.
``True`` if it can be handled by this Transformer, ``False`` otherwise.
"""
pass
@@ -52,22 +52,36 @@ class Transformer:
pass
def transform_media(self, data: ScraperResult, transformed: Post, insert: Callable):
'''Transform media'''
"""Transform a post's media attachment to standard form and insert into database.
Parameters
----------
data: cisticola.base.ScraperResult
Raw post data of post that media file was attached to
transformed: cisticola.base.Post
Transformed post data of post that media file was attached to
insert: Callable
A function that either inserts the object into a database or finds an object with the
relevant unique constraints if applicable.
"""
for k in data.archived_urls:
if data.archived_urls[k]:
archived_url = data.archived_urls[k]
filename = archived_url.split('/')[-1]
ext = None if '.' not in filename else filename.split('.')[-1].lower()
if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform))
elif ext == 'oga' or ext == 'mp3' or ext == "wav" or ext == 'aif' or ext == 'aiff' or ext == 'aac':
insert(Audio(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform))
elif ext == 'jpg' or ext == 'jpeg' or ext == 'png' or ext == 'gif' or ext == 'bmp' or ext == 'heic' or ext == 'tiff':
insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform))
media_kwargs = dict(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform)
if ext in ('mp4', 'mov', 'avi', 'mkv'):
media_class = Video
elif ext in ('oga', 'mp3', "wav", 'aif', 'aiff', 'aac'):
media_class = Audio
elif ext in ('jpg', 'jpeg', 'png', 'gif', 'bmp', 'heic', 'tiff'):
media_class = Image
else:
logger.warning(f"Unknown file extension {ext}")
insert(Media(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform))
media_class = Media
insert(media_class(**media_kwargs))
class ETLController:
@@ -81,27 +95,35 @@ class ETLController:
self.transformers = []
def register_transformer(self, transformer: Transformer):
"""Adds a Transformer to the list of available Transformers.
"""Add a single Transformer instance to the list of available Transformers.
Parameters
----------
transformer : Transformer
The Transformer to register
Instance of platform-specific Transformer to be controlled by the ETLController
"""
self.transformers.append(transformer)
def register_transformers(self, transformers):
"""Add a list of Transformer instances to the list of available Transformers.
Parameters
----------
transformers: list<Transformer>
List of instances of platform-specific Transformers to be controlled by the ETLController
"""
for t in transformers:
self.register_transformer(t)
def connect_to_db(self, engine: Engine):
"""Connects the ETLController to a SQLAlchemy engine.
"""Connect the ETLController to a SQLAlchemy engine.
Parameters
----------
engine : Engine
SQLAlchemy Engine object
engine : sqlalchemy.engine.Engine
Instance of SQLAlchemy Engine object to connect to
"""
# create tables
mapper_registry.metadata.create_all(bind=engine)
@@ -111,11 +133,36 @@ class ETLController:
# MAY4 can try adding some new functions for batching post inserts
def flush_posts(self, session):
"""Save all outstanding posts to the database. For efficiency, instead of saving posts one at a time, the ETLController maintains a list of posts (``posts_to_insert``) and saves them in bulk.
Parameters
----------
session: sqlalchemy.orm.Session
SQLAlchemy Session that interfaces with the database
"""
session.bulk_save_objects(self.posts_to_insert)
# logger.info(f"Bulk saved {len(self.posts_to_insert)} posts")
self.posts_to_insert = []
def insert_post(self, obj, session, hydrate: bool = True, flush: bool = False):
"""Insert an object into the connected database.
Parameters
----------
obj:
Instance of ORM-mapped class in the ``cisticola.base`` module to be inserted into the database
session: sqlalchemy.orm.Session
SQLAlchemy Session that interfaces with the database
hydrate: bool
If ``True``, additional data fields are extracted from the object and populated in the given database table
flush: bool
If ``True``, the object is returned with additional populated data fields (such as a primary key ID).
If ``False``, the object is added to ``posts_to_insert`` and nothing is returned
Returns
-------
None, or instance of ORM-mapped class from ``cisticola.base`` that has been inserted into the database, with additional data fields if ``flush`` argument is ``True``.
"""
if hydrate and type(obj) != Video:
obj.hydrate()
@@ -133,8 +180,23 @@ class ETLController:
return None
def insert_or_select(self, obj, session, hydrate: bool = True):
"""Inserts an object into the database or returns an existing object from the database.
Regardless, the resulting object has an `id` attribute that can be referenced later."""
"""Insert an object into the database or return an existing object from the database.
Regardless, the resulting object has an `id` attribute that can be referenced later.
Parameters
----------
obj:
Instance of ORM-mapped class in the ``cisticola.base`` module to be inserted into the database
session: sqlalchemy.orm.Session
SQLAlchemy Session that interfaces with the database
hydrate: bool
If ``True``, additional data fields are extracted from the object and populated in the given database table
Returns
-------
Object that has been inserted into the database, or existing object in the database, or None.
"""
instance = None
@@ -209,7 +271,7 @@ class ETLController:
@logger.catch(reraise=True)
def transform_results(self, results: List[ScraperResult], hydrate: bool = True):
"""Transforms raw ScraperResults objects into Post objects and
"""Transform raw ScraperResults objects into Post objects and
Media objects. Then, adds them to the database.
Parameters
@@ -254,6 +316,8 @@ class ETLController:
----------
hydrate : bool
Whether or not to fully hydrate transformed media. Default True.
min_date: datetime.datetime
Posts made before this date are not transformed.
"""
if self.session is None:
@@ -263,7 +327,6 @@ class ETLController:
session = self.session()
BATCH_SIZE = 5000
offset = 0
batch = []
logger.info(f"Fetching first untransformed post batch of {BATCH_SIZE}")
@@ -294,9 +357,15 @@ class ETLController:
).all()
@logger.catch(reraise=True)
def transform_info(self, results: List[ChannelInfo]):
"""Transform raw RawChannelInfo objects into ChannelInfo objects.
Parameters
----------
results : List[ChannelInfo]
A list of ChannelInfo objects to be transformed
"""
if self.session is None:
logger.error("No DB session")
return
@@ -325,6 +394,9 @@ class ETLController:
@logger.catch(reraise=True)
def transform_all_untransformed_info(self):
"""Transform all RawChannelInfo objects in the database that do not have an
equivalent ChannelInfo object stored.
"""
if self.session is None:
logger.error("No DB session")
return
@@ -355,15 +427,15 @@ class ETLController:
@logger.catch(reraise=True)
def transform_media(self, results: List, hydrate: bool = True):
"""Transforms raw ScraperResults objects into Post objects and
Media objects. Then, adds them to the database.
"""Transform raw ScraperResults objects into Post objects and
Media objects, then add them to the database.
Parameters
----------
results : List[ScraperResult]
A list of ScraperResult objects to be transformed
hydrate : bool
Whether or not to fully hydrate transformed media. Default True.
Whether or not to fully hydrate transformed media. Default ``True``.
"""
if self.session is None:
logger.error("No DB session")

View File

@@ -14,6 +14,8 @@ MODULEPATH = ../cisticola
SOURCEFILES = cisticola.*
MODULEFILE = modules.rst
POST_APIDOC = edit_apidoc.sh
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@@ -25,6 +27,8 @@ apidoc:
rm $(SOURCEDIR)/$(SOURCEFILES)
$(SPHINXAPIDOC) $(APIDOCFLAGS) -o "$(SOURCEDIR)" "$(MODULEPATH)"
rm $(SOURCEDIR)/$(MODULEFILE)
bash $(POST_APIDOC)
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).

17
docs/edit_apidoc.sh Normal file
View File

@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# This script makes minor changes to the *.rst files created by Sphinx apidoc.
#
# All paths are relative to the current working directory, so run it from the
# directory that contains RST_SOURCE_DIR.
#
# NOTE: `sed -i` with no suffix argument is the GNU sed form; BSD/macOS sed
# requires an explicit (possibly empty) suffix after -i.

RST_SOURCE_DIR=source

# Line appended verbatim to the end of selected .rst files.
# NOTE(review): the leading whitespace must match the option indentation used
# in the generated .rst files -- confirm against sphinx-apidoc output.
HIDE_COOKIESTRING=" :exclude-members: cookiestring"

# sed expression that rewrites the toctree depth option from 4 to 1.
REPLACE_MAXDEPTH="s/ :maxdepth: 4/ :maxdepth: 1/g"

# Remove display of ``cookiestring`` class variable, otherwise Sphinx generates docs containing the value of your cookiestring, based on your ``YOUTUBE_COOKIESTRING`` environment variable
for file in cisticola.scraper.base.rst cisticola.scraper.rumble.rst cisticola.scraper.youtube.rst
do
    target="$RST_SOURCE_DIR/$file"
    if [ ! -f "$target" ]; then
        # `>>` would silently create a stray .rst file, so skip missing targets.
        echo "edit_apidoc.sh: skipping missing file $target" >&2
        continue
    fi
    echo "$HIDE_COOKIESTRING" >> "$target"
done

# Set max depth to 1 for subpackages (only showing module files), makes it less confusing
for file in cisticola.rst cisticola.scraper.rst cisticola.transformer.rst
do
    target="$RST_SOURCE_DIR/$file"
    if [ ! -f "$target" ]; then
        echo "edit_apidoc.sh: skipping missing file $target" >&2
        continue
    fi
    sed -i "$REPLACE_MAXDEPTH" "$target"
done

View File

@@ -20,4 +20,9 @@ Cisticola has many components
The data extracted by scrapers varies by platform, but typically includes media files attached to posts.
Separating the "scraping" and "transforming" steps is useful because it ensures that no data is thrown away during the transformation. There may be some fields in the raw data that aren't included in the transformed format, but could be found to be useful in the future.
Separating the "scraping" and "transforming" steps is useful because it ensures that no data is thrown away during the transformation. There may be some fields in the raw data that aren't included in the transformed format, but could be found to be useful in the future.
TODO
- Add diagram
- Describe common workflow and steps
- Update environment variables

View File

@@ -10,7 +10,7 @@ Subpackages
-----------
.. toctree::
:maxdepth: 4
:maxdepth: 1
cisticola.scraper
cisticola.transformer
@@ -19,7 +19,7 @@ Submodules
----------
.. toctree::
:maxdepth: 4
:maxdepth: 1
cisticola.base
cisticola.utils