diff --git a/cisticola/base.py b/cisticola/base.py index d9516eb..0087d20 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -24,7 +24,7 @@ PIL.Image.MAX_IMAGE_PIXELS = 1024 * 1024 * 256 @dataclass class ScraperResult: - """A minimally processed result from a scraper + """Minimally processed set of information from a scraper about one post """ #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``. @@ -100,7 +100,7 @@ class Channel: @dataclass class RawChannelInfo: - """A minimally processed result from a scraper + """Minimally processed set of information from a scraper about one channel """ #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``. diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index a841be9..a520e0d 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -323,19 +323,43 @@ class ScraperController: self.session = None def register_scraper(self, scraper: Scraper): - """Register a single Scraper instance to the controller. + """Add a single Scraper instance to the list of available Scrapers. + + Parameters + ---------- + scraper: cisticola.scraper.Scraper + Instance of platform-specific scraper to be controlled by the ScraperController """ self.scrapers.append(scraper) - def register_scrapers(self, scraper: List[Scraper]): - """Register a list of Scraper instances to the controller. + def register_scrapers(self, scrapers: List[Scraper]): + """Add a list of Scraper instances to the list of available Scrapers. 
+ + Parameters + ---------- + scrapers: list of cisticola.scraper.Scraper + List of instances of platform-specific scrapers to be controlled by the ScraperController + """ - self.scrapers.extend(scraper) + self.scrapers.extend(scrapers) def remove_all_scrapers(self): + """Reset the ScraperController so that it doesn't control any scrapers + """ self.scrapers = [] def scrape_all_channels(self, archive_media: bool = True, fetch_old: bool = False): + """Scrape posts from all channels in the database that satisfy researcher-specified criteria + + Parameters + ---------- + archive_media: bool + If ``True``, any media files (images, video, etc.) from posts are archived. + If ``False``, media files are not archived. + fetch_old: bool + If ``True``, scrape all posts from channels, regardless of when channel was last scraped. + If ``False``, scrape only posts that are more recent than the previous scrape of each channel. + """ if self.session is None: logger.error("No DB session") return @@ -350,6 +374,8 @@ class ScraperController: return self.scrape_channels(channels, archive_media=archive_media, fetch_old=fetch_old) def scrape_all_channel_info(self): + """Scrape profile information from all channels in the database. + """ if self.session is None: logger.error("No DB session") return @@ -368,7 +394,7 @@ class ScraperController: return self.scrape_channel_info(channels) def scrape_channels(self, channels: List[Channel], archive_media: bool = True, fetch_old: bool = False): - """Scrape all posts for all specified channels. + """Scrape all posts from a specified list of channels. Parameters ---------- @@ -376,7 +402,10 @@ class ScraperController: List of Channel instances to be scraped archive_media: bool If ``True``, any media files (images, video, etc.) from posts are archived. - If ``False``, media files are not archived. + If ``False``, media files are not archived. + fetch_old: bool + If ``True``, scrape all posts from channels, regardless of when channel was last scraped. 
+ If ``False``, scrape only posts that are more recent than the previous scrape of each channel. """ if self.session is None: @@ -455,6 +484,16 @@ class ScraperController: session.close() def archive_unarchived_media_batch(self, session = None, chronological=False): + """Archive previously unarchived media URLs from a batch of raw_post rows. + + Parameters + ---------- + session: sqlalchemy.orm.Session or None + SQLAlchemy Session that interfaces with the database + chronological: bool + If ``True``, media attachments are archived starting with the oldest post + If ``False``, media attachments are archived in random order + """ if session is None: session = self.session() if chronological: @@ -489,6 +528,14 @@ class ScraperController: @logger.catch(reraise = True) def archive_unarchived_media(self, chronological=False): + """Archive previously unarchived media URLs from all raw_post rows. + + Parameters + ---------- + chronological: bool + If ``True``, media attachments are archived starting with the oldest post + If ``False``, media attachments are archived in random order + """ if self.session is None: logger.error("No DB session") return @@ -498,9 +545,6 @@ class ScraperController: while True: self.archive_unarchived_media_batch(self, session=session, chronological=chronological) - - session.close() - @logger.catch(reraise = True) def scrape_channel_info(self, channels: List[Channel]): """Scrape channel info for specified channels. @@ -509,9 +553,6 @@ class ScraperController: ---------- channels: list List of Channel instances to be scraped - archive_media: bool - If ``True``, any media files (images, video, etc.) from posts are archived. - If ``False``, media files are not archived. """ if self.session is None: @@ -551,6 +592,11 @@ class ScraperController: def connect_to_db(self, engine): """Connect the specified SQLAlchemy engine to the controller. 
+ + Parameters + ---------- + engine: sqlalchemy.engine.Engine + Instance of SQLAlchemy engine to connect to """ # create tables diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py index 7da666f..9c75cea 100644 --- a/cisticola/transformer/base.py +++ b/cisticola/transformer/base.py @@ -30,7 +30,7 @@ class Transformer: Returns ------- bool - True if it can be handled by this Transformer, false otherwise. + ``True`` if it can be handled by this Transformer, ``False`` otherwise. """ pass @@ -52,22 +52,36 @@ class Transformer: pass def transform_media(self, data: ScraperResult, transformed: Post, insert: Callable): - '''Transform media''' + """Transform a post's media attachment to standard form and insert into database. + + Parameters + ---------- + data: cisticola.base.ScraperResult + Raw post data of post that media file was attached to + transformed: cisticola.base.Post + Transformed post data of post that media file was attached to + insert: Callable + A function that either inserts the object into a database or finds an object with the + relevant unique constraints if applicable. + """ for k in data.archived_urls: if data.archived_urls[k]: archived_url = data.archived_urls[k] filename = archived_url.split('/')[-1] ext = None if '.' 
not in filename else filename.split('.')[-1].lower() - if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv': - insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform)) - elif ext == 'oga' or ext == 'mp3' or ext == "wav" or ext == 'aif' or ext == 'aiff' or ext == 'aac': - insert(Audio(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform)) - elif ext == 'jpg' or ext == 'jpeg' or ext == 'png' or ext == 'gif' or ext == 'bmp' or ext == 'heic' or ext == 'tiff': - insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform)) + media_kwargs = dict(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform) + + if ext in ('mp4', 'mov', 'avi', 'mkv'): + media_class = Video + elif ext in ('oga', 'mp3', "wav", 'aif', 'aiff', 'aac'): + media_class = Audio + elif ext in ('jpg', 'jpeg', 'png', 'gif', 'bmp', 'heic', 'tiff'): + media_class = Image else: logger.warning(f"Unknown file extension {ext}") - insert(Media(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform)) + media_class = Media + insert(media_class(**media_kwargs)) class 
ETLController: @@ -81,27 +95,35 @@ class ETLController: self.transformers = [] def register_transformer(self, transformer: Transformer): - """Adds a Transformer to the list of available Transformers. + """Add a single Transformer instance to the list of available Transformers. Parameters ---------- transformer : Transformer - The Transformer to register + Instance of platform-specific Transformer to be controlled by the ETLController """ self.transformers.append(transformer) def register_transformers(self, transformers): + """Add a list of Transformer instances to the list of available Transformers. + + Parameters + ---------- + transformers: list of Transformer + List of instances of platform-specific Transformers to be controlled by the ETLController + + """ for t in transformers: self.register_transformer(t) def connect_to_db(self, engine: Engine): - """Connects the ETLController to a SQLAlchemy engine. + """Connect the ETLController to a SQLAlchemy engine. Parameters ---------- - engine : Engine - SQLAlchemy Engine object + engine : sqlalchemy.engine.Engine + Instance of SQLAlchemy Engine object to connect to """ # create tables mapper_registry.metadata.create_all(bind=engine) @@ -111,11 +133,36 @@ class ETLController: # MAY4 can try adding some new functions for batching post inserts def flush_posts(self, session): + """Save all outstanding posts to the database. For efficiency, instead of saving posts one at a time, the ETLController maintains a list of posts (``posts_to_insert``) and saves them in bulk. + + Parameters + ---------- + session: sqlalchemy.orm.Session + SQLAlchemy Session that interfaces with the database + """ session.bulk_save_objects(self.posts_to_insert) # logger.info(f"Bulk saved {len(self.posts_to_insert)} posts") self.posts_to_insert = [] def insert_post(self, obj, session, hydrate: bool = True, flush: bool = False): + """Insert an object into the connected database. 
+ + Parameters + ---------- + obj: + Instance of ORM-mapped class in the ``cisticola.base`` module to be inserted into the database + session: sqlalchemy.orm.Session + SQLAlchemy Session that interfaces with the database + hydrate: bool + If ``True``, additional data fields are extracted from the object and populated in the given database table + flush: bool + If ``True``, the object is returned with additional populated data fields (such as a primary key ID). + If ``False``, the object is added to ``posts_to_insert`` and nothing is returned + + Returns + ------- + None, or instance of ORM-mapped class from ``cisticola.base`` that has been inserted into the database, with additional data fields if ``flush`` argument is ``True``. + """ if hydrate and type(obj) != Video: obj.hydrate() @@ -133,8 +180,23 @@ class ETLController: return None def insert_or_select(self, obj, session, hydrate: bool = True): - """Inserts an object into the database or returns an existing object from the database. - Regardless, the resulting object has an `id` attribute that can be referenced later.""" + """Insert an object into the database or return an existing object from the database. + Regardless, the resulting object has an `id` attribute that can be referenced later. + + Parameters + ---------- + obj: + Instance of ORM-mapped class in the ``cisticola.base`` module to be inserted into the database + session: sqlalchemy.orm.Session + SQLAlchemy Session that interfaces with the database + hydrate: bool + If ``True``, additional data fields are extracted from the object and populated in the given database table + + Returns + ------- + Object that has been inserted into the database, or existing object in the database, or None. 
+ + """ instance = None @@ -209,7 +271,7 @@ class ETLController: @logger.catch(reraise=True) def transform_results(self, results: List[ScraperResult], hydrate: bool = True): - """Transforms raw ScraperResults objects into Post objects and + """Transform raw ScraperResults objects into Post objects and Media objects. Then, adds them to the database. Parameters @@ -254,6 +316,8 @@ class ETLController: ---------- hydrate : bool Whether or not to fully hydrate transformed media. Default True. + min_date: datetime.datetime + Posts made before this date are not transformed. """ if self.session is None: @@ -263,7 +327,6 @@ class ETLController: session = self.session() BATCH_SIZE = 5000 - offset = 0 batch = [] logger.info(f"Fetching first untransformed post batch of {BATCH_SIZE}") @@ -294,9 +357,15 @@ class ETLController: ).all() - @logger.catch(reraise=True) def transform_info(self, results: List[ChannelInfo]): + """Transform raw RawChannelInfo objects into ChannelInfo objects. + + Parameters + ---------- + results : List[ChannelInfo] + A list of ChannelInfo objects to be transformed + """ if self.session is None: logger.error("No DB session") return @@ -325,6 +394,9 @@ class ETLController: @logger.catch(reraise=True) def transform_all_untransformed_info(self): + """Transform all RawChannelInfo objects in the database that do not have an + equivalent ChannelInfo object stored. + """ if self.session is None: logger.error("No DB session") return @@ -355,15 +427,15 @@ class ETLController: @logger.catch(reraise=True) def transform_media(self, results: List, hydrate: bool = True): - """Transforms raw ScraperResults objects into Post objects and - Media objects. Then, adds them to the database. + """Transform raw ScraperResults objects into Post objects and + Media objects, then add them to the database. Parameters ---------- results : List[ScraperResult] A list of ScraperResult objects to be transformed hydrate : bool - Whether or not to fully hydrate transformed media. 
Default True. + Whether or not to fully hydrate transformed media. Default ``True``. """ if self.session is None: logger.error("No DB session") diff --git a/docs/Makefile b/docs/Makefile index 8e2e9de..9933ba5 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -14,6 +14,8 @@ MODULEPATH = ../cisticola SOURCEFILES = cisticola.* MODULEFILE = modules.rst +POST_APIDOC = edit_apidoc.sh + # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) @@ -25,6 +27,8 @@ apidoc: rm $(SOURCEDIR)/$(SOURCEFILES) $(SPHINXAPIDOC) $(APIDOCFLAGS) -o "$(SOURCEDIR)" "$(MODULEPATH)" rm $(SOURCEDIR)/$(MODULEFILE) + bash $(POST_APIDOC) + # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). diff --git a/docs/edit_apidoc.sh b/docs/edit_apidoc.sh new file mode 100644 index 0000000..733010f --- /dev/null +++ b/docs/edit_apidoc.sh @@ -0,0 +1,17 @@ +# This script makes minor changes to the *.rst files created by Sphinx apidoc + +RST_SOURCE_DIR=source +HIDE_COOKIESTRING=" :exclude-members: cookiestring" +REPLACE_MAXDEPTH="s/ :maxdepth: 4/ :maxdepth: 1/g" + +# Remove display of ``cookiestring`` class variable, otherwise Sphinx generates docs containing the value of your cookiestring, based on your ``YOUTUBE_COOKIESTRING`` environment variable +for file in cisticola.scraper.base.rst cisticola.scraper.rumble.rst cisticola.scraper.youtube.rst +do + echo "$HIDE_COOKIESTRING" >> $RST_SOURCE_DIR/$file; +done + +# Set max depth to 1 for subpackages (only showing module files), makes it less confusing +for file in cisticola.rst cisticola.scraper.rst cisticola.transformer.rst +do + sed -i "${REPLACE_MAXDEPTH}" ${RST_SOURCE_DIR}/${file}; +done \ No newline at end of file diff --git a/docs/source/about.rst b/docs/source/about.rst index 035ddaf..a233fe0 100644 --- a/docs/source/about.rst +++ b/docs/source/about.rst @@ -20,4 +20,9 @@ 
Cisticola has many components The data extracted by scrapers varies by platform, but typically includes media files attached to posts. -Separating the "scraping" and "transforming" steps is useful because it ensures that no data is thrown away during the transormation. There may be some fields in the raw data that aren't included in the transformed format, but could be found to be useful in the future. \ No newline at end of file +Separating the "scraping" and "transforming" steps is useful because it ensures that no data is thrown away during the transformation. There may be some fields in the raw data that aren't included in the transformed format, but could be found to be useful in the future. + +TODO +- Add diagram +- Describe common workflow and steps +- Update environment variables \ No newline at end of file diff --git a/docs/source/cisticola.rst b/docs/source/cisticola.rst index 8a475e9..1054796 100644 --- a/docs/source/cisticola.rst +++ b/docs/source/cisticola.rst @@ -10,7 +10,7 @@ Subpackages ----------- .. toctree:: - :maxdepth: 4 + :maxdepth: 1 cisticola.scraper cisticola.transformer @@ -19,7 +19,7 @@ Submodules ---------- .. toctree:: - :maxdepth: 4 + :maxdepth: 1 cisticola.base cisticola.utils