added docstrings, improved Sphinx docs

2026-06-12 13:28:34 +03:00 · 2022-03-14 18:04:27 -05:00
parent 6cf3b8842d
commit e4cf9daf73
25 changed files with 700 additions and 311 deletions
--- a/cisticola/scraper/base.py
+++ b/cisticola/scraper/base.py
@@ -13,29 +13,73 @@ from cisticola.base import Channel, ScraperResult, mapper_registry
 from cisticola.scraper import make_request

 class Scraper:
+    """Base class for defining platform-specific scrapers for scraping all posts 
+    from a given channel on that specific platform. 
+    """
+
    __version__ = "Scraper 0.0.0"

    def __init__(self):
-        self.s3_client = boto3.client('s3',
-                                      region_name=os.environ['DO_SPACES_REGION'],
-                                      endpoint_url='https://{}.digitaloceanspaces.com'.format(
-                                          os.environ['DO_SPACES_REGION']),
-                                      aws_access_key_id=os.environ['DO_SPACES_KEY'],
-                                      aws_secret_access_key=os.environ['DO_SPACES_SECRET'])

+        # Initialize client to transfer files to the storage archive
+        self.s3_client = boto3.client(
+            service_name='s3',
+            region_name=os.environ['DO_SPACES_REGION'],
+            endpoint_url=f'https://{os.environ["DO_SPACES_REGION"]}.digitaloceanspaces.com',
+            aws_access_key_id=os.environ['DO_SPACES_KEY'],
+            aws_secret_access_key=os.environ['DO_SPACES_SECRET'])
+        
+        # Define request headers (necessary to bypass scraping protection 
+        # for several platform scrapers)
        self.headers = {
            'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}

-        pass
-
    def __str__(self):
        return self.__version__

    def url_to_key(self, url: str, content_type: str) -> str:
+        """Generate a unique identifier for media from a specified post.
+
+        Parameters
+        ---------
+        url: str
+            URL of original post. 
+            e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
+        content_type: str
+            Content-Type of media. 
+            e.g. ``"image/jpeg"``
+
+        Returns
+        -------
+        key: str
+            Unique identifier for the media file from a specified post based on 
+            the original post URL and the media's Content-Type. 
+        """
+
        key = urlparse(url).path.split('/')[-1]
        return key 

    def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
+        """Download media file from a specified post URL.
+
+        Parameters
+        ---------
+        url: str
+            URL of original post. 
+            e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
+        key: str or None
+            Pre-defined unique identifier for the media file.
+
+        Returns
+        -------
+        blob: bytes
+            Raw bytes of the downloaded media file. 
+        content_type: str
+            Content-Type of media. 
+            e.g. ``"image/jpeg"``.
+        key: str
+            Unique identifier for the media file.
+        """

        r = make_request(url, headers = self.headers)

@@ -48,6 +92,27 @@ class Scraper:
        return blob, content_type, key

    def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
+        """Download media file from a specified post URL, where the media file 
+        is formatted as an m3u8 playlist, which is then decoded to an mp4 file.
+
+        Parameters
+        ---------
+        url: str
+            URL of original post. 
+            e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
+        key: str or None
+            Pre-defined unique identifier for the media file.
+
+        Returns
+        -------
+        blob: bytes
+            Raw bytes of the downloaded media file. 
+        content_type: str
+            Content-Type of media. 
+            e.g. ``"video/mp4"``.
+        key: str
+            Unique identifier for the media file.
+        """
        
        content_type = 'video/mp4'
        ext = '.' + content_type.split('/')[-1]
@@ -70,6 +135,23 @@ class Scraper:
        return blob, content_type, key

    def archive_blob(self, blob: bytes, content_type: str, key: str) -> str:
+        """Upload raw bytes of a media file to the storage archive. 
+
+        Parameters
+        ----------
+        blob: bytes
+            Raw bytes of the media file to be archived.
+        content_type: str
+            Content-Type of media. 
+            e.g. ``"video/mp4"``.
+        key: str
+            Unique identifier for the media file.
+
+        Returns
+        -------
+        archived_url: str
+            URL specifying the file on the storage archive.
+        """

        filename = self.__version__.replace(' ', '_') + '/' + key

@@ -81,9 +163,37 @@ class Scraper:
        return archived_url

    def can_handle(self, channel: Channel) -> bool:
+        """Whether or not the scraper can scrape the specified channel.
+
+        Parameters
+        ----------
+        channel: Channel
+            Channel to be scraped. 
+        
+        Returns
+        -------
+        bool
+            ``True`` if the scraper is capable of scraping ``channel``,
+            ``False`` if not. 
+        """
+
        raise NotImplementedError

    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
+        """Scrape all posts from the specified Channel.
+
+        Parameters
+        ----------
+        channel: Channel
+            Channel to be scraped.
+        since: ScraperResult or None
+            Most recently scraped ScraperResult from a previous scrape, or 
+            ``None`` if scraper has not run before.
+        archive_media: bool
+            If ``True``, any media files (images, video, etc.) from posts are archived. 
+            If ``False``, media files are not archived. 
+        """
+        
        raise NotImplementedError


@@ -97,13 +207,28 @@ class ScraperController:
        self.mapper_registry = None

    def register_scraper(self, scraper: Scraper):
+        """Register a single Scraper instance to the controller.
+        """
        self.scrapers.append(scraper)

    def register_scrapers(self, scraper: List[Scraper]):
+        """Register a list of Scraper instances to the controller.
+        """
        self.scrapers.extend(scraper)
    
    @logger.catch
    def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
+        """Scrape all posts for all specified channels. 
+
+        Parameters
+        ----------
+        channels: list<Channel>
+            List of Channel instances to be scraped
+        archive_media: bool
+            If ``True``, any media files (images, video, etc.) from posts are archived. 
+            If ``False``, media files are not archived. 
+        """
+
        if self.session is None:
            logger.error("No DB session")
            return
@@ -143,15 +268,11 @@ class ScraperController:
                logger.warning(f"No handler found for Channel {channel}")

    def connect_to_db(self, engine):
+        """Connect the specified SQLAlchemy engine to the controller.
+        """
+        
        # create tables
        mapper_registry.metadata.create_all(bind=engine)

        self.session = sessionmaker()
-        self.session.configure(bind=engine)
-
-
-class ETLController:
-    """This class will transform the raw_data tables into a format more conducive to analysis."""
-
-    def __init__(self):
-        pass
+        self.session.configure(bind=engine)