added missing docstrings, created Makefile target for sphinx-apidoc, added quickstart page for installation and configuration instructions

2026-06-08 03:18:34 +03:00 · 2022-03-15 12:40:18 -05:00
parent ee9a8c10dd
commit d68d76c0ab
22 changed files with 241 additions and 48 deletions
--- a/cisticola/scraper/base.py
+++ b/cisticola/scraper/base.py
@@ -38,6 +38,24 @@ class Scraper:
    def __str__(self):
        return self.__version__

+    def get_username_from_url(self, url: str) -> str:
+        """Extract a channel's username from its URL. 
+
+        Parameters
+        ----------
+        url: str
+            URL of the channel on a given platform
+            e.g. ``"https://twitter.com/EliotHiggins"``
+        
+        Returns
+        -------
+        username: str
+            Extracted username of the channel.
+            e.g. ``"EliotHiggins"``
+        """
+        
+        raise NotImplementedError
+
    def url_to_key(self, url: str, content_type: str) -> str:
        """Generate a unique identifier for media from a specified post.

@@ -61,13 +79,13 @@ class Scraper:
        return key 

    def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
-        """Download media file from a specified post URL.
+        """Download media file from a specified media file URL.

        Parameters
        ---------
        url: str
-            URL of original post. 
-            e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
+            URL of media file from original post. 
+            e.g. ``"https://pbs.twimg.com/media/FN0j0dYWUAcQxfK?format=png&name=medium"``
        key: str or None
            Pre-defined unique identifier for the media file.

@@ -93,14 +111,14 @@ class Scraper:
        return blob, content_type, key

    def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
-        """Download media file from a specified post URL, where the media file 
+        """Download media file from a specified media URL, where the media file 
        is formatted as an m3u8 playlist, which is then decoded to an mp4 file.

        Parameters
        ---------
        url: str
-            URL of original post. 
-            e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
+            URL of m3u8 playlist file from original post. 
+            e.g. ``"https://media.gettr.com/group47/origin/2022/03/15/01/cbc436c1-1a1a-4b97-671d-c42109f3ec9b/out.m3u8"``
        key: str or None
            Pre-defined unique identifier for the media file.

@@ -136,7 +154,28 @@ class Scraper:
        return blob, content_type, key

    def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
-        
+        """Download media file from a specified media URL, using a fork of 
+        youtube-dl (yt-dlp) that enables faster downloading.
+
+        Parameters
+        ----------
+        url: str
+            URL of media file from original post. 
+            e.g. ``"https://rumble.com/embed/vgt7gh/"``
+        key: str or None
+            Pre-defined unique identifier for the media file.
+
+        Returns
+        -------
+        blob: bytes
+            Raw bytes of the downloaded media file. 
+        content_type: str
+            Content-Type of media. 
+            e.g. ``"video/mp4"``.
+        key: str
+            Unique identifier for the media file.
+        """
+
        content_type = 'video/mp4'

        with tempfile.TemporaryDirectory() as temp_dir:
@@ -225,6 +264,11 @@ class Scraper:
        archive_media: bool
            If ``True``, any media files (images, video, etc.) from posts are archived. 
            If ``False``, media files are not archived. 
+
+        Yields
+        ------
+        ScraperResult
+            Scraper result from a single post/comment from the specified Channel.
        """
        
        raise NotImplementedError
@@ -311,7 +355,7 @@ class ScraperController:
        self.session.configure(bind=self.engine)

    def reset_db(self):
-        """Drop all data from the SQLAlchemy database.
+        """Drop all data from the connected SQLAlchemy database.
        """

        mapper_registry.metadata.drop_all(bind=self.engine)
--- a/cisticola/scraper/bitchute.py
+++ b/cisticola/scraper/bitchute.py
@@ -1,4 +1,4 @@
-from datetime import datetime, timezone
+ from datetime import datetime, timezone
 import time
 import re 
 from html.parser import HTMLParser
@@ -17,7 +17,7 @@ class BitchuteScraper(Scraper):
    library"""
    __version__ = "BitchuteScraper 0.0.1"

-    def get_username_from_url(url):
+    def get_username_from_url(self, url):
        username = url.split('bitchute.com/channel/')[-1].strip('/')

        return username
@@ -33,7 +33,7 @@ class BitchuteScraper(Scraper):

        detail = 'comments'

-        username = BitchuteScraper.get_username_from_url(channel.url)
+        username = self.get_username_from_url(channel.url)
        scraper = get_videos_user(session, username, csrftoken, detail)

        for post in scraper:
@@ -61,7 +61,7 @@ class BitchuteScraper(Scraper):
                archived_urls=archived_urls)

    def can_handle(self, channel):
-        if channel.platform == "Bitchute" and BitchuteScraper.get_username_from_url(channel.url) is not None:
+        if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
            return True

 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
--- a/cisticola/scraper/gab.py
+++ b/cisticola/scraper/gab.py
@@ -11,14 +11,14 @@ class GabScraper(Scraper):
    """An implementation of a Scraper for Gab, using GARC library"""
    __version__ = "GabScraper 0.0.1"

-    def get_username_from_url(url):
+    def get_username_from_url(self, url):
        username = url.split('https://gab.com/')[-1]

        return username

    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
        client = Garc(profile = 'main')
-        username = GabScraper.get_username_from_url(channel.url)
+        username = self.get_username_from_url(channel.url)

        scraper = client.userposts(username)

@@ -52,5 +52,5 @@ class GabScraper(Scraper):
                archived_urls=archived_urls)

    def can_handle(self, channel):
-        if channel.platform == "Gab" and GabScraper.get_username_from_url(channel.url) is not None:
+        if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:
            return True
--- a/cisticola/scraper/gettr.py
+++ b/cisticola/scraper/gettr.py
@@ -12,7 +12,7 @@ class GettrScraper(Scraper):
    """An implementation of a Scraper for Gettr, using gogettr library"""
    __version__ = "GettrScraper 0.0.1"

-    def get_username_from_url(url):
+    def get_username_from_url(self, url):
        username = url.split("gettr.com/user/")[1]
        if len(username.split("/")) > 1:
            return None
@@ -21,7 +21,7 @@ class GettrScraper(Scraper):

    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
        client = PublicClient()
-        username = GettrScraper.get_username_from_url(channel.url)
+        username = self.get_username_from_url(channel.url)
        scraper = client.user_activity(username=username, type="posts")

        for post in scraper:
@@ -62,7 +62,7 @@ class GettrScraper(Scraper):
                archived_urls=archived_urls)

    def can_handle(self, channel):
-        if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None:
+        if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None:
            return True

    def url_to_key(self, url: str, content_type: str) -> str:
--- a/cisticola/scraper/instagram.py
+++ b/cisticola/scraper/instagram.py
@@ -18,6 +18,7 @@ CONTENT_TYPES = {
    'mp4' : 'video/mp4'}

 class InstagramScraper(Scraper):
+    """An implementation of a Scraper for Instagram, using instaloader library"""
    __version__ = "InstagramScraper 0.0.1"

    def get_username_from_url(self, url):
--- a/cisticola/scraper/odysee.py
+++ b/cisticola/scraper/odysee.py
@@ -13,7 +13,7 @@ class OdyseeScraper(Scraper):
    """An implementation of a Scraper for Odysee, using polyphemus library"""
    __version__ = "OdyseeScraper 0.0.1"

-    def get_username_from_url(url):
+    def get_username_from_url(self, url):

        username = url.split('odysee.com/')[-1].strip('@').split(':')[0]

@@ -21,7 +21,7 @@ class OdyseeScraper(Scraper):

    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:

-        username = OdyseeScraper.get_username_from_url(channel.url)
+        username = self.get_username_from_url(channel.url)
        odysee_channel = OdyseeChannel(channel_name = username)
        
        all_videos = odysee_channel.get_all_videos()
@@ -70,7 +70,7 @@ class OdyseeScraper(Scraper):
                    archived_urls={})

    def can_handle(self, channel):
-        if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None:
+        if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None:
            return True

    def url_to_key(self, url: str, content_type: str) -> str:
--- a/cisticola/scraper/rumble.py
+++ b/cisticola/scraper/rumble.py
@@ -14,14 +14,14 @@ class RumbleScraper(Scraper):
    """An implementation of a Scraper for Rumble, using custom functions"""
    __version__ = "RumbleScraper 0.0.1"

-    def get_username_from_url(url):
+    def get_username_from_url(self, url):
        username = url.split('https://rumble.com/c/')[1]

        return username

    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:

-        username = RumbleScraper.get_username_from_url(channel.url)
+        username = self.get_username_from_url(channel.url)
        scraper = get_channel_videos(username)

        for post in scraper:
@@ -54,7 +54,7 @@ class RumbleScraper(Scraper):
        return key 

    def can_handle(self, channel):
-        if channel.platform == "Rumble" and RumbleScraper.get_username_from_url(channel.url) is not None:
+        if channel.platform == "Rumble" and self.get_username_from_url(channel.url) is not None:
            return True

 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
--- a/cisticola/scraper/telegram_snscrape.py
+++ b/cisticola/scraper/telegram_snscrape.py
@@ -8,6 +8,7 @@ from cisticola.base import Channel, ScraperResult
 from cisticola.scraper.base import Scraper

 class TelegramSnscrapeScraper(Scraper):
+    """An implementation of a Scraper for Telegram, using snscrape library"""
    __version__ = "TelegramSnscrapeScraper 0.0.1"

    def can_handle(self, channel):
--- a/cisticola/scraper/telegram_telethon.py
+++ b/cisticola/scraper/telegram_telethon.py
@@ -14,6 +14,7 @@ from cisticola.scraper.base import Scraper
 MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']

 class TelegramTelethonScraper(Scraper):
+    """An implementation of a Scraper for Telegram, using Telethon library"""
    __version__ = "TelegramTelethonScraper 0.0.1"

    def get_username_from_url(self, url):
@@ -30,9 +31,9 @@ class TelegramTelethonScraper(Scraper):

        username = self.get_username_from_url(channel.url)

-        api_id = os.environ['TELEGRAM_API_ID_1']
-        api_hash = os.environ['TELEGRAM_API_HASH_1']
-        phone = os.environ['TELEGRAM_PHONE_1']
+        api_id = os.environ['TELEGRAM_API_ID']
+        api_hash = os.environ['TELEGRAM_API_HASH']
+        phone = os.environ['TELEGRAM_PHONE']

        with TelegramClient(phone, api_id, api_hash) as client:

--- a/docs/Makefile
+++ b/docs/Makefile
@@ -8,12 +8,24 @@ SPHINXBUILD   ?= sphinx-build
 SOURCEDIR     = source
 BUILDDIR      = build

+SPHINXAPIDOC  = sphinx-apidoc
+APIDOCFLAGS   = --separate --private --module-first
+MODULEPATH    = ../cisticola
+SOURCEFILES   = cisticola.*
+MODULEFILE    = modules.rst
+
 # Put it first so that "make" without argument is like "make help".
 help:
 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

 .PHONY: help Makefile

+# Custom process and flags for generating Sphinx sources
+apidoc:
+	rm $(SOURCEDIR)/$(SOURCEFILES)
+	$(SPHINXAPIDOC) $(APIDOCFLAGS) -o "$(SOURCEDIR)" "$(MODULEPATH)"
+	rm $(SOURCEDIR)/$(MODULEFILE)
+
 # Catch-all target: route all unknown targets to Sphinx using the new
 # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
 %: Makefile
--- a/docs/make.bat
+++ b/docs/make.bat
@@ -10,6 +10,12 @@ if "%SPHINXBUILD%" == "" (
 set SOURCEDIR=source
 set BUILDDIR=build

+set SPHINXAPIDOC=sphinx-apidoc
+set APIDOCFLAGS=--separate --private --module-first
+set MODULEPATH=../cisticola
+set SOURCEFILES=cisticola.*
+set MODULEFILE=modules.rst
+
 if "%1" == "" goto help

 %SPHINXBUILD% >NUL 2>NUL
@@ -28,6 +34,11 @@ if errorlevel 9009 (
 %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
 goto end

+:apidoc
+	del %SOURCEDIR%\%SOURCEFILES%
+	%SPHINXAPIDOC% %APIDOCFLAGS% -o %SOURCEDIR% %MODULEPATH%
+	del %SOURCEDIR%\%MODULEFILE%
+
 :help
 %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

--- a/docs/source/cisticola.rst
+++ b/docs/source/cisticola.rst
@@ -23,3 +23,4 @@ Submodules
   :maxdepth: 4

   cisticola.base
+   cisticola.utils
--- a/docs/source/cisticola.scraper.instagram.rst
+++ b/docs/source/cisticola.scraper.instagram.rst
@@ -0,0 +1,8 @@
+cisticola.scraper.instagram module
+==================================
+
+.. automodule:: cisticola.scraper.instagram
+   :members:
+   :undoc-members:
+   :show-inheritance:
+   :private-members:
--- a/docs/source/cisticola.scraper.rst
+++ b/docs/source/cisticola.scraper.rst
@@ -17,9 +17,11 @@ Submodules
   cisticola.scraper.bitchute
   cisticola.scraper.gab
   cisticola.scraper.gettr
+   cisticola.scraper.instagram
   cisticola.scraper.odysee
   cisticola.scraper.rumble
   cisticola.scraper.telegram_snscrape
   cisticola.scraper.telegram_telethon
   cisticola.scraper.twitter
-   cisticola.scraper.utils
+   cisticola.scraper.vkontakte
+   cisticola.scraper.youtube
--- a/docs/source/cisticola.scraper.utils.rst
+++ b/docs/source/cisticola.scraper.utils.rst
@@ -1,8 +0,0 @@
-cisticola.scraper.utils module
-==============================
-
-.. automodule:: cisticola.scraper.utils
-   :members:
-   :undoc-members:
-   :show-inheritance:
-   :private-members:
--- a/docs/source/cisticola.scraper.vkontakte.rst
+++ b/docs/source/cisticola.scraper.vkontakte.rst
@@ -0,0 +1,8 @@
+cisticola.scraper.vkontakte module
+==================================
+
+.. automodule:: cisticola.scraper.vkontakte
+   :members:
+   :undoc-members:
+   :show-inheritance:
+   :private-members:
--- a/docs/source/cisticola.scraper.youtube.rst
+++ b/docs/source/cisticola.scraper.youtube.rst
@@ -0,0 +1,8 @@
+cisticola.scraper.youtube module
+================================
+
+.. automodule:: cisticola.scraper.youtube
+   :members:
+   :undoc-members:
+   :show-inheritance:
+   :private-members:
--- a/docs/source/cisticola.transformer.bitchute.rst
+++ b/docs/source/cisticola.transformer.bitchute.rst
@@ -0,0 +1,8 @@
+cisticola.transformer.bitchute module
+=====================================
+
+.. automodule:: cisticola.transformer.bitchute
+   :members:
+   :undoc-members:
+   :show-inheritance:
+   :private-members:
--- a/docs/source/cisticola.transformer.rst
+++ b/docs/source/cisticola.transformer.rst
@@ -14,4 +14,5 @@ Submodules
   :maxdepth: 4

   cisticola.transformer.base
+   cisticola.transformer.bitchute
   cisticola.transformer.twitter
--- a/docs/source/cisticola.utils.rst
+++ b/docs/source/cisticola.utils.rst
@@ -0,0 +1,8 @@
+cisticola.utils module
+======================
+
+.. automodule:: cisticola.utils
+   :members:
+   :undoc-members:
+   :show-inheritance:
+   :private-members:
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -2,16 +2,7 @@ Welcome to Cisticola's documentation!
 =====================================

 .. toctree::
-   :maxdepth: 2
-   :caption: Contents:
+  :maxdepth: 1

-   cisticola
-
-
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
+  quickstart
+  cisticola
--- a/docs/source/quickstart.rst
+++ b/docs/source/quickstart.rst
@@ -0,0 +1,96 @@
+Quickstart
+==========
+
+Installation
+------------
+
+The *cisticola* application uses pipenv_ for dependency management. To install the dependencies of *cisticola*, first install pipenv using the following command:
+
+.. code-block::
+
+    pip install pipenv
+
+and then install the dependencies using the following command from the package root directory:
+
+.. code-block::
+
+    pipenv install
+
+To install the necessary dependencies for building the documentation and running unit tests, run the following command from the package root directory:
+
+.. code-block::
+
+    pipenv install --dev
+
+Environment Variables
+---------------------
+
+Three of the scrapers in *cisticola* (:py:class:`~cisticola.scraper.gab.GabScraper`, :py:class:`~cisticola.scraper.instagram.InstagramScraper`, and :py:class:`~cisticola.scraper.telegram_telethon.TelegramTelethonScraper`) require platform credentials to work correctly.
+
+Gab
+"""
+
+The Gab credentials can be configured by running the following command from the root directory:
+
+.. code-block::
+
+    pipenv run garc configure 
+
+which will direct you to provide the username and password for your Gab account.
+
+Instagram
+"""""""""
+
+The Instagram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment:
+
+- ``INSTAGRAM_USERNAME``: username of your Instagram account
+- ``INSTAGRAM_PASSWORD``: password of your Instagram account
+
+Telegram Telethon
+"""""""""""""""""
+
+The Telegram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment:
+
+- ``TELEGRAM_API_ID``: API ID number for your Telegram application
+- ``TELEGRAM_API_HASH``: API hash for your Telegram application
+- ``TELEGRAM_PHONE``: phone number for the account corresponding to your Telegram application
+
+If you do not already have a Telegram application, you can create one by following the instructions on `this page`_.
+
+Documentation
+-------------
+
+The *cisticola* application uses Sphinx_ to generate and display its documentation. To build the documentation in the HTML format, run the following command from the ``docs/`` directory:
+
+.. code-block::
+
+    pipenv run make html
+
+For developers, if changes are made to the package structure or additional modules are created, you can update the Sphinx source ``*.rst`` files by running the following command from the ``docs/`` directory:
+
+.. code-block::
+
+    pipenv run make apidoc
+
+Testing
+-------
+
+The *cisticola* application uses pytest_ for unit testing. To run the test suite, run the following command from the package root directory:
+
+.. code-block::
+
+    pipenv run pytest
+
+Examples
+--------
+
+An example of a *cisticola* ingest file ``russian_telegram_ingest.py`` is included in the package root directory, showing how the list of channels to scrape is defined, and how the :py:class:`~cisticola.scraper.base.ScraperController` and :py:class:`~cisticola.transformer.base.Transformer` classes are used. To run the ingest script, run the following command from the package root directory:
+
+.. code-block::
+
+    pipenv run python russian_telegram_ingest.py
+
+.. _pipenv: https://pipenv.pypa.io/en/latest/
+.. _Sphinx: https://www.sphinx-doc.org/en/master/
+.. _pytest: https://docs.pytest.org/en/7.1.x/
+.. _this page: https://core.telegram.org/api/obtaining_api_id