diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index a2f921f..28dbe76 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -38,6 +38,24 @@ class Scraper: def __str__(self): return self.__version__ + def get_username_from_url(self, url: str) -> str: + """Extract a channel's username from its URL. + + Parameters + ---------- + url: str + URL of the channel on a given platform + e.g. ``"https://twitter.com/EliotHiggins"`` + + Returns + ------- + username: str + Extracted username of the channel. + e.g. ``"EliotHiggins"`` + """ + + raise NotImplementedError + def url_to_key(self, url: str, content_type: str) -> str: """Generate a unique identifier for media from a specified post. @@ -61,13 +79,13 @@ class Scraper: return key def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: - """Download media file from a specified post URL. + """Download media file from a specified media file URL. Parameters --------- url: str - URL of original post. - e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"`` + URL of media file from original post. + e.g. ``"https://pbs.twimg.com/media/FN0j0dYWUAcQxfK?format=png&name=medium"`` key: str or None Pre-defined unique identifier for the media file. @@ -93,14 +111,14 @@ class Scraper: return blob, content_type, key def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: - """Download media file from a specified post URL, where the media file + """Download media file from a specified media URL, where the media file is formatted as an m3u8 playlist, which is then decoded to an mp4 file. Parameters --------- url: str - URL of original post. - e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"`` + URL of m3u8 playlist file from original post. + e.g. ``"https://media.gettr.com/group47/origin/2022/03/15/01/cbc436c1-1a1a-4b97-671d-c42109f3ec9b/out.m3u8"`` key: str or None Pre-defined unique identifier for the media file. 
@@ -136,7 +154,28 @@ class Scraper: return blob, content_type, key def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: - + """Download media file from a specified media URL, using a fork of + youtube-dl that enables faster downloading. + + Parameters + --------- + url: str + URL of media file from original post. + e.g. ``"https://rumble.com/embed/vgt7gh/"`` + key: str or None + Pre-defined unique identifier for the media file. + + Returns + ------- + blob: bytes + Raw bytes of the downloaded media file. + content_type: str + Content-Type of media. + e.g. ``"video/mp4"``. + key: str + Unique identifier for the media file. + """ + content_type = 'video/mp4' with tempfile.TemporaryDirectory() as temp_dir: @@ -225,6 +264,11 @@ class Scraper: archive_media: bool If ``True``, any media files (images, video, etc.) from posts are archived. If ``False``, media files are not archived. + + Yields + ------ + ScraperResult + Scraper result from a single post/comment from the specified Channel. """ raise NotImplementedError @@ -311,7 +355,7 @@ class ScraperController: self.session.configure(bind=self.engine) def reset_db(self): - """Drop all data from the SQLAlchemy database. + """Drop all data from the connected SQLAlchemy database. 
""" mapper_registry.metadata.drop_all(bind=self.engine) diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index 8a365f4..47a822e 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -1,4 +1,4 @@ -from datetime import datetime, timezone + from datetime import datetime, timezone import time import re from html.parser import HTMLParser @@ -17,7 +17,7 @@ class BitchuteScraper(Scraper): library""" __version__ = "BitchuteScraper 0.0.1" - def get_username_from_url(url): + def get_username_from_url(self, url): username = url.split('bitchute.com/channel/')[-1].strip('/') return username @@ -33,7 +33,7 @@ class BitchuteScraper(Scraper): detail = 'comments' - username = BitchuteScraper.get_username_from_url(channel.url) + username = self.get_username_from_url(channel.url) scraper = get_videos_user(session, username, csrftoken, detail) for post in scraper: @@ -61,7 +61,7 @@ class BitchuteScraper(Scraper): archived_urls=archived_urls) def can_handle(self, channel): - if channel.platform == "Bitchute" and BitchuteScraper.get_username_from_url(channel.url) is not None: + if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None: return True #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index 910ebc2..f90f2a3 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -11,14 +11,14 @@ class GabScraper(Scraper): """An implementation of a Scraper for Gab, using GARC library""" __version__ = "GabScraper 0.0.1" - def get_username_from_url(url): + def get_username_from_url(self, url): username = url.split('https://gab.com/')[-1] return username def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: client = Garc(profile = 'main') - username = GabScraper.get_username_from_url(channel.url) + username = 
self.get_username_from_url(channel.url) scraper = client.userposts(username) @@ -52,5 +52,5 @@ class GabScraper(Scraper): archived_urls=archived_urls) def can_handle(self, channel): - if channel.platform == "Gab" and GabScraper.get_username_from_url(channel.url) is not None: + if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None: return True \ No newline at end of file diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index 3cd069e..4fb15cc 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -12,7 +12,7 @@ class GettrScraper(Scraper): """An implementation of a Scraper for Gettr, using gogettr library""" __version__ = "GettrScraper 0.0.1" - def get_username_from_url(url): + def get_username_from_url(self, url): username = url.split("gettr.com/user/")[1] if len(username.split("/")) > 1: return None @@ -21,7 +21,7 @@ class GettrScraper(Scraper): def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: client = PublicClient() - username = GettrScraper.get_username_from_url(channel.url) + username = self.get_username_from_url(channel.url) scraper = client.user_activity(username=username, type="posts") for post in scraper: @@ -62,7 +62,7 @@ class GettrScraper(Scraper): archived_urls=archived_urls) def can_handle(self, channel): - if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None: + if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None: return True def url_to_key(self, url: str, content_type: str) -> str: diff --git a/cisticola/scraper/instagram.py b/cisticola/scraper/instagram.py index eb20ecb..f9ae76e 100644 --- a/cisticola/scraper/instagram.py +++ b/cisticola/scraper/instagram.py @@ -18,6 +18,7 @@ CONTENT_TYPES = { 'mp4' : 'video/mp4'} class InstagramScraper(Scraper): + """An implementation of a Scraper for Instagram, using instaloader 
library""" __version__ = "InstagramScraper 0.0.1" def get_username_from_url(self, url): diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index 61ed9ca..eb7ec04 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -13,7 +13,7 @@ class OdyseeScraper(Scraper): """An implementation of a Scraper for Odysee, using polyphemus library""" __version__ = "OdyseeScraper 0.0.1" - def get_username_from_url(url): + def get_username_from_url(self, url): username = url.split('odysee.com/')[-1].strip('@').split(':')[0] @@ -21,7 +21,7 @@ class OdyseeScraper(Scraper): def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: - username = OdyseeScraper.get_username_from_url(channel.url) + username = self.get_username_from_url(channel.url) odysee_channel = OdyseeChannel(channel_name = username) all_videos = odysee_channel.get_all_videos() @@ -70,7 +70,7 @@ class OdyseeScraper(Scraper): archived_urls={}) def can_handle(self, channel): - if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None: + if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None: return True def url_to_key(self, url: str, content_type: str) -> str: diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index 8546d6e..9863fb0 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -14,14 +14,14 @@ class RumbleScraper(Scraper): """An implementation of a Scraper for Rumble, using custom functions""" __version__ = "RumbleScraper 0.0.1" - def get_username_from_url(url): + def get_username_from_url(self, url): username = url.split('https://rumble.com/c/')[1] return username def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: - username = RumbleScraper.get_username_from_url(channel.url) + username = 
self.get_username_from_url(channel.url) scraper = get_channel_videos(username) for post in scraper: @@ -54,7 +54,7 @@ class RumbleScraper(Scraper): return key def can_handle(self, channel): - if channel.platform == "Rumble" and RumbleScraper.get_username_from_url(channel.url) is not None: + if channel.platform == "Rumble" and self.get_username_from_url(channel.url) is not None: return True #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# diff --git a/cisticola/scraper/telegram_snscrape.py b/cisticola/scraper/telegram_snscrape.py index 3f3f45d..ec5b292 100644 --- a/cisticola/scraper/telegram_snscrape.py +++ b/cisticola/scraper/telegram_snscrape.py @@ -8,6 +8,7 @@ from cisticola.base import Channel, ScraperResult from cisticola.scraper.base import Scraper class TelegramSnscrapeScraper(Scraper): + """An implementation of a Scraper for Telegram, using snscrape library""" __version__ = "TelegramSnscrapeScraper 0.0.1" def can_handle(self, channel): diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py index 76d68f2..b8231bc 100644 --- a/cisticola/scraper/telegram_telethon.py +++ b/cisticola/scraper/telegram_telethon.py @@ -14,6 +14,7 @@ from cisticola.scraper.base import Scraper MEDIA_TYPES = ['photo', 'video', 'document', 'webpage'] class TelegramTelethonScraper(Scraper): + """An implementation of a Scraper for Telegram, using Telethon library""" __version__ = "TelegramTelethonScraper 0.0.1" def get_username_from_url(self, url): @@ -30,9 +31,9 @@ class TelegramTelethonScraper(Scraper): username = self.get_username_from_url(channel.url) - api_id = os.environ['TELEGRAM_API_ID_1'] - api_hash = os.environ['TELEGRAM_API_HASH_1'] - phone = os.environ['TELEGRAM_PHONE_1'] + api_id = os.environ['TELEGRAM_API_ID'] + api_hash = os.environ['TELEGRAM_API_HASH'] + phone = os.environ['TELEGRAM_PHONE'] with TelegramClient(phone, api_id, api_hash) as client: diff --git a/docs/Makefile b/docs/Makefile index 
d0c3cbf..ab3e9be 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -8,12 +8,24 @@ SPHINXBUILD ?= sphinx-build SOURCEDIR = source BUILDDIR = build +SPHINXAPIDOC = sphinx-apidoc +APIDOCFLAGS = --separate --private --module-first +MODULEPATH = ../cisticola +SOURCEFILES = cisticola.* +MODULEFILE = modules.rst + # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile +# Custom process and flags for generating Sphinx sources +apidoc: + rm $(SOURCEDIR)/$(SOURCEFILES) + $(SPHINXAPIDOC) $(APIDOCFLAGS) -o "$(SOURCEDIR)" "$(MODULEPATH)" + rm $(SOURCEDIR)/$(MODULEFILE) + # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile diff --git a/docs/make.bat b/docs/make.bat index 6fcf05b..3ab2ef7 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -10,6 +10,12 @@ if "%SPHINXBUILD%" == "" ( set SOURCEDIR=source set BUILDDIR=build +set SPHINXAPIDOC=sphinx-apidoc +set APIDOCFLAGS=--separate --private --module-first +set MODULEPATH=../cisticola +set SOURCEFILES=cisticola.* +set MODULEFILE=modules.rst + if "%1" == "" goto help %SPHINXBUILD% >NUL 2>NUL @@ -28,6 +34,11 @@ if errorlevel 9009 ( %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% goto end +:apidoc + del %SOURCEDIR%\%SOURCEFILES% + %SPHINXAPIDOC% %APIDOCFLAGS% -o %SOURCEDIR% %MODULEPATH% + del %SOURCEDIR%\%MODULEFILE% + :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% diff --git a/docs/source/cisticola.rst b/docs/source/cisticola.rst index 6857abd..22cdf67 100644 --- a/docs/source/cisticola.rst +++ b/docs/source/cisticola.rst @@ -23,3 +23,4 @@ Submodules :maxdepth: 4 cisticola.base + cisticola.utils diff --git a/docs/source/cisticola.scraper.instagram.rst b/docs/source/cisticola.scraper.instagram.rst new file mode 100644 index 0000000..53ddc43 --- /dev/null +++ 
b/docs/source/cisticola.scraper.instagram.rst @@ -0,0 +1,8 @@ +cisticola.scraper.instagram module +================================== + +.. automodule:: cisticola.scraper.instagram + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.scraper.rst b/docs/source/cisticola.scraper.rst index 5e3d9a1..b93592c 100644 --- a/docs/source/cisticola.scraper.rst +++ b/docs/source/cisticola.scraper.rst @@ -17,9 +17,11 @@ Submodules cisticola.scraper.bitchute cisticola.scraper.gab cisticola.scraper.gettr + cisticola.scraper.instagram cisticola.scraper.odysee cisticola.scraper.rumble cisticola.scraper.telegram_snscrape cisticola.scraper.telegram_telethon cisticola.scraper.twitter - cisticola.scraper.utils + cisticola.scraper.vkontakte + cisticola.scraper.youtube diff --git a/docs/source/cisticola.scraper.utils.rst b/docs/source/cisticola.scraper.utils.rst deleted file mode 100644 index ceefb4d..0000000 --- a/docs/source/cisticola.scraper.utils.rst +++ /dev/null @@ -1,8 +0,0 @@ -cisticola.scraper.utils module -============================== - -.. automodule:: cisticola.scraper.utils - :members: - :undoc-members: - :show-inheritance: - :private-members: diff --git a/docs/source/cisticola.scraper.vkontakte.rst b/docs/source/cisticola.scraper.vkontakte.rst new file mode 100644 index 0000000..405d70d --- /dev/null +++ b/docs/source/cisticola.scraper.vkontakte.rst @@ -0,0 +1,8 @@ +cisticola.scraper.vkontakte module +================================== + +.. automodule:: cisticola.scraper.vkontakte + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.scraper.youtube.rst b/docs/source/cisticola.scraper.youtube.rst new file mode 100644 index 0000000..e990195 --- /dev/null +++ b/docs/source/cisticola.scraper.youtube.rst @@ -0,0 +1,8 @@ +cisticola.scraper.youtube module +================================ + +.. 
automodule:: cisticola.scraper.youtube + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.transformer.bitchute.rst b/docs/source/cisticola.transformer.bitchute.rst new file mode 100644 index 0000000..7427e9f --- /dev/null +++ b/docs/source/cisticola.transformer.bitchute.rst @@ -0,0 +1,8 @@ +cisticola.transformer.bitchute module +===================================== + +.. automodule:: cisticola.transformer.bitchute + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/cisticola.transformer.rst b/docs/source/cisticola.transformer.rst index 218e1ec..358d955 100644 --- a/docs/source/cisticola.transformer.rst +++ b/docs/source/cisticola.transformer.rst @@ -14,4 +14,5 @@ Submodules :maxdepth: 4 cisticola.transformer.base + cisticola.transformer.bitchute cisticola.transformer.twitter diff --git a/docs/source/cisticola.utils.rst b/docs/source/cisticola.utils.rst new file mode 100644 index 0000000..6e5872a --- /dev/null +++ b/docs/source/cisticola.utils.rst @@ -0,0 +1,8 @@ +cisticola.utils module +====================== + +.. automodule:: cisticola.utils + :members: + :undoc-members: + :show-inheritance: + :private-members: diff --git a/docs/source/index.rst b/docs/source/index.rst index e3f70a9..3c12d81 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -2,16 +2,7 @@ Welcome to Cisticola's documentation! ===================================== .. 
toctree:: - :maxdepth: 2 - :caption: Contents: + :maxdepth: 1 - cisticola - - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` + quickstart + cisticola \ No newline at end of file diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst new file mode 100644 index 0000000..a6c5643 --- /dev/null +++ b/docs/source/quickstart.rst @@ -0,0 +1,96 @@ +Quickstart +========== + +Installation +------------ + +The *cisticola* application uses pipenv_ for dependency management. To install the dependencies of *cisticola*, first install pipenv using the following command: + +.. code-block:: + + pip install pipenv + +and then install the dependencies using the following command from the package root directory: + +.. code-block:: + + pipenv install + +To install the necessary dependencies for building the documentation and running unit tests, run the following command from the package root directory: + +.. code-block:: + + pipenv install --dev + +Environment Variables +--------------------- + +Three of the scrapers in *cisticola* (:py:mod:`~cisticola.scraper.gab.GabScraper`, :py:mod:`~cisticola.scraper.instagram.InstagramScraper`, and :py:mod:`~cisticola.scraper.telegram_telethon.TelegramTelethonScraper`) require platform credentials to work correctly. + +Gab +""" + +The Gab credentials can be configured by running the following command from the root directory: + +.. code-block:: + + pipenv run garc configure + +which will direct you to provide the username and password for your Gab account. 
+ +Instagram +""""""""" + +The Instagram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment: + +- ``INSTAGRAM_USERNAME``: username of your Instagram account +- ``INSTAGRAM_PASSWORD``: password of your Instagram account + +Telegram Telethon +""""""""""""""""" + +The Telegram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment: + +- ``TELEGRAM_API_ID``: API ID number for your Telegram application +- ``TELEGRAM_API_HASH``: API hash for your Telegram application +- ``TELEGRAM_PHONE``: phone number for the account corresponding to your Telegram application + +If you do not already have a Telegram application, you can create one by following the instructions on `this page`_. + +Documentation +------------- + +The *cisticola* application uses Sphinx_ to generate and display its documentation. To build the documentation in the HTML format, run the following command from the ``docs/`` directory: + +.. code-block:: + + pipenv run make html + +For developers, if changes are made to the package structure or additional modules are created, you can update the Sphinx source ``*.rst`` files by running the following command from the ``docs/`` directory: + +.. code-block:: + + pipenv run make apidoc + +Testing +------- + +The *cisticola* application uses pytest_ for unit testing. To run the test suite, run the following command from the package root directory: + +.. code-block:: + + pipenv run pytest + +Examples +-------- + +An example of a *cisticola* ingest file ``russian_telegram_ingest.py`` is included in the package root directory, showing how the list of channels to scrape is defined, and how the :py:mod:`~cisticola.scraper.base.ScraperController` and :py:mod:`~cisticola.transformer.base.Transformer` classes are used. 
To run the ingest script, run the following command from the package root directory: + +.. code-block:: + + pipenv run python russian_telegram_ingest.py + +.. _pipenv: https://pipenv.pypa.io/en/latest/ +.. _Sphinx: https://www.sphinx-doc.org/en/master/ +.. _pytest: https://docs.pytest.org/en/7.1.x/ +.. _this page: https://core.telegram.org/api/obtaining_api_id \ No newline at end of file