added missing docstrings, created Makefile target for sphinx-apidoc, added quickstart page for installation and configuration instructions

This commit is contained in:
Tristan Lee
2022-03-15 12:40:18 -05:00
parent ee9a8c10dd
commit d68d76c0ab
22 changed files with 241 additions and 48 deletions

View File

@@ -38,6 +38,24 @@ class Scraper:
def __str__(self):
return self.__version__
def get_username_from_url(self, url: str) -> str:
"""Extract a channel's username from its URL.
Parameters
----------
url: str
URL of the channel on a given platform
e.g. ``"https://twitter.com/EliotHiggins"``
Returns
-------
username: str
Extracted username of the channel.
e.g. ``"EliotHiggins"``
"""
raise NotImplementedError
def url_to_key(self, url: str, content_type: str) -> str:
"""Generate a unique identifier for media from a specified post.
@@ -61,13 +79,13 @@ class Scraper:
return key
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
"""Download media file from a specified post URL.
"""Download media file from a specified media file URL.
Parameters
---------
url: str
URL of original post.
e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
URL of media file from original post.
e.g. ``"https://pbs.twimg.com/media/FN0j0dYWUAcQxfK?format=png&name=medium"``
key: str or None
Pre-defined unique identifier for the media file.
@@ -93,14 +111,14 @@ class Scraper:
return blob, content_type, key
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
"""Download media file from a specified post URL, where the media file
"""Download media file from a specified media URL, where the media file
is formatted as an m3u8 playlist, which is then decoded to an mp4 file.
Parameters
---------
url: str
URL of original post.
e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
URL of m3u8 playlist file from original post.
e.g. ``"https://media.gettr.com/group47/origin/2022/03/15/01/cbc436c1-1a1a-4b97-671d-c42109f3ec9b/out.m3u8"``
key: str or None
Pre-defined unique identifier for the media file.
@@ -136,7 +154,28 @@ class Scraper:
return blob, content_type, key
def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
"""Download media file from a specified media URL, using a fork of
youtube-dl that enables faster downloading.
Parameters
---------
url: str
URL of media file from original post.
e.g. ``"https://rumble.com/embed/vgt7gh/"``
key: str or None
Pre-defined unique identifier for the media file.
Returns
-------
blob: bytes
Raw bytes of the downloaded media file.
content_type: str
Content-Type of media.
e.g. ``"video/mp4"``.
key: str
Unique identifier for the media file.
"""
content_type = 'video/mp4'
with tempfile.TemporaryDirectory() as temp_dir:
@@ -225,6 +264,11 @@ class Scraper:
archive_media: bool
If ``True``, any media files (images, video, etc.) from posts are archived.
If ``False``, media files are not archived.
Yields
------
ScraperResult
Scraper result from a single post/comment from the specified Channel.
"""
raise NotImplementedError
@@ -311,7 +355,7 @@ class ScraperController:
self.session.configure(bind=self.engine)
def reset_db(self):
"""Drop all data from the SQLAlchemy database.
"""Drop all data from the connected SQLAlchemy database.
"""
mapper_registry.metadata.drop_all(bind=self.engine)

View File

@@ -1,4 +1,4 @@
from datetime import datetime, timezone
from datetime import datetime, timezone
import time
import re
from html.parser import HTMLParser
@@ -17,7 +17,7 @@ class BitchuteScraper(Scraper):
library"""
__version__ = "BitchuteScraper 0.0.1"
def get_username_from_url(url):
def get_username_from_url(self, url):
username = url.split('bitchute.com/channel/')[-1].strip('/')
return username
@@ -33,7 +33,7 @@ class BitchuteScraper(Scraper):
detail = 'comments'
username = BitchuteScraper.get_username_from_url(channel.url)
username = self.get_username_from_url(channel.url)
scraper = get_videos_user(session, username, csrftoken, detail)
for post in scraper:
@@ -61,7 +61,7 @@ class BitchuteScraper(Scraper):
archived_urls=archived_urls)
def can_handle(self, channel):
if channel.platform == "Bitchute" and BitchuteScraper.get_username_from_url(channel.url) is not None:
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
return True
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -11,14 +11,14 @@ class GabScraper(Scraper):
"""An implementation of a Scraper for Gab, using GARC library"""
__version__ = "GabScraper 0.0.1"
def get_username_from_url(url):
def get_username_from_url(self, url):
username = url.split('https://gab.com/')[-1]
return username
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
client = Garc(profile = 'main')
username = GabScraper.get_username_from_url(channel.url)
username = self.get_username_from_url(channel.url)
scraper = client.userposts(username)
@@ -52,5 +52,5 @@ class GabScraper(Scraper):
archived_urls=archived_urls)
def can_handle(self, channel):
if channel.platform == "Gab" and GabScraper.get_username_from_url(channel.url) is not None:
if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:
return True

View File

@@ -12,7 +12,7 @@ class GettrScraper(Scraper):
"""An implementation of a Scraper for Gettr, using gogettr library"""
__version__ = "GettrScraper 0.0.1"
def get_username_from_url(url):
def get_username_from_url(self, url):
username = url.split("gettr.com/user/")[1]
if len(username.split("/")) > 1:
return None
@@ -21,7 +21,7 @@ class GettrScraper(Scraper):
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
client = PublicClient()
username = GettrScraper.get_username_from_url(channel.url)
username = self.get_username_from_url(channel.url)
scraper = client.user_activity(username=username, type="posts")
for post in scraper:
@@ -62,7 +62,7 @@ class GettrScraper(Scraper):
archived_urls=archived_urls)
def can_handle(self, channel):
if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None:
if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None:
return True
def url_to_key(self, url: str, content_type: str) -> str:

View File

@@ -18,6 +18,7 @@ CONTENT_TYPES = {
'mp4' : 'video/mp4'}
class InstagramScraper(Scraper):
"""An implementation of a Scraper for Instagram, using instaloader library"""
__version__ = "InstagramScraper 0.0.1"
def get_username_from_url(self, url):

View File

@@ -13,7 +13,7 @@ class OdyseeScraper(Scraper):
"""An implementation of a Scraper for Odysee, using polyphemus library"""
__version__ = "OdyseeScraper 0.0.1"
def get_username_from_url(url):
def get_username_from_url(self, url):
username = url.split('odysee.com/')[-1].strip('@').split(':')[0]
@@ -21,7 +21,7 @@ class OdyseeScraper(Scraper):
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = OdyseeScraper.get_username_from_url(channel.url)
username = self.get_username_from_url(channel.url)
odysee_channel = OdyseeChannel(channel_name = username)
all_videos = odysee_channel.get_all_videos()
@@ -70,7 +70,7 @@ class OdyseeScraper(Scraper):
archived_urls={})
def can_handle(self, channel):
if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None:
if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None:
return True
def url_to_key(self, url: str, content_type: str) -> str:

View File

@@ -14,14 +14,14 @@ class RumbleScraper(Scraper):
"""An implementation of a Scraper for Rumble, using custom functions"""
__version__ = "RumbleScraper 0.0.1"
def get_username_from_url(url):
def get_username_from_url(self, url):
username = url.split('https://rumble.com/c/')[1]
return username
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = RumbleScraper.get_username_from_url(channel.url)
username = self.get_username_from_url(channel.url)
scraper = get_channel_videos(username)
for post in scraper:
@@ -54,7 +54,7 @@ class RumbleScraper(Scraper):
return key
def can_handle(self, channel):
if channel.platform == "Rumble" and RumbleScraper.get_username_from_url(channel.url) is not None:
if channel.platform == "Rumble" and self.get_username_from_url(channel.url) is not None:
return True
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -8,6 +8,7 @@ from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
class TelegramSnscrapeScraper(Scraper):
"""An implementation of a Scraper for Telegram, using snscrape library"""
__version__ = "TelegramSnscrapeScraper 0.0.1"
def can_handle(self, channel):

View File

@@ -14,6 +14,7 @@ from cisticola.scraper.base import Scraper
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
class TelegramTelethonScraper(Scraper):
"""An implementation of a Scraper for Telegram, using Telethon library"""
__version__ = "TelegramTelethonScraper 0.0.1"
def get_username_from_url(self, url):
@@ -30,9 +31,9 @@ class TelegramTelethonScraper(Scraper):
username = self.get_username_from_url(channel.url)
api_id = os.environ['TELEGRAM_API_ID_1']
api_hash = os.environ['TELEGRAM_API_HASH_1']
phone = os.environ['TELEGRAM_PHONE_1']
api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH']
phone = os.environ['TELEGRAM_PHONE']
with TelegramClient(phone, api_id, api_hash) as client:

View File

@@ -8,12 +8,24 @@ SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
SPHINXAPIDOC = sphinx-apidoc
APIDOCFLAGS = --separate --private --module-first
MODULEPATH = ../cisticola
SOURCEFILES = cisticola.*
MODULEFILE = modules.rst
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Regenerate the Sphinx API sources from the package: remove the stale
# cisticola.* files, rebuild them with sphinx-apidoc, then delete the
# generated modules.rst.
# `rm -f` keeps the target working when the files do not exist yet
# (e.g. on a fresh checkout); a plain `rm` would fail and abort the recipe.
.PHONY: apidoc
apidoc:
	rm -f $(SOURCEDIR)/$(SOURCEFILES)
	$(SPHINXAPIDOC) $(APIDOCFLAGS) -o "$(SOURCEDIR)" "$(MODULEPATH)"
	rm -f $(SOURCEDIR)/$(MODULEFILE)
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile

View File

@@ -10,6 +10,12 @@ if "%SPHINXBUILD%" == "" (
set SOURCEDIR=source
set BUILDDIR=build
set SPHINXAPIDOC=sphinx-apidoc
set APIDOCFLAGS=--separate --private --module-first
set MODULEPATH=../cisticola
set SOURCEFILES=cisticola.*
set MODULEFILE=modules.rst
if "%1" == "" goto help
REM "apidoc" is not a sphinx-build builder, so route it to the custom
REM :apidoc label instead of passing it to "sphinx-build -M".
if /I "%1" == "apidoc" goto apidoc
%SPHINXBUILD% >NUL 2>NUL
@@ -28,6 +34,11 @@ if errorlevel 9009 (
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:apidoc
REM Regenerate the Sphinx API sources: remove the stale cisticola.* files,
REM rebuild them with sphinx-apidoc, then delete the generated modules.rst.
del %SOURCEDIR%\%SOURCEFILES%
%SPHINXAPIDOC% %APIDOCFLAGS% -o %SOURCEDIR% %MODULEPATH%
del %SOURCEDIR%\%MODULEFILE%
REM Jump past the :help section rather than falling through into it.
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

View File

@@ -23,3 +23,4 @@ Submodules
:maxdepth: 4
cisticola.base
cisticola.utils

View File

@@ -0,0 +1,8 @@
cisticola.scraper.instagram module
==================================
.. automodule:: cisticola.scraper.instagram
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -17,9 +17,11 @@ Submodules
cisticola.scraper.bitchute
cisticola.scraper.gab
cisticola.scraper.gettr
cisticola.scraper.instagram
cisticola.scraper.odysee
cisticola.scraper.rumble
cisticola.scraper.telegram_snscrape
cisticola.scraper.telegram_telethon
cisticola.scraper.twitter
cisticola.scraper.utils
cisticola.scraper.vkontakte
cisticola.scraper.youtube

View File

@@ -1,8 +0,0 @@
cisticola.scraper.utils module
==============================
.. automodule:: cisticola.scraper.utils
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.vkontakte module
==================================
.. automodule:: cisticola.scraper.vkontakte
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.scraper.youtube module
================================
.. automodule:: cisticola.scraper.youtube
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -0,0 +1,8 @@
cisticola.transformer.bitchute module
=====================================
.. automodule:: cisticola.transformer.bitchute
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -14,4 +14,5 @@ Submodules
:maxdepth: 4
cisticola.transformer.base
cisticola.transformer.bitchute
cisticola.transformer.twitter

View File

@@ -0,0 +1,8 @@
cisticola.utils module
======================
.. automodule:: cisticola.utils
:members:
:undoc-members:
:show-inheritance:
:private-members:

View File

@@ -2,16 +2,7 @@ Welcome to Cisticola's documentation!
=====================================
.. toctree::
:maxdepth: 2
:caption: Contents:
:maxdepth: 1
cisticola
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
quickstart
cisticola

View File

@@ -0,0 +1,96 @@
Quickstart
==========
Installation
------------
The *cisticola* application uses pipenv_ for dependency management. To install the dependencies of *cisticola*, first install pipenv using the following command:
.. code-block::
pip install pipenv
and then install the dependencies using the following command from the package root directory:
.. code-block::
pipenv install
To install the necessary dependencies for building the documentation and running unit tests, run the following command from the package root directory:
.. code-block::
pipenv install --dev
Environment Variables
---------------------
Three of the scrapers in *cisticola* (:py:class:`~cisticola.scraper.gab.GabScraper`, :py:class:`~cisticola.scraper.instagram.InstagramScraper`, and :py:class:`~cisticola.scraper.telegram_telethon.TelegramTelethonScraper`) require platform credentials to work correctly.
Gab
"""
The Gab credentials can be configured by running the following command from the package root directory:
.. code-block::
pipenv run garc configure
which will direct you to provide the username and password for your Gab account.
Instagram
"""""""""
The Instagram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment:
- ``INSTAGRAM_USERNAME``: username of your Instagram account
- ``INSTAGRAM_PASSWORD``: password of your Instagram account
Telegram Telethon
"""""""""""""""""
The Telegram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment:
- ``TELEGRAM_API_ID``: API ID number for your Telegram application
- ``TELEGRAM_API_HASH``: API hash for your Telegram application
- ``TELEGRAM_PHONE``: phone number for the account corresponding to your Telegram application
If you do not already have a Telegram application, you can create one by following the instructions on `this page`_.
Documentation
-------------
The *cisticola* application uses Sphinx_ to generate and display its documentation. To build the documentation in the HTML format, run the following command from the ``docs/`` directory:
.. code-block::
pipenv run make html
For developers, if changes are made to the package structure or additional modules are created, you can update the Sphinx source ``*.rst`` files by running the following command from the ``docs/`` directory:
.. code-block::
pipenv run make apidoc
Testing
-------
The *cisticola* application uses pytest_ for unit testing. To run the test suite, run the following command from the package root directory:
.. code-block::
pipenv run pytest
Examples
--------
An example of a *cisticola* ingest file, ``russian_telegram_ingest.py``, is included in the package root directory, showing how the list of channels to scrape is defined, and how the :py:class:`~cisticola.scraper.base.ScraperController` and :py:class:`~cisticola.transformer.base.Transformer` classes are used. To run the ingest script, run the following command from the package root directory:
.. code-block::
pipenv run python russian_telegram_ingest.py
.. _pipenv: https://pipenv.pypa.io/en/latest/
.. _Sphinx: https://www.sphinx-doc.org/en/master/
.. _pytest: https://docs.pytest.org/en/7.1.x/
.. _this page: https://core.telegram.org/api/obtaining_api_id