mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
added missing docstrings, created Makefile target for sphinx-apidoc, added quickstart page for installation and configuration instructions
This commit is contained in:
@@ -38,6 +38,24 @@ class Scraper:
|
||||
def __str__(self):
|
||||
return self.__version__
|
||||
|
||||
def get_username_from_url(self, url: str) -> str:
|
||||
"""Extract a channel's username from its URL.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str
|
||||
URL of the channel on a given platform
|
||||
e.g. ``"https://twitter.com/EliotHiggins"``
|
||||
|
||||
Returns
|
||||
-------
|
||||
username: str
|
||||
Extracted username of the channel.
|
||||
e.g. ``"EliotHiggins"``
|
||||
"""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
|
||||
"""Generate a unique identifier for media from a specified post.
|
||||
|
||||
@@ -61,13 +79,13 @@ class Scraper:
|
||||
return key
|
||||
|
||||
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||
"""Download media file from a specified post URL.
|
||||
"""Download media file from a specified media file URL.
|
||||
|
||||
Parameters
|
||||
---------
|
||||
url: str
|
||||
URL of original post.
|
||||
e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
|
||||
URL of media file from original post.
|
||||
e.g. ``"https://pbs.twimg.com/media/FN0j0dYWUAcQxfK?format=png&name=medium"``
|
||||
key: str or None
|
||||
Pre-defined unique identifier for the media file.
|
||||
|
||||
@@ -93,14 +111,14 @@ class Scraper:
|
||||
return blob, content_type, key
|
||||
|
||||
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||
"""Download media file from a specified post URL, where the media file
|
||||
"""Download media file from a specified media URL, where the media file
|
||||
is formatted as an m3u8 playlist, which is then decoded to an mp4 file.
|
||||
|
||||
Parameters
|
||||
---------
|
||||
url: str
|
||||
URL of original post.
|
||||
e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
|
||||
URL of m3u8 playlist file from original post.
|
||||
e.g. ``"https://media.gettr.com/group47/origin/2022/03/15/01/cbc436c1-1a1a-4b97-671d-c42109f3ec9b/out.m3u8"``
|
||||
key: str or None
|
||||
Pre-defined unique identifier for the media file.
|
||||
|
||||
@@ -136,7 +154,28 @@ class Scraper:
|
||||
return blob, content_type, key
|
||||
|
||||
def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||
|
||||
"""Download media file from a specified media URL, using a fork of
|
||||
youtube-dl that enables faster downloading.
|
||||
|
||||
Parameters
|
||||
---------
|
||||
url: str
|
||||
URL of media file from original post.
|
||||
e.g. ``"https://rumble.com/embed/vgt7gh/"``
|
||||
key: str or None
|
||||
Pre-defined unique identifier for the media file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
blob: bytes
|
||||
Raw bytes of the downloaded media file.
|
||||
content_type: str
|
||||
Content-Type of media.
|
||||
e.g. ``"video/mp4"``.
|
||||
key: str
|
||||
Unique identifier for the media file.
|
||||
"""
|
||||
|
||||
content_type = 'video/mp4'
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
@@ -225,6 +264,11 @@ class Scraper:
|
||||
archive_media: bool
|
||||
If ``True``, any media files (images, video, etc.) from posts are archived.
|
||||
If ``False``, media files are not archived.
|
||||
|
||||
Yields
|
||||
------
|
||||
ScraperResult
|
||||
Scraper result from a single post/comment from the specified Channel.
|
||||
"""
|
||||
|
||||
raise NotImplementedError
|
||||
@@ -311,7 +355,7 @@ class ScraperController:
|
||||
self.session.configure(bind=self.engine)
|
||||
|
||||
def reset_db(self):
|
||||
"""Drop all data from the SQLAlchemy database.
|
||||
"""Drop all data from the connected SQLAlchemy database.
|
||||
"""
|
||||
|
||||
mapper_registry.metadata.drop_all(bind=self.engine)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from datetime import datetime, timezone
|
||||
from datetime import datetime, timezone
|
||||
import time
|
||||
import re
|
||||
from html.parser import HTMLParser
|
||||
@@ -17,7 +17,7 @@ class BitchuteScraper(Scraper):
|
||||
library"""
|
||||
__version__ = "BitchuteScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(url):
|
||||
def get_username_from_url(self, url):
|
||||
username = url.split('bitchute.com/channel/')[-1].strip('/')
|
||||
|
||||
return username
|
||||
@@ -33,7 +33,7 @@ class BitchuteScraper(Scraper):
|
||||
|
||||
detail = 'comments'
|
||||
|
||||
username = BitchuteScraper.get_username_from_url(channel.url)
|
||||
username = self.get_username_from_url(channel.url)
|
||||
scraper = get_videos_user(session, username, csrftoken, detail)
|
||||
|
||||
for post in scraper:
|
||||
@@ -61,7 +61,7 @@ class BitchuteScraper(Scraper):
|
||||
archived_urls=archived_urls)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Bitchute" and BitchuteScraper.get_username_from_url(channel.url) is not None:
|
||||
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
@@ -11,14 +11,14 @@ class GabScraper(Scraper):
|
||||
"""An implementation of a Scraper for Gab, using GARC library"""
|
||||
__version__ = "GabScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(url):
|
||||
def get_username_from_url(self, url):
|
||||
username = url.split('https://gab.com/')[-1]
|
||||
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
client = Garc(profile = 'main')
|
||||
username = GabScraper.get_username_from_url(channel.url)
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
scraper = client.userposts(username)
|
||||
|
||||
@@ -52,5 +52,5 @@ class GabScraper(Scraper):
|
||||
archived_urls=archived_urls)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Gab" and GabScraper.get_username_from_url(channel.url) is not None:
|
||||
if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
@@ -12,7 +12,7 @@ class GettrScraper(Scraper):
|
||||
"""An implementation of a Scraper for Gettr, using gogettr library"""
|
||||
__version__ = "GettrScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(url):
|
||||
def get_username_from_url(self, url):
|
||||
username = url.split("gettr.com/user/")[1]
|
||||
if len(username.split("/")) > 1:
|
||||
return None
|
||||
@@ -21,7 +21,7 @@ class GettrScraper(Scraper):
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
client = PublicClient()
|
||||
username = GettrScraper.get_username_from_url(channel.url)
|
||||
username = self.get_username_from_url(channel.url)
|
||||
scraper = client.user_activity(username=username, type="posts")
|
||||
|
||||
for post in scraper:
|
||||
@@ -62,7 +62,7 @@ class GettrScraper(Scraper):
|
||||
archived_urls=archived_urls)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None:
|
||||
if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
|
||||
|
||||
@@ -18,6 +18,7 @@ CONTENT_TYPES = {
|
||||
'mp4' : 'video/mp4'}
|
||||
|
||||
class InstagramScraper(Scraper):
|
||||
"""An implementation of a Scraper for Instagram, using instaloader library"""
|
||||
__version__ = "InstagramScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(self, url):
|
||||
|
||||
@@ -13,7 +13,7 @@ class OdyseeScraper(Scraper):
|
||||
"""An implementation of a Scraper for Odysee, using polyphemus library"""
|
||||
__version__ = "OdyseeScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(url):
|
||||
def get_username_from_url(self, url):
|
||||
|
||||
username = url.split('odysee.com/')[-1].strip('@').split(':')[0]
|
||||
|
||||
@@ -21,7 +21,7 @@ class OdyseeScraper(Scraper):
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
username = OdyseeScraper.get_username_from_url(channel.url)
|
||||
username = self.get_username_from_url(channel.url)
|
||||
odysee_channel = OdyseeChannel(channel_name = username)
|
||||
|
||||
all_videos = odysee_channel.get_all_videos()
|
||||
@@ -70,7 +70,7 @@ class OdyseeScraper(Scraper):
|
||||
archived_urls={})
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Odysee" and OdyseeScraper.get_username_from_url(channel.url) is not None:
|
||||
if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
|
||||
|
||||
@@ -14,14 +14,14 @@ class RumbleScraper(Scraper):
|
||||
"""An implementation of a Scraper for Rumble, using custom functions"""
|
||||
__version__ = "RumbleScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(url):
|
||||
def get_username_from_url(self, url):
|
||||
username = url.split('https://rumble.com/c/')[1]
|
||||
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
username = RumbleScraper.get_username_from_url(channel.url)
|
||||
username = self.get_username_from_url(channel.url)
|
||||
scraper = get_channel_videos(username)
|
||||
|
||||
for post in scraper:
|
||||
@@ -54,7 +54,7 @@ class RumbleScraper(Scraper):
|
||||
return key
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Rumble" and RumbleScraper.get_username_from_url(channel.url) is not None:
|
||||
if channel.platform == "Rumble" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
@@ -8,6 +8,7 @@ from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class TelegramSnscrapeScraper(Scraper):
|
||||
"""An implementation of a Scraper for Telegram, using snscrape library"""
|
||||
__version__ = "TelegramSnscrapeScraper 0.0.1"
|
||||
|
||||
def can_handle(self, channel):
|
||||
|
||||
@@ -14,6 +14,7 @@ from cisticola.scraper.base import Scraper
|
||||
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
|
||||
|
||||
class TelegramTelethonScraper(Scraper):
|
||||
"""An implementation of a Scraper for Telegram, using Telethon library"""
|
||||
__version__ = "TelegramTelethonScraper 0.0.1"
|
||||
|
||||
def get_username_from_url(self, url):
|
||||
@@ -30,9 +31,9 @@ class TelegramTelethonScraper(Scraper):
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
api_id = os.environ['TELEGRAM_API_ID_1']
|
||||
api_hash = os.environ['TELEGRAM_API_HASH_1']
|
||||
phone = os.environ['TELEGRAM_PHONE_1']
|
||||
api_id = os.environ['TELEGRAM_API_ID']
|
||||
api_hash = os.environ['TELEGRAM_API_HASH']
|
||||
phone = os.environ['TELEGRAM_PHONE']
|
||||
|
||||
with TelegramClient(phone, api_id, api_hash) as client:
|
||||
|
||||
|
||||
@@ -8,12 +8,24 @@ SPHINXBUILD ?= sphinx-build
|
||||
SOURCEDIR = source
|
||||
BUILDDIR = build
|
||||
|
||||
SPHINXAPIDOC = sphinx-apidoc
|
||||
APIDOCFLAGS = --separate --private --module-first
|
||||
MODULEPATH = ../cisticola
|
||||
SOURCEFILES = cisticola.*
|
||||
MODULEFILE = modules.rst
|
||||
|
||||
# Put it first so that "make" without argument is like "make help".
|
||||
help:
|
||||
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||
|
||||
.PHONY: help Makefile
|
||||
|
||||
# Custom process and flags for generating Sphinx sources
|
||||
apidoc:
|
||||
rm $(SOURCEDIR)/$(SOURCEFILES)
|
||||
$(SPHINXAPIDOC) $(APIDOCFLAGS) -o "$(SOURCEDIR)" "$(MODULEPATH)"
|
||||
rm $(SOURCEDIR)/$(MODULEFILE)
|
||||
|
||||
# Catch-all target: route all unknown targets to Sphinx using the new
|
||||
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
||||
%: Makefile
|
||||
|
||||
@@ -10,6 +10,12 @@ if "%SPHINXBUILD%" == "" (
|
||||
set SOURCEDIR=source
|
||||
set BUILDDIR=build
|
||||
|
||||
set SPHINXAPIDOC=sphinx-apidoc
|
||||
set APIDOCFLAGS=--separate --private --module-first
|
||||
set MODULEPATH=../cisticola
|
||||
set SOURCEFILES=cisticola.*
|
||||
set MODULEFILE=modules.rst
|
||||
|
||||
if "%1" == "" goto help
|
||||
|
||||
%SPHINXBUILD% >NUL 2>NUL
|
||||
@@ -28,6 +34,11 @@ if errorlevel 9009 (
|
||||
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||
goto end
|
||||
|
||||
:apidoc
|
||||
del %SOURCEDIR%\%SOURCEFILES%
|
||||
%SPHINXAPIDOC% %APIDOCFLAGS% -o %SOURCEDIR% %MODULEPATH%
|
||||
del %SOURCEDIR%\%MODULEFILE%
|
||||
|
||||
:help
|
||||
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||
|
||||
|
||||
@@ -23,3 +23,4 @@ Submodules
|
||||
:maxdepth: 4
|
||||
|
||||
cisticola.base
|
||||
cisticola.utils
|
||||
|
||||
8
docs/source/cisticola.scraper.instagram.rst
Normal file
8
docs/source/cisticola.scraper.instagram.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.scraper.instagram module
|
||||
==================================
|
||||
|
||||
.. automodule:: cisticola.scraper.instagram
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
@@ -17,9 +17,11 @@ Submodules
|
||||
cisticola.scraper.bitchute
|
||||
cisticola.scraper.gab
|
||||
cisticola.scraper.gettr
|
||||
cisticola.scraper.instagram
|
||||
cisticola.scraper.odysee
|
||||
cisticola.scraper.rumble
|
||||
cisticola.scraper.telegram_snscrape
|
||||
cisticola.scraper.telegram_telethon
|
||||
cisticola.scraper.twitter
|
||||
cisticola.scraper.utils
|
||||
cisticola.scraper.vkontakte
|
||||
cisticola.scraper.youtube
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
cisticola.scraper.utils module
|
||||
==============================
|
||||
|
||||
.. automodule:: cisticola.scraper.utils
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
8
docs/source/cisticola.scraper.vkontakte.rst
Normal file
8
docs/source/cisticola.scraper.vkontakte.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.scraper.vkontakte module
|
||||
==================================
|
||||
|
||||
.. automodule:: cisticola.scraper.vkontakte
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
8
docs/source/cisticola.scraper.youtube.rst
Normal file
8
docs/source/cisticola.scraper.youtube.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.scraper.youtube module
|
||||
================================
|
||||
|
||||
.. automodule:: cisticola.scraper.youtube
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
8
docs/source/cisticola.transformer.bitchute.rst
Normal file
8
docs/source/cisticola.transformer.bitchute.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.transformer.bitchute module
|
||||
=====================================
|
||||
|
||||
.. automodule:: cisticola.transformer.bitchute
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
@@ -14,4 +14,5 @@ Submodules
|
||||
:maxdepth: 4
|
||||
|
||||
cisticola.transformer.base
|
||||
cisticola.transformer.bitchute
|
||||
cisticola.transformer.twitter
|
||||
|
||||
8
docs/source/cisticola.utils.rst
Normal file
8
docs/source/cisticola.utils.rst
Normal file
@@ -0,0 +1,8 @@
|
||||
cisticola.utils module
|
||||
======================
|
||||
|
||||
.. automodule:: cisticola.utils
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:private-members:
|
||||
@@ -2,16 +2,7 @@ Welcome to Cisticola's documentation!
|
||||
=====================================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: Contents:
|
||||
:maxdepth: 1
|
||||
|
||||
cisticola
|
||||
|
||||
|
||||
|
||||
Indices and tables
|
||||
==================
|
||||
|
||||
* :ref:`genindex`
|
||||
* :ref:`modindex`
|
||||
* :ref:`search`
|
||||
quickstart
|
||||
cisticola
|
||||
96
docs/source/quickstart.rst
Normal file
96
docs/source/quickstart.rst
Normal file
@@ -0,0 +1,96 @@
|
||||
Quickstart
|
||||
==========
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
The *cisticola* application uses pipenv_ for dependency management. To install the dependencies of *cisticola*, first install pipenv using the following command:
|
||||
|
||||
.. code-block::
|
||||
|
||||
pip install pipenv
|
||||
|
||||
and then install the dependencies using the following command from the package root directory:
|
||||
|
||||
.. code-block::
|
||||
|
||||
pipenv install
|
||||
|
||||
To install the necessary dependencies for building the documentation and running unit tests, run the following command from the package root directory:
|
||||
|
||||
.. code-block::
|
||||
|
||||
pipenv install --dev
|
||||
|
||||
Environment Variables
|
||||
---------------------
|
||||
|
||||
Three of the scrapers in *cisticola* (:py:class:`~cisticola.scraper.gab.GabScraper`, :py:class:`~cisticola.scraper.instagram.InstagramScraper`, and :py:class:`~cisticola.scraper.telegram_telethon.TelegramTelethonScraper`) require platform credentials to work correctly.
|
||||
|
||||
Gab
|
||||
"""
|
||||
|
||||
The Gab credentials can be configured by running the following command from the root directory:
|
||||
|
||||
.. code-block::
|
||||
|
||||
pipenv run garc configure
|
||||
|
||||
which will direct you to provide the username and password for your Gab account.
|
||||
|
||||
Instagram
|
||||
"""""""""
|
||||
|
||||
The Instagram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment:
|
||||
|
||||
- ``INSTAGRAM_USERNAME``: username of your Instagram account
|
||||
- ``INSTAGRAM_PASSWORD``: password of your Instagram account
|
||||
|
||||
Telegram Telethon
|
||||
"""""""""""""""""
|
||||
|
||||
The Telegram credentials can be configured by setting the following environment variables, either in the project's ``.env`` file or in the system's environment:
|
||||
|
||||
- ``TELEGRAM_API_ID``: API ID number for your Telegram application
|
||||
- ``TELEGRAM_API_HASH``: API hash for your Telegram application
|
||||
- ``TELEGRAM_PHONE``: phone number for the account corresponding to your Telegram application
|
||||
|
||||
If you do not already have a Telegram application, you can create one by following the instructions on `this page`_.
|
||||
|
||||
Documentation
|
||||
-------------
|
||||
|
||||
The *cisticola* application uses Sphinx_ to generate and display its documentation. To build the documentation in the HTML format, run the following command from the ``docs/`` directory:
|
||||
|
||||
.. code-block::
|
||||
|
||||
pipenv run make html
|
||||
|
||||
For developers, if changes are made to the package structure or additional modules are created, you can update the Sphinx source ``*.rst`` files by running the following command from the ``docs/`` directory:
|
||||
|
||||
.. code-block::
|
||||
|
||||
pipenv run make apidoc
|
||||
|
||||
Testing
|
||||
-------
|
||||
|
||||
The *cisticola* application uses pytest_ for unit testing. To run the test suite, run the following command from the package root directory:
|
||||
|
||||
.. code-block::
|
||||
|
||||
pipenv run pytest
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
An example of a *cisticola* ingest file ``russian_telegram_ingest.py`` is included in the package root directory, showing how the list of channels to scrape is defined, and how the :py:class:`~cisticola.scraper.base.ScraperController` and :py:class:`~cisticola.transformer.base.Transformer` classes are used. To run the ingest script, run the following command from the package root directory:
|
||||
|
||||
.. code-block::
|
||||
|
||||
pipenv run python russian_telegram_ingest.py
|
||||
|
||||
.. _pipenv: https://pipenv.pypa.io/en/latest/
|
||||
.. _Sphinx: https://www.sphinx-doc.org/en/master/
|
||||
.. _pytest: https://docs.pytest.org/en/7.1.x/
|
||||
.. _this page: https://core.telegram.org/api/obtaining_api_id
|
||||
Reference in New Issue
Block a user