From 1f99e524365f619c90410515b47eaeed2190721f Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Wed, 30 Mar 2022 08:05:10 -0500 Subject: [PATCH] refactored Gab scraper to use gabber instead of garc --- Pipfile | 2 +- Pipfile.lock | 76 ++++++++---------------------- cisticola/scraper/gab.py | 51 +++++++++++++++----- docs/source/quickstart.rst | 8 +++- pytest.ini | 2 + tests/scraper/bitchute.py | 1 + tests/scraper/gab.py | 1 + tests/scraper/gettr.py | 1 + tests/scraper/instagram.py | 1 + tests/scraper/odysee.py | 1 + tests/scraper/rumble.py | 1 + tests/scraper/telegram_snscrape.py | 1 + tests/scraper/telegram_telethon.py | 1 + tests/scraper/twitter.py | 1 + tests/scraper/vkontakte.py | 1 + tests/scraper/youtube.py | 1 + tests/transformer/twitter.py | 3 ++ 17 files changed, 82 insertions(+), 71 deletions(-) diff --git a/Pipfile b/Pipfile index 3be4b27..d57b5a9 100644 --- a/Pipfile +++ b/Pipfile @@ -14,7 +14,6 @@ boto3 = "*" snscrape = {git = "https://github.com/bellingcat/snscrape.git"} ffmpeg-python = "*" polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"} -garc = "*" yt-dlp = "*" telethon = "*" pytesseract = "*" @@ -22,6 +21,7 @@ pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"} instaloader = "*" gspread = "*" cryptg = "*" +gabber = {git = "https://github.com/stanfordio/gabber.git"} [dev-packages] pytest = "*" diff --git a/Pipfile.lock b/Pipfile.lock index d78928c..bb0e2a2 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "3fb247a6b9b76ed811db7636b02ad848365d38dadb0da6a27c090e559e5540ec" + "sha256": "b712e767d64e54e83e8c2d8a27a68203583ed7ad31d4ea3b4b6076a72a2150fd" }, "pipfile-spec": 6, "requires": { @@ -16,14 +16,6 @@ ] }, "default": { - "attrs": { - "hashes": [ - "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4", - "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==21.4.0" - }, "beautifulsoup4": { "hashes": [ "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf", @@ -280,12 +272,9 @@ "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==0.18.2" }, - "garc": { - "hashes": [ - "sha256:6f1da8ccdb30b165b8d9247314b73d1002f60381480e61fdbf108dc9abf3c216" - ], - "index": "pypi", - "version": "==2.1" + "gabber": { + "git": "https://github.com/stanfordio/gabber.git", + "ref": "d80c44c488ad4e087ba4c8f033802fe2071843bd" }, "gogettr": { "hashes": [ @@ -387,13 +376,6 @@ "markers": "python_version >= '3'", "version": "==3.3" }, - "iniconfig": { - "hashes": [ - "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", - "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32" - ], - "version": "==1.1.1" - }, "instaloader": { "hashes": [ "sha256:7fa6147810eedcc1dedcdec8cfa1f220c9379ab8faeab6a336a7c181d944e2e4" @@ -411,11 +393,11 @@ }, "loguru": { "hashes": [ - "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c", - "sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3" + "sha256:b28e72ac7a98be3d28ad28570299a393dfcd32e5e3f6a353dec94675767b6319", + "sha256:f8087ac396b5ee5f67c963b495d615ebbceac2796379599820e324419d53667c" ], "index": "pypi", - "version": "==0.6.0" + "version": "==0.5.3" }, "lxml": { "hashes": [ @@ -602,26 +584,10 @@ "markers": "python_version >= '3.7'", "version": "==9.0.1" }, - "pluggy": { - "hashes": [ - "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159", - "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3" - ], - "markers": "python_version >= '3.6'", - "version": "==1.0.0" - }, "polyphemus": { "git": "https://github.com/bellingcat/polyphemus.git", "ref": "00a5123a3768a55ffe29f2c803a4181895f17890" }, - "py": { - "hashes": [ - "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719", - "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==1.11.0" - }, "pyaes": { "hashes": [ "sha256:02c1b1405c38d3c370b085fb952dd8bea3fadcee6411ad99f312cc129c536d8f" @@ -732,14 +698,6 @@ "index": "pypi", "version": "==0.3.9" }, - "pytest": { - "hashes": [ - "sha256:841132caef6b1ad17a9afde46dc4f6cfa59a05f9555aae5151f73bdf2820ca63", - "sha256:92f723789a8fdd7180b6b06483874feca4c48a5c76968e03bb3e7f806a1869ea" - ], - "markers": "python_version >= '3.7'", - "version": "==7.1.1" - }, "python-dateutil": { "hashes": [ "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", @@ -763,6 +721,12 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", "version": "==0.1.0.post0" }, + "ratelimit": { + "hashes": [ + "sha256:af8a9b64b821529aca09ebaf6d8d279100d766f19e90b5059ac6a718ca6dee42" + ], + "version": "==2.2.1" + }, "regex": { "hashes": [ "sha256:0008650041531d0eadecc96a73d37c2dc4821cf51b0766e374cb4f1ddc4e1c14", @@ -944,13 +908,13 @@ "index": "pypi", "version": "==1.24.0" }, - "tomli": { + "tqdm": { "hashes": [ - "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", - "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" + "sha256:4230a49119a416c88cc47d0d2d32d5d90f1a282d5e497d49801950704e49863d", + "sha256:6461b009d6792008d0000e1b0c7ca50195ec78c0e808a3a6b668a56a3236c3a5" ], - "markers": "python_version >= '3.7'", - "version": "==2.0.1" + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==4.63.1" }, "tzdata": { "hashes": [ @@ -1325,7 +1289,7 @@ "sha256:841132caef6b1ad17a9afde46dc4f6cfa59a05f9555aae5151f73bdf2820ca63", "sha256:92f723789a8fdd7180b6b06483874feca4c48a5c76968e03bb3e7f806a1869ea" ], - "markers": "python_version >= '3.7'", + "index": "pypi", "version": "==7.1.1" }, "pytest-cov": { @@ -1443,7 +1407,7 @@ "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" ], - "markers": "python_version >= '3.7'", + "markers": "python_full_version < '3.11.0'", "version": "==2.0.1" }, "typing-extensions": { diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index 126f500..2307ca5 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -1,15 +1,16 @@ -from datetime import datetime, timezone +from datetime import datetime, timezone, date import json from typing import Generator +import os -from garc import Garc +from gabber.client import Client, GAB_API_BASE_URL from cisticola.base import Channel, ScraperResult from cisticola.scraper.base import Scraper class GabScraper(Scraper): - """An implementation of a Scraper for Gab, using GARC library""" - __version__ = "GabScraper 0.0.1" + """An implementation of a Scraper for Gab, using gabber library""" + __version__ = "GabScraper 0.0.2" def get_username_from_url(self, url): username = url.split('https://gab.com/')[-1] @@ -17,13 +18,23 @@ class GabScraper(Scraper): return username def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: - client = Garc(profile = 'main') + client = Client( + username = os.environ['GAB_USER'], + password = os.environ['GAB_PASS'], + threads = 25) + username = self.get_username_from_url(channel.url) - scraper = client.userposts(username) + result = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json() + user_id = int(result['id']) + + scraper = client.pull_statuses( + id = user_id, + created_after = date.min, + replies = False) for post in scraper: - if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")) <= since.date: + if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): break media_urls = [] @@ -31,10 +42,18 @@ class GabScraper(Scraper): if archive_media: - media_urls.extend([p['url'] for p in post['media_attachments']]) - - if post.get('repost') is not None: - media_urls.extend([p['url'] for p in post['repost']['media_attachments']]) + for attachment in post.get('media_attachments'): + if attachment.get('type') == 'video': + media_urls.append(attachment['source_mp4']) + else: + media_urls.append(attachment['url']) + + if post.get('reblog') is not None: + for attachment in post['reblog'].get('media_attachments'): + if attachment.get('type') == 'video': + media_urls.append(attachment['source_mp4']) + else: + media_urls.append(attachment['url']) for url in media_urls: media_blob, content_type, key = self.url_to_blob(url) @@ -57,8 +76,14 @@ class GabScraper(Scraper): return True def get_profile(self, channel: Channel) -> dict: - client = Garc(profile = 'main') + + client = Client( + username = os.environ['GAB_USER'], + password = os.environ['GAB_PASS'], + threads = 25) + username = self.get_username_from_url(channel.url) - profile = list(client.user(username))[0] + + profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json() return profile \ No newline at end of file diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index a6c5643..4dd87ce 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -75,12 +75,18 @@ For developers, if changes are made to the package structure or additional modul Testing ------- -The *cisticola* application uses pytest_ for unit testing. To run the test suite, run the following command from the package root directory: +The *cisticola* application uses pytest_ for unit testing. To run the full test suite, run the following command from the package root directory: .. code-block:: pipenv run pytest +To run the test suite without archiving media (which can take a long time), run the following command from the package root directory: + +.. code-block:: + + pipenv run pytest -m "not media" + Examples -------- diff --git a/pytest.ini b/pytest.ini index 8d9973f..744f87d 100644 --- a/pytest.ini +++ b/pytest.ini @@ -14,6 +14,8 @@ addopts = markers = profile: marks tests for only extracting channel metadata (deselect with '-m "not profile"') + media: marks tests for archiving all media attachments (deselect with '-m + "not media"') filterwarnings = ignore:the imp module is deprecated:DeprecationWarning ignore:The localize method is no longer necessary, as this time zone supports the fold attribute diff --git a/tests/scraper/bitchute.py b/tests/scraper/bitchute.py index 687a6b0..94707ec 100644 --- a/tests/scraper/bitchute.py +++ b/tests/scraper/bitchute.py @@ -9,6 +9,7 @@ def test_scrape_bitchute_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = BitchuteScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_bitchute_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/gab.py b/tests/scraper/gab.py index 943f40f..ed9d32a 100644 --- a/tests/scraper/gab.py +++ b/tests/scraper/gab.py @@ -9,6 +9,7 @@ def test_scrape_gab_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = GabScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_gab_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/gettr.py b/tests/scraper/gettr.py index 6a3b70e..81a8bb8 100644 --- a/tests/scraper/gettr.py +++ b/tests/scraper/gettr.py @@ -9,6 +9,7 @@ def test_scrape_gettr_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = GettrScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_gettr_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/instagram.py b/tests/scraper/instagram.py index 840d6fa..98a0684 100644 --- a/tests/scraper/instagram.py +++ b/tests/scraper/instagram.py @@ -9,6 +9,7 @@ def test_scrape_instagram_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = InstagramScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_instagram_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/odysee.py b/tests/scraper/odysee.py index 8eba07d..84a45f8 100644 --- a/tests/scraper/odysee.py +++ b/tests/scraper/odysee.py @@ -9,6 +9,7 @@ def test_scrape_odysee_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = OdyseeScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_odysee_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/rumble.py b/tests/scraper/rumble.py index f64b24f..18c8749 100644 --- a/tests/scraper/rumble.py +++ b/tests/scraper/rumble.py @@ -9,6 +9,7 @@ def test_scrape_rumble_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = RumbleScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_rumble_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/telegram_snscrape.py b/tests/scraper/telegram_snscrape.py index 420b917..dbaed43 100644 --- a/tests/scraper/telegram_snscrape.py +++ b/tests/scraper/telegram_snscrape.py @@ -9,6 +9,7 @@ def test_scrape_telegram_snscrape_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = TelegramSnscrapeScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_telegram_snscrape_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/telegram_telethon.py b/tests/scraper/telegram_telethon.py index 1942fca..c6fb399 100644 --- a/tests/scraper/telegram_telethon.py +++ b/tests/scraper/telegram_telethon.py @@ -9,6 +9,7 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = TelegramTelethonScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_telegram_telethon_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/twitter.py b/tests/scraper/twitter.py index 7512b6a..97765aa 100644 --- a/tests/scraper/twitter.py +++ b/tests/scraper/twitter.py @@ -9,6 +9,7 @@ def test_scrape_twitter_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = TwitterScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_twitter_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/vkontakte.py b/tests/scraper/vkontakte.py index 8b0b757..4209c30 100644 --- a/tests/scraper/vkontakte.py +++ b/tests/scraper/vkontakte.py @@ -9,6 +9,7 @@ def test_scrape_vkontakte_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = VkontakteScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_vkontakte_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/scraper/youtube.py b/tests/scraper/youtube.py index e987cb8..1750b08 100644 --- a/tests/scraper/youtube.py +++ b/tests/scraper/youtube.py @@ -9,6 +9,7 @@ def test_scrape_youtube_channel_no_media(controller, channel_kwargs): controller.register_scraper(scraper = YoutubeScraper()) controller.scrape_channels(channels = channels, archive_media = False) +@pytest.mark.media def test_scrape_youtube_channel(controller, channel_kwargs): controller.reset_db() diff --git a/tests/transformer/twitter.py b/tests/transformer/twitter.py index fd95bbe..3c50d1c 100644 --- a/tests/transformer/twitter.py +++ b/tests/transformer/twitter.py @@ -1,11 +1,14 @@ from sqlalchemy.orm import sessionmaker, with_polymorphic import json +import pytest + from cisticola.base import Channel from cisticola.scraper import TwitterScraper from cisticola.transformer import TwitterTransformer from cisticola.base import Post, Media +@pytest.mark.media def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs): controller.reset_db()