From bc840e631d34f6a3ebf013af5e221d8ac1c4fcb7 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Mon, 28 Feb 2022 12:11:21 -0600 Subject: [PATCH] added Gab scraper --- Pipfile | 1 + Pipfile.lock | 58 +++++++++++++++++++++++++++++++++-- cisticola/scraper/base.py | 8 +++-- cisticola/scraper/bitchute.py | 2 +- cisticola/scraper/gab.py | 53 ++++++++++++++++++++++++++++++++ test.py | 9 ++++++ 6 files changed, 126 insertions(+), 5 deletions(-) create mode 100644 cisticola/scraper/gab.py diff --git a/Pipfile b/Pipfile index 3f094c5..d2c3af8 100644 --- a/Pipfile +++ b/Pipfile @@ -15,6 +15,7 @@ boto3 = "*" snscrape = {git = "https://github.com/bellingcat/snscrape.git"} ffmpeg-python = "*" polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"} +garc = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 4690b31..e9c95cf 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "263a7825d8113518c7a0690d5f69526cabe2dfa6ea572bb39cbe5d26495e619c" + "sha256": "08623c70f7bb2da863def501ebdc6b0b2afab9865ef9e457b3137b8020314507" }, "pipfile-spec": 6, "requires": { @@ -23,6 +23,14 @@ ], "version": "==0.7.12" }, + "attrs": { + "hashes": [ + "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4", + "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==21.4.0" + }, "babel": { "hashes": [ "sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9", @@ -124,6 +132,13 @@ "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==0.18.2" }, + "garc": { + "hashes": [ + "sha256:6f1da8ccdb30b165b8d9247314b73d1002f60381480e61fdbf108dc9abf3c216" + ], + "index": "pypi", + "version": "==2.1" + }, "gogettr": { "hashes": [ "sha256:9f5c90e3b1befe6eb561d4bca9ca124faddbe5787d8b429f02703c68dd51d255", @@ -217,6 +232,13 @@ "markers": 
"python_version < '3.10'", "version": "==4.11.2" }, + "iniconfig": { + "hashes": [ + "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", + "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32" + ], + "version": "==1.1.1" + }, "jinja2": { "hashes": [ "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8", @@ -414,10 +436,26 @@ "markers": "python_version >= '3.8'", "version": "==1.4.1" }, + "pluggy": { + "hashes": [ + "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159", + "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3" + ], + "markers": "python_version >= '3.6'", + "version": "==1.0.0" + }, "polyphemus": { "git": "https://github.com/bellingcat/polyphemus.git", "ref": "18b89f19ecdd32e7dc8b5564b258a67165e680ca" }, + "py": { + "hashes": [ + "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719", + "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==1.11.0" + }, "pygments": { "hashes": [ "sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65", @@ -442,6 +480,14 @@ ], "version": "==1.7.1" }, + "pytest": { + "hashes": [ + "sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db", + "sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171" + ], + "markers": "python_version >= '3.6'", + "version": "==7.0.1" + }, "python-dateutil": { "hashes": [ "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", @@ -688,6 +734,14 @@ "index": "pypi", "version": "==1.4.31" }, + "tomli": { + "hashes": [ + "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", + "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" + ], + "markers": "python_version >= '3.7'", + "version": "==2.0.1" + }, "tzdata": { "hashes": [ 
"sha256:3eee491e22ebfe1e5cfcc97a4137cd70f092ce59144d81f8924a844de05ba8f5", @@ -709,7 +763,7 @@ "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", "version": "==1.26.8" }, "zipp": { diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 7a7865e..8b0bb90 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -19,6 +19,9 @@ class Scraper: 'DO_SPACES_KEY'), aws_secret_access_key=os.getenv('DO_SPACES_SECRET')) + self.headers = { + 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'} + pass def __str__(self): @@ -32,12 +35,13 @@ class Scraper: def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: n_retries = 0 - r = requests.get(url) + + r = requests.get(url, headers = self.headers) while r.status_code != 200 and n_retries < 5: logger.warning(f"{n_retries}/5: Request for {url} failed") n_retries += 1 - r = requests.get(url) + r = requests.get(url, headers = self.headers) if r.status_code != 200: logger.error(f"Could not fetch URL {url}") diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index 8063713..9e7f202 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -26,7 +26,7 @@ class BitchuteScraper(cisticola.scraper.base.Scraper): def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]: session = requests.Session() - session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0" + session.headers.update(self.headers) request = 
class GabScraper(cisticola.scraper.base.Scraper):
    """An implementation of a Scraper for Gab, using the GARC library."""

    __version__ = "GabScraper 0.0.1"

    # Hostnames that are accepted as belonging to Gab profile URLs.
    _GAB_HOSTS = ("gab.com", "www.gab.com")

    @staticmethod
    def get_username_from_url(url):
        """Extract the Gab username from a profile URL.

        Args:
            url: A profile URL such as ``https://gab.com/marc_capt``.

        Returns:
            The first path segment of the URL (the username), or ``None``
            when ``url`` is not a string, does not point at a Gab host, or
            has no path segment.
        """
        # Imported locally so this helper does not depend on a module-level
        # import that the file does not have.
        from urllib.parse import urlparse

        if not isinstance(url, str) or not url.strip():
            return None

        parsed = urlparse(url.strip())
        # ``hostname`` is None for scheme-less input and is already
        # lower-cased by urlparse.
        if parsed.hostname not in GabScraper._GAB_HOSTS:
            return None

        # Keep only the first non-empty path segment so trailing slashes,
        # sub-pages, query strings and fragments are not part of the username.
        segments = [segment for segment in parsed.path.split("/") if segment]
        return segments[0] if segments else None

    @staticmethod
    def _parse_created_at(value):
        """Convert a post's ``created_at`` ISO-8601 string to a naive datetime.

        A trailing ``Z`` is rewritten to ``+00:00`` so that
        ``datetime.fromisoformat`` accepts it. Any UTC offset is applied
        before the timezone information is dropped, so the result is
        expressed in UTC.
        """
        parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
        offset = parsed.utcoffset()
        if offset:
            # Shift to UTC wall-clock time; a zero offset needs no shift.
            parsed = parsed - offset
        return parsed.replace(tzinfo=None)

    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
        """Yield a ScraperResult for each post of the given Gab channel.

        Args:
            channel: Channel whose ``url`` identifies the Gab profile and
                whose ``id`` is stored on every result.
            since: Optional previous result; iteration stops at the first
                post whose date is less than or equal to ``since.date``.

        Yields:
            One ``cisticola.base.ScraperResult`` per post, with the post's
            media (and the media of its repost, if any) archived.

        Raises:
            ValueError: If no username can be extracted from ``channel.url``.
        """
        username = GabScraper.get_username_from_url(channel.url)
        if username is None:
            raise ValueError(
                f"Cannot extract a Gab username from URL {channel.url!r}")

        # NOTE(review): presumably 'main' names a profile in the garc
        # configuration file holding the credentials -- confirm.
        client = Garc(profile='main')

        for post in client.userposts(username):
            post_date = GabScraper._parse_created_at(post['created_at'])

            # NOTE(review): stopping here assumes posts arrive newest-first;
            # verify against garc's userposts ordering.
            if since is not None and post_date <= since.date:
                break

            # Collect media from the post itself and from the reposted post.
            media_urls = [
                media['url'] for media in post.get('media_attachments') or []]

            repost = post.get('repost')
            if repost is not None:
                media_urls.extend(
                    media['url']
                    for media in repost.get('media_attachments') or [])

            archived_urls = {}
            for url in media_urls:
                if url in archived_urls:
                    # Same attachment referenced twice; archive it only once.
                    continue
                media_blob, content_type, key = self.url_to_blob(url)
                archived_urls[url] = self.archive_media(
                    media_blob, content_type, key)

            yield cisticola.base.ScraperResult(
                scraper=self.__version__,
                platform="Gab",
                channel=channel.id,
                platform_id=post['id'],
                date=post_date,
                date_archived=datetime.now(),
                raw_data=json.dumps(post),
                archived_urls=archived_urls)

    def can_handle(self, channel):
        """Return True if ``channel`` is a Gab channel with a usable profile URL."""
        return (
            channel.platform == "Gab"
            and GabScraper.get_username_from_url(channel.url) is not None)